talk-llama : sync latest llama.cpp (close #922, close #954)

author Georgi Gerganov <redacted>

Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)

committer Georgi Gerganov <redacted>

Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)
author Georgi Gerganov <redacted>
Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)
committer Georgi Gerganov <redacted>
Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)
diff --git a/examples/talk-llama/llama-util.h b/examples/talk-llama/llama-util.h

index ca4dd162f59feb945a52f33c8bc63eeffa0767bf..3cac9f681800bcd83962a603ca04bf79f69f4b9f 100644 (file)
--- a/examples/talk-llama/llama-util.h
+++ b/examples/talk-llama/llama-util.h
@@ -14,6 +14,7 @@
  
  #include <string>
  #include <vector>
+#include <stdexcept>
  
  #ifdef __has_include
      #if __has_include(<unistd.h>)
@@ -74,7 +75,7 @@ struct llama_file {
      llama_file(const char * fname, const char * mode) {
          fp = std::fopen(fname, mode);
          if (fp == NULL) {
-            throw format("failed to open %s: %s", fname, std::strerror(errno));
+            throw std::runtime_error(format("failed to open %s: %s", fname, strerror(errno)));
          }
          seek(0, SEEK_END);
          size = tell();
@@ -100,17 +101,17 @@ struct llama_file {
          LLAMA_ASSERT(ret == 0); // same
      }
  
-    void read_raw(void * ptr, size_t size) {
-        if (size == 0) {
+    void read_raw(void * ptr, size_t len) const {
+        if (len == 0) {
              return;
          }
          errno = 0;
-        std::size_t ret = std::fread(ptr, size, 1, fp);
+        std::size_t ret = std::fread(ptr, len, 1, fp);
          if (ferror(fp)) {
-            throw format("read error: %s", strerror(errno));
+            throw std::runtime_error(format("read error: %s", strerror(errno)));
          }
          if (ret != 1) {
-            throw std::string("unexpectedly reached end of file");
+            throw std::runtime_error(std::string("unexpectedly reached end of file"));
          }
      }
  
@@ -126,14 +127,14 @@ struct llama_file {
          return std::string(chars.data(), len);
      }
  
-    void write_raw(const void * ptr, size_t size) {
-        if (size == 0) {
+    void write_raw(const void * ptr, size_t len) const {
+        if (len == 0) {
              return;
          }
          errno = 0;
-        size_t ret = std::fwrite(ptr, size, 1, fp);
+        size_t ret = std::fwrite(ptr, len, 1, fp);
          if (ret != 1) {
-            throw format("write error: %s", strerror(errno));
+            throw std::runtime_error(format("write error: %s", strerror(errno)));
          }
      }
  
@@ -171,7 +172,7 @@ struct llama_mmap {
  #ifdef _POSIX_MAPPED_FILES
      static constexpr bool SUPPORTED = true;
  
-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
          size = file->size;
          int fd = fileno(file->fp);
          int flags = MAP_SHARED;
@@ -180,12 +181,12 @@ struct llama_mmap {
  #endif
          addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
          if (addr == MAP_FAILED) {
-            throw format("mmap failed: %s", strerror(errno));
+            throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
          }
  
-        if (prefetch) {
+        if (prefetch > 0) {
              // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                  fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                          strerror(errno));
              }
@@ -207,7 +208,7 @@ struct llama_mmap {
          DWORD error = GetLastError();
  
          if (hMapping == NULL) {
-            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
          }
  
          addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
@@ -215,7 +216,7 @@ struct llama_mmap {
          CloseHandle(hMapping);
  
          if (addr == NULL) {
-            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
+            throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
          }
  
          #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
@@ -243,8 +244,9 @@ struct llama_mmap {
  #else
      static constexpr bool SUPPORTED = false;
  
-    llama_mmap(struct llama_file *) {
-        throw std::string("mmap not supported");
+    llama_mmap(struct llama_file *, bool prefetch = true) {
+        (void)prefetch;
+        throw std::runtime_error(std::string("mmap not supported"));
      }
  #endif
  };
@@ -265,9 +267,9 @@ struct llama_mlock {
          }
      }
  
-    void init(void * addr) {
-        LLAMA_ASSERT(this->addr == NULL && this->size == 0);
-        this->addr = addr;
+    void init(void * ptr) {
+        LLAMA_ASSERT(addr == NULL && size == 0);
+        addr = ptr;
      }
  
      void grow_to(size_t target_size) {
@@ -338,14 +340,14 @@ struct llama_mlock {
          return (size_t) si.dwPageSize;
      }
  
-    bool raw_lock(void * addr, size_t size) {
+    bool raw_lock(void * ptr, size_t len) {
          for (int tries = 1; ; tries++) {
-            if (VirtualLock(addr, size)) {
+            if (VirtualLock(ptr, len)) {
                  return true;
              }
              if (tries == 2) {
                  fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
-                        size, this->size, llama_format_win_err(GetLastError()).c_str());
+                    len, size, llama_format_win_err(GetLastError()).c_str());
                  return false;
              }
  
@@ -361,7 +363,7 @@ struct llama_mlock {
              // is equal to the number of pages in its minimum working set minus
              // a small overhead."
              // Hopefully a megabyte is enough overhead:
-            size_t increment = size + 1048576;
+            size_t increment = len + 1048576;
              // The minimum must be <= the maximum, so we need to increase both:
              min_ws_size += increment;
              max_ws_size += increment;
@@ -373,8 +375,8 @@ struct llama_mlock {
          }
      }
  
-    void raw_unlock(void * addr, size_t size) {
-        if (!VirtualUnlock(addr, size)) {
+    void raw_unlock(void * ptr, size_t len) {
+        if (!VirtualUnlock(ptr, len)) {
              fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
                      llama_format_win_err(GetLastError()).c_str());
          }
@@ -382,11 +384,16 @@ struct llama_mlock {
  #else
      static constexpr bool SUPPORTED = false;
  
-    void raw_lock(const void * addr, size_t size) {
+    size_t lock_granularity() {
+        return (size_t) 65536;
+    }
+
+    bool raw_lock(const void * addr, size_t len) {
          fprintf(stderr, "warning: mlock not supported on this system\n");
+        return false;
      }
  
-    void raw_unlock(const void * addr, size_t size) {}
+    void raw_unlock(const void * addr, size_t len) {}
  #endif
  };
  
@@ -395,36 +402,70 @@ struct llama_buffer {
      uint8_t * addr = NULL;
      size_t size = 0;
  
-    void resize(size_t size) {
+    llama_buffer() = default;
+
+    void resize(size_t len) {
          delete[] addr;
-        addr = new uint8_t[size];
-        this->size = size;
+        addr = new uint8_t[len];
+        size = len;
      }
  
      ~llama_buffer() {
          delete[] addr;
      }
+
+    // disable copy and move
+    llama_buffer(const llama_buffer&) = delete;
+    llama_buffer(llama_buffer&&) = delete;
+    llama_buffer& operator=(const llama_buffer&) = delete;
+    llama_buffer& operator=(llama_buffer&&) = delete;
  };
  
  #ifdef GGML_USE_CUBLAS
  #include "ggml-cuda.h"
  struct llama_ctx_buffer {
      uint8_t * addr = NULL;
+    bool is_cuda;
      size_t size = 0;
  
+    llama_ctx_buffer() = default;
+
      void resize(size_t size) {
+        free();
+
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
          if (addr) {
-            ggml_cuda_host_free(addr);
+            is_cuda = true;
+        }
+        else {
+            // fall back to pageable memory
+            addr = new uint8_t[size];
+            is_cuda = false;
          }
-        addr = (uint8_t *) ggml_cuda_host_malloc(size);
          this->size = size;
      }
  
-    ~llama_ctx_buffer() {
+    void free() {
          if (addr) {
-            ggml_cuda_host_free(addr);
+            if (is_cuda) {
+                ggml_cuda_host_free(addr);
+            }
+            else {
+                delete[] addr;
+            }
          }
+        addr = NULL;
      }
+
+    ~llama_ctx_buffer() {
+        free();
+    }
+
+    // disable copy and move
+    llama_ctx_buffer(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer(llama_ctx_buffer&&) = delete;
+    llama_ctx_buffer& operator=(const llama_ctx_buffer&) = delete;
+    llama_ctx_buffer& operator=(llama_ctx_buffer&&) = delete;
  };
  #else
  typedef llama_buffer llama_ctx_buffer;
diff --git a/examples/talk-llama/llama.cpp b/examples/talk-llama/llama.cpp

index 98f49abd7cf4839df53bad2fa63bba98c8241a21..4cbc8d6b63752359f7de4b2c20600bfe9f4c703f 100644 (file)
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
@@ -1,6 +1,7 @@
  // Defines fileno on msys:
  #ifndef _GNU_SOURCE
  #define _GNU_SOURCE
+#include <cstddef>
  #include <cstdint>
  #include <cstdio>
  #endif
@@ -45,6 +46,7 @@ enum e_model {
      MODEL_65B,
  };
  
+
  static const size_t MB = 1024*1024;
  
  // computed for n_ctx == 2048
@@ -110,7 +112,7 @@ struct llama_hparams {
      enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
  
      bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+        return static_cast<bool>(memcmp(this, &other, sizeof(llama_hparams)));
      }
  };
  
@@ -406,6 +408,7 @@ enum llama_file_version {
      LLAMA_FILE_VERSION_GGMF_V1, // added version field and scores in vocab
      LLAMA_FILE_VERSION_GGJT_V1, // added padding
      LLAMA_FILE_VERSION_GGJT_V2, // changed quantization format
+    LLAMA_FILE_VERSION_GGJT_V3, // changed Q4 and Q8 quantization format
  };
  
  struct llama_file_loader {
@@ -424,24 +427,30 @@ struct llama_file_loader {
      }
      void read_magic() {
          uint32_t magic = file.read_u32();
-        uint32_t version = 0;
  
-        if (magic != 'ggml') {
-            version = file.read_u32();
+        if (magic == LLAMA_FILE_MAGIC_GGML) {
+            file_version = LLAMA_FILE_VERSION_GGML;
+            return;
          }
  
-        if (magic == 'ggml' && version == 0) {
-            file_version = LLAMA_FILE_VERSION_GGML;
-        } else if (magic == 'ggmf' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGMF_V1;
-        } else if (magic == 'ggjt' && version == 1) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V1;
-        } else if (magic == 'ggjt' && version == 2) {
-            file_version = LLAMA_FILE_VERSION_GGJT_V2;
-        } else {
-            throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
-                         magic, version);
+        uint32_t version = file.read_u32();
+
+        switch (magic) {
+            case LLAMA_FILE_MAGIC_GGMF:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGMF_V1; return;
+                }
+                break;
+            case LLAMA_FILE_MAGIC_GGJT:
+                switch (version) {
+                    case 1: file_version = LLAMA_FILE_VERSION_GGJT_V1; return;
+                    case 2: file_version = LLAMA_FILE_VERSION_GGJT_V2; return;
+                    case 3: file_version = LLAMA_FILE_VERSION_GGJT_V3; return;
+                }
          }
+
+        throw format("unknown (magic, version) combination: %08x, %08x; is this really a GGML file?",
+                     magic, version);
      }
      void read_hparams() {
          hparams.n_vocab = file.read_u32();
@@ -499,7 +508,7 @@ struct llama_file_loader {
  
              if (file_version >= LLAMA_FILE_VERSION_GGJT_V1) {
                  // skip to the next multiple of 32 bytes
-                file.seek(-file.tell() & 31, SEEK_CUR);
+                file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
              }
              shard.file_idx = file_idx;
              shard.file_off = file.tell();
@@ -574,7 +583,7 @@ struct llama_file_saver {
          file.write_u32(new_type);
          file.write_raw(tensor.ne.data(), sizeof(tensor.ne[0]) * tensor.ne.size());
          file.write_raw(tensor.name.data(), tensor.name.size());
-        file.seek(-file.tell() & 31, SEEK_CUR);
+        file.seek(-static_cast<ptrdiff_t>(file.tell()) & 31, SEEK_CUR);
          LLAMA_ASSERT(new_size == llama_calc_tensor_size(tensor.ne, new_type));
          file.write_raw(new_data, new_size);
      }
@@ -641,7 +650,7 @@ struct llama_model_loader {
          }
      }
  
-    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne, ggml_backend backend) {
          auto it = tensors_map.name_to_idx.find(name);
          if (it == tensors_map.name_to_idx.end()) {
              throw format("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -652,10 +661,10 @@ struct llama_model_loader {
                           name.c_str(), llama_format_tensor_shape(ne).c_str(), llama_format_tensor_shape(lt.ne).c_str());
          }
  
-        return get_tensor_for(lt);
+        return get_tensor_for(lt, backend);
      }
  
-    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt) {
+    struct ggml_tensor * get_tensor_for(llama_load_tensor & lt, ggml_backend backend) {
          struct ggml_tensor * tensor;
          if (lt.ne.size() == 2) {
              tensor = ggml_new_tensor_2d(ggml_ctx, lt.type, lt.ne.at(0), lt.ne.at(1));
@@ -665,6 +674,7 @@ struct llama_model_loader {
          }
          ggml_set_name(tensor, lt.name.c_str());
          LLAMA_ASSERT(lt.ggml_tensor == NULL); // if this fails, we called get_tensor twice on the same tensor
+        tensor->backend = backend;
          lt.ggml_tensor = tensor;
          num_ggml_tensors_created++;
          return tensor;
@@ -678,12 +688,16 @@ struct llama_model_loader {
  
      void load_all_data(llama_progress_callback progress_callback, void *  progress_callback_user_data, llama_mlock * lmlock) {
          size_t data_size = 0;
+        size_t prefetch_size = 0;
          for (const llama_load_tensor & lt : tensors_map.tensors) {
              data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
          }
  
          if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
              if (!lmlock) {
                  // Don't call the callback since the actual loading will be lazy
                  // and we can't measure it.
@@ -696,6 +710,9 @@ struct llama_model_loader {
  
          size_t done_size = 0;
          for (llama_load_tensor & lt : tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
+                continue;
+            }
              if (progress_callback) {
                  progress_callback((float) done_size / data_size, progress_callback_user_data);
              }
@@ -708,9 +725,6 @@ struct llama_model_loader {
                  lmlock->grow_to(done_size);
              }
          }
-        if (progress_callback) {
-            progress_callback(1.0f, progress_callback_user_data);
-        }
      }
  
      void load_data_for(llama_load_tensor & lt) {
@@ -812,10 +826,9 @@ static bool kv_cache_init(
  struct llama_context_params llama_context_default_params() {
      struct llama_context_params result = {
          /*.n_ctx                       =*/ 512,
-        /*.n_parts                     =*/ -1,
          /*.gpu_layers                  =*/ 0,
          /*.seed                        =*/ -1,
-        /*.f16_kv                      =*/ false,
+        /*.f16_kv                      =*/ true,
          /*.logits_all                  =*/ false,
          /*.vocab_only                  =*/ false,
          /*.use_mmap                    =*/ true,
@@ -836,6 +849,21 @@ bool llama_mlock_supported() {
      return llama_mlock::SUPPORTED;
  }
  
+void llama_init_backend() {
+    ggml_time_init();
+
+    // needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+}
+
+int64_t llama_time_us() {
+    return ggml_time_us();
+}
+
  //
  // model loading
  //
@@ -845,7 +873,8 @@ static const char *llama_file_version_name(llama_file_version version) {
          case LLAMA_FILE_VERSION_GGML: return "'ggml' (old version with low tokenizer quality and no mmap support)";
          case LLAMA_FILE_VERSION_GGMF_V1: return "ggmf v1 (old version with no mmap support)";
          case LLAMA_FILE_VERSION_GGJT_V1: return "ggjt v1 (pre #1405)";
-        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (latest)";
+        case LLAMA_FILE_VERSION_GGJT_V2: return "ggjt v2 (pre #1508)";
+        case LLAMA_FILE_VERSION_GGJT_V3: return "ggjt v3 (latest)";
      }
  
      return "unknown";
@@ -925,11 +954,19 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: model size = %s\n",  __func__, llama_model_type_name(model.type));
      }
  
-    if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V2) {
          if (hparams.ftype != LLAMA_FTYPE_ALL_F32     &&
              hparams.ftype != LLAMA_FTYPE_MOSTLY_F16  &&
              hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
-            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1405)");
+        }
+    }
+
+    if (file_version < LLAMA_FILE_VERSION_GGJT_V3) {
+        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q4_1 ||
+            hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1508)");
          }
      }
  
@@ -942,27 +979,7 @@ static void llama_model_load_internal(
      size_t ctx_size;
      size_t mmapped_size;
      ml->calc_sizes(&ctx_size, &mmapped_size);
-    fprintf(stderr, "%s: ggml ctx size = %6.2f KB\n", __func__, ctx_size/1024.0);
-
-    // print memory requirements
-    {
-        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
-
-        // this is the total memory required to run the inference
-        const size_t mem_required =
-            ctx_size +
-            mmapped_size +
-            MEM_REQ_SCRATCH0().at(model.type) +
-            MEM_REQ_SCRATCH1().at(model.type) +
-            MEM_REQ_EVAL().at(model.type);
-
-        // this is the memory required by one llama_state
-        const size_t mem_required_state =
-            scale*MEM_REQ_KV_SELF().at(model.type);
-
-        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
-                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
-    }
+    fprintf(stderr, "%s: ggml ctx size = %7.2f MB\n", __func__, ctx_size/1024.0/1024.0);
  
      // create the ggml context
      {
@@ -984,7 +1001,14 @@ static void llama_model_load_internal(
          }
      }
  
+#ifdef GGML_USE_CUBLAS
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CUDA
+#else
+#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#endif
+
      // prepare memory for the weights
+    size_t vram_total = 0;
      {
          const uint32_t n_embd  = hparams.n_embd;
          const uint32_t n_layer = hparams.n_layer;
@@ -992,70 +1016,122 @@ static void llama_model_load_internal(
  
          ml->ggml_ctx = ctx;
  
-        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab});
-        model.norm           = ml->get_tensor("norm.weight",           {n_embd});
-        model.output         = ml->get_tensor("output.weight",         {n_embd, n_vocab});
+        model.tok_embeddings = ml->get_tensor("tok_embeddings.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+        model.norm           = ml->get_tensor("norm.weight",           {n_embd},          GGML_BACKEND_CPU);
+
+        // "output" tensor
+        {
+            ggml_backend backend_output;
+            if (n_gpu_layers > int(n_layer)) { // NOLINT
+                backend_output = LLAMA_BACKEND_OFFLOAD;
+            } else {
+                backend_output = GGML_BACKEND_CPU;
+            }
+
+            model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        }
+
+        const int i_gpu_start = n_layer - n_gpu_layers;
  
          model.layers.resize(n_layer);
          for (uint32_t i = 0; i < n_layer; ++i) {
+            const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
+
              auto & layer = model.layers[i];
  
              std::string layers_i = "layers." + std::to_string(i);
  
-            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd});
+            layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend);
+
+            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend);
+            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend);
+            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend);
+            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend);
  
-            layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd});
-            layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd});
-            layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd});
-            layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd});
+            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
  
-            layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd});
+            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff},   backend);
+            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd}, backend);
+            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff},   backend);
  
-            layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd,   n_ff});
-            layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", {  n_ff,   n_embd});
-            layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd,   n_ff});
+            if (backend == GGML_BACKEND_CUDA) {
+                vram_total +=
+                    ggml_nbytes(layer.attention_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk)             +
+                    ggml_nbytes(layer.wv)             + ggml_nbytes(layer.wo) + ggml_nbytes(layer.attention_norm) +
+                    ggml_nbytes(layer.w1)             + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
+            }
          }
      }
  
      ml->done_getting_tensors();
  
-    // populate `tensors_by_name`
-    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
-    }
+    // print memory requirements
+    {
+        const size_t scale = memory_type == GGML_TYPE_F32 ? 2 : 1;
  
-    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
+        // this is the total memory required to run the inference
+        const size_t mem_required =
+            ctx_size +
+            mmapped_size - vram_total + // weights in VRAM not in memory
+            MEM_REQ_SCRATCH0().at(model.type) +
+            MEM_REQ_SCRATCH1().at(model.type) +
+            MEM_REQ_EVAL().at(model.type);
+
+        // this is the memory required by one llama_state
+        const size_t mem_required_state =
+            scale*MEM_REQ_KV_SELF().at(model.type);
+
+        fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
+                mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
  
-    model.mapping = std::move(ml->mapping);
  #ifdef GGML_USE_CUBLAS
-    {
          const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
  
          fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+        }
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+#else
+        (void) n_gpu_layers;
+#endif
+    }
  
-        size_t vram_total = 0;
+    // populate `tensors_by_name`
+    for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+        model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
+    }
  
-        for (int i = 0; i < n_gpu; ++i) {
-            const auto & layer = model.layers[i];
+    ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
  
-            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
-            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
-            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
-            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
-            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
-            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
-            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+#ifdef GGML_USE_CUBLAS
+    {
+        size_t done_size = 0;
+        size_t data_size = 0;
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                done_size += lt.size;
+            }
          }
-        if (n_gpu_layers > (int) hparams.n_layer) {
-            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
-            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
+            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
+                continue;
+            }
+            if (progress_callback) {
+                progress_callback((float) done_size / data_size, progress_callback_user_data);
+            }
+            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
+            done_size += lt.size;
          }
+    }
+#endif // GGML_USE_CUBLAS
  
-        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    if (progress_callback) {
+        progress_callback(1.0f, progress_callback_user_data);
      }
-#else
-    (void) n_gpu_layers;
-#endif
+
+    model.mapping = std::move(ml->mapping);
  
      // loading time will be recalculate after the first eval, so
      // we take page faults deferred by mmap() into consideration
@@ -1154,10 +1230,8 @@ static bool llama_eval_internal(
          {
              cur = ggml_rms_norm(ctx0, inpL);
  
-            // cur = attention_norm*cur
-            cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
-                        cur);
+            // cur = cur*attention_norm(broadcasted)
+            cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
          }
  
          // self-attention
@@ -1264,10 +1338,8 @@ static bool llama_eval_internal(
              {
                  cur = ggml_rms_norm(ctx0, inpFF);
  
-                // cur = ffn_norm*cur
-                cur = ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
-                        cur);
+                // cur = cur*ffn_norm(broadcasted)
+                cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
              }
  
              struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
@@ -1304,10 +1376,8 @@ static bool llama_eval_internal(
  
          inpL = ggml_rms_norm(ctx0, inpL);
  
-        // inpL = norm*inpL
-        inpL = ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.norm, inpL),
-                    inpL);
+        // inpL = inpL*norm(broadcasted)
+        inpL = ggml_mul(ctx0, inpL, model.norm);
  
          embeddings = inpL;
      }
@@ -2131,7 +2201,7 @@ struct llama_context * llama_init_from_file(
              unsigned * cur_percentage_p = (unsigned *) ctx;
              unsigned percentage = (unsigned) (100 * progress);
              while (percentage > *cur_percentage_p) {
-                ++*cur_percentage_p;
+                *cur_percentage_p = percentage;
                  fprintf(stderr, ".");
                  fflush(stderr);
                  if (percentage >= 100) {
@@ -2224,7 +2294,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
      {
          uint32_t magic;
          fin.read((char *) &magic, sizeof(magic));
-        if (magic != 'ggla') {
+        if (magic != LLAMA_FILE_MAGIC_GGLA) {
              fprintf(stderr, "%s: bad file magic\n", __func__);
              return 1;
          }
@@ -2288,7 +2358,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
  
          // maybe this should in llama_model_loader
          if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
          }
      }
  
@@ -2381,7 +2451,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
                  }
                  size_t idx = model_loader->tensors_map.name_to_idx[base_name];
                  llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
-                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
+                base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] }, GGML_BACKEND_CPU);
                  lt.data = (uint8_t *) lt.ggml_tensor->data;
                  model_loader->load_data_for(lt);
                  lt.ggml_tensor->data = lt.data;
@@ -2607,8 +2677,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
  }
  
  // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
+    uint8_t * inp = src;
  
      // set rng
      {
diff --git a/examples/talk-llama/llama.h b/examples/talk-llama/llama.h

index 21cba8cf61061a0c2263054762671d632444cc34..37bae5357400f466d9d7e011cfca1bbe55d39445 100644 (file)
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@@ -19,10 +19,16 @@
  #    define LLAMA_API
  #endif
  
-#define LLAMA_FILE_VERSION           2
-#define LLAMA_FILE_MAGIC             'ggjt'
-#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
-#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_FILE_MAGIC_GGJT        0x67676a74u // 'ggjt'
+#define LLAMA_FILE_MAGIC_GGLA        0x67676c61u // 'ggla'
+#define LLAMA_FILE_MAGIC_GGMF        0x67676d66u // 'ggmf'
+#define LLAMA_FILE_MAGIC_GGML        0x67676d6cu // 'ggml'
+#define LLAMA_FILE_MAGIC_GGSN        0x6767736eu // 'ggsn'
+
+#define LLAMA_FILE_VERSION           3
+#define LLAMA_FILE_MAGIC             LLAMA_FILE_MAGIC_GGJT
+#define LLAMA_FILE_MAGIC_UNVERSIONED LLAMA_FILE_MAGIC_GGML
+#define LLAMA_SESSION_MAGIC          LLAMA_FILE_MAGIC_GGSN
  #define LLAMA_SESSION_VERSION        1
  
  #ifdef __cplusplus
@@ -40,9 +46,9 @@ extern "C" {
      typedef int llama_token;
  
      typedef struct llama_token_data {
-        llama_token id;  // token id
-        float logit; // log-odds of the token
-        float p;     // probability of the token
+        llama_token id; // token id
+        float logit;    // log-odds of the token
+        float p;        // probability of the token
      } llama_token_data;
  
      typedef struct llama_token_data_array {
@@ -55,7 +61,6 @@ extern "C" {
  
      struct llama_context_params {
          int n_ctx;        // text context
-        int n_parts;      // -1 for default
          int n_gpu_layers; // number of layers to store in VRAM
          int seed;         // RNG seed, -1 for random
  
@@ -74,16 +79,16 @@ extern "C" {
  
      // model file types
      enum llama_ftype {
-        LLAMA_FTYPE_ALL_F32     = 0,
-        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
          LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
-        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
-        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_2       = 5, // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3       = 6, // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
      };
  
      LLAMA_API struct llama_context_params llama_context_default_params();
@@ -91,6 +96,13 @@ extern "C" {
      LLAMA_API bool llama_mmap_supported();
      LLAMA_API bool llama_mlock_supported();
  
+    // TODO: not great API - very likely to change
+    // Initialize the llama + ggml backend
+    // Call once at the start of the program
+    LLAMA_API void llama_init_backend();
+
+    LLAMA_API int64_t llama_time_us();
+
      // Various functions for loading a ggml llama model.
      // Allocate (almost) all memory needed for the model.
      // Return NULL on failure
@@ -139,7 +151,7 @@ extern "C" {
  
      // Set the state reading from the specified address
      // Returns the number of bytes read
-    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src);
  
      // Save/load session file
      LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp

index 45b8cb736cdbbae226aa5e0b6cccfba548620f39..cdeb2d9bf4e98aada9c8b8ab90814a2560723057 100644 (file)
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -33,8 +33,6 @@ struct whisper_params {
      int32_t max_tokens = 32;
      int32_t audio_ctx  = 0;
  
-    int32_t n_parts_llama = -1;
-
      float vad_thold    = 0.6f;
      float freq_thold   = 100.0f;
  
@@ -72,7 +70,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "--n-parts-llama")                  { params.n_parts_llama = std::stoi(argv[++i]); }
          else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
@@ -123,7 +120,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
      fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
      fprintf(stderr, "  -ml FILE, --model-llama   [%-7s] llama model file\n",                            params.model_llama.c_str());
-    fprintf(stderr, "  --n-parts-llama N         [%-7d] num parts in llama model file\n",               params.n_parts_llama);
      fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
      fprintf(stderr, "  --prompt-file FNAME       [%-7s] file with custom prompt to start dialog\n",     "");
      fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
@@ -239,13 +235,14 @@ int main(int argc, char ** argv) {
  
      // llama init
  
+    llama_init_backend();
+
      auto lparams = llama_context_default_params();
  
      // tune these to your liking
      lparams.n_ctx      = 2048;
      lparams.seed       = 1;
      lparams.f16_kv     = true;
-    lparams.n_parts    = params.n_parts_llama;
  
      struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
author	Georgi Gerganov <redacted>
	Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)
committer	Georgi Gerganov <redacted>
	Tue, 23 May 2023 11:04:39 +0000 (14:04 +0300)
examples/talk-llama/llama-util.h		patch | blob | history
examples/talk-llama/llama.cpp		patch | blob | history
examples/talk-llama/llama.h		patch | blob | history
examples/talk-llama/talk-llama.cpp		patch | blob | history