llama : fix compile warnings

author Georgi Gerganov <redacted>

Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)

committer Georgi Gerganov <redacted>

Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)
author Georgi Gerganov <redacted>
Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)
committer Georgi Gerganov <redacted>
Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)
diff --git a/ggml.c b/ggml.c

index 05889d15445534e98b78061de537286681e31c12..045768fafaef73191c99d5aa52d689f355c0e0b2 100644 (file)
--- a/ggml.c
+++ b/ggml.c
@@ -14720,12 +14720,12 @@ static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fou
      const int64_t * ne = tensor->ne;
      const size_t  * nb = tensor->nb;
  
-    fprintf(fout, "%-6s %-12s %8d %8jd %jd %jd %jd %16zu %16zu %16zu %16zu %16p %32s\n",
+    fprintf(fout, "%-6s %-12s %8d %8d %d %d %d %16zu %16zu %16zu %16zu %16p %32s\n",
              ggml_type_name(tensor->type),
              ggml_op_name  (tensor->op),
              tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
+                  nb[0],       nb[1],       nb[2],       nb[3],
              tensor->data,
              tensor->name);
  }
@@ -14734,13 +14734,13 @@ static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char
      const int64_t * ne = tensor->ne;
      const size_t  * nb = tensor->nb;
  
-    fprintf(fout, "%-6s %-6s %-12s %8d %jd %jd %jd %jd %16zu %16zu %16zu %16zu %8d %16p %32s\n",
+    fprintf(fout, "%-6s %-6s %-12s %8d %d %d %d %d %16zu %16zu %16zu %16zu %8d %16p %32s\n",
              arg,
              ggml_type_name(tensor->type),
              ggml_op_name  (tensor->op),
              tensor->n_dims,
-            ne[0], ne[1], ne[2], ne[3],
-            nb[0], nb[1], nb[2], nb[3],
+            (int) ne[0], (int) ne[1], (int) ne[2], (int) ne[3],
+                  nb[0],       nb[1],       nb[2],       nb[3],
              tensor->n_tasks,
              tensor->data,
              tensor->name);
@@ -14763,11 +14763,11 @@ void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
          FILE * fout = stdout;
  
          fprintf(fout, "\n");
-        fprintf(fout, "%-16s %8x\n",  "magic",   GGML_FILE_MAGIC);
-        fprintf(fout, "%-16s %8d\n",  "version", GGML_FILE_VERSION);
-        fprintf(fout, "%-16s %8d\n",  "leafs",   cgraph->n_leafs);
-        fprintf(fout, "%-16s %8d\n",  "nodes",   cgraph->n_nodes);
-        fprintf(fout, "%-16s %8ju\n", "eval",    size_eval);
+        fprintf(fout, "%-16s %8x\n", "magic",   GGML_FILE_MAGIC);
+        fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+        fprintf(fout, "%-16s %8d\n", "leafs",   cgraph->n_leafs);
+        fprintf(fout, "%-16s %8d\n", "nodes",   cgraph->n_nodes);
+        fprintf(fout, "%-16s %8d\n", "eval",    (int) size_eval);
  
          // header
          fprintf(fout, "\n");
diff --git a/llama.cpp b/llama.cpp

index b992321e461fc0bffdc1ba37e9f253cba4d4e6fd..cf512ccdd53009ff681c6fe70d35a30b8e9f679f 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -1059,23 +1059,23 @@ static void llama_model_load_internal(
          }
      }
  
+    (void) main_gpu;
  #if defined(GGML_USE_CUBLAS)
      fprintf(stderr, "%s: using CUDA for GPU acceleration\n", __func__);
      ggml_cuda_set_main_device(main_gpu);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU_SPLIT
  #elif defined(GGML_USE_CLBLAST)
      fprintf(stderr, "%s: using OpenCL for GPU acceleration\n", __func__);
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_GPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_GPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_GPU
  #else
-#define LLAMA_BACKEND_OFFLOAD GGML_BACKEND_CPU
+#define LLAMA_BACKEND_OFFLOAD       GGML_BACKEND_CPU
  #define LLAMA_BACKEND_OFFLOAD_SPLIT GGML_BACKEND_CPU
  #endif
  
      // prepare memory for the weights
      size_t vram_weights = 0;
-    size_t vram_scratch = 0;
      {
          const uint32_t n_embd  = hparams.n_embd;
          const uint32_t n_layer = hparams.n_layer;
@@ -1152,10 +1152,8 @@ static void llama_model_load_internal(
          fprintf(stderr, "%s: mem required  = %7.2f MB (+ %7.2f MB per state)\n", __func__,
                  mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
  
-        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
-
  #ifdef GGML_USE_CUBLAS
-        vram_scratch = n_batch * MB;
+        const size_t vram_scratch = n_batch * MB;
          ggml_cuda_set_scratch_size(vram_scratch);
          if (n_gpu_layers > 0) {
              fprintf(stderr, "%s: allocating batch_size x 1 MB = %ld MB VRAM for the scratch buffer\n",
@@ -1163,6 +1161,8 @@ static void llama_model_load_internal(
          }
  #endif // GGML_USE_CUBLAS
  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
          fprintf(stderr, "%s: offloading %d layers to GPU\n", __func__, n_gpu);
          if (n_gpu_layers > (int) hparams.n_layer) {
              fprintf(stderr, "%s: offloading output layer to GPU\n", __func__);
@@ -1331,6 +1331,7 @@ static bool llama_eval_internal(
      struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
  
      const int i_gpu_start = n_layer - n_gpu_layers;
+    (void) i_gpu_start;
  
      for (int il = 0; il < n_layer; ++il) {
          offload_func_t offload_func = llama_nop;
author	Georgi Gerganov <redacted>
	Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)
committer	Georgi Gerganov <redacted>
	Tue, 6 Jun 2023 19:41:53 +0000 (22:41 +0300)