CUDA: tighter VRAM scratch size for 65b/70b (#2551)

author Johannes Gäßler <redacted>

Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)

committer GitHub <redacted>

Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)
author Johannes Gäßler <redacted>
Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)
committer GitHub <redacted>
Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)
diff --git a/llama.cpp b/llama.cpp

index 39aefd499dd0c50cc9b52361bd8b7d652b837e12..71061aab910efc3ad73ce92fd0946fd9d9adcacf 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -149,7 +149,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
  }
  
  // amount of VRAM needed per batch size to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
  {
      static std::map<e_model, size_t> k_sizes = {
@@ -157,14 +157,14 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
          { MODEL_7B,   512ull * kB },
          { MODEL_13B,  640ull * kB },
          { MODEL_30B,  768ull * kB },
-        { MODEL_65B, 1536ull * kB },
-        { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced)
+        { MODEL_65B, 1280ull * kB },
+        { MODEL_70B, 1280ull * kB },
      };
      return k_sizes;
  }
  
  // amount of VRAM needed per batch size and context to hold temporary results
-// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+// the values for 3b are not derived from testing but instead chosen conservatively
  static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
  {
      static std::map<e_model, size_t> k_sizes = {
@@ -172,8 +172,8 @@ static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
          { MODEL_7B,  128ull },
          { MODEL_13B, 160ull },
          { MODEL_30B, 208ull },
-        { MODEL_65B, 416ull },
-        { MODEL_70B, 416ull }, // TODO (likely can be reduced)
+        { MODEL_65B, 256ull },
+        { MODEL_70B, 256ull },
      };
      return k_sizes;
  }
author	Johannes Gäßler <redacted>
	Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)
committer	GitHub <redacted>
	Tue, 8 Aug 2023 12:38:16 +0000 (14:38 +0200)