chore : correct typos [no ci] (#20041)

author Marcel Petrick <redacted>

Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)

committer GitHub <redacted>

Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)
author Marcel Petrick <redacted>
Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)
committer GitHub <redacted>
Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md

index 7545e790f82483c3b445882462273a439b339ef5..0fe627f4e7f52f937c8bf3312e230ad825add7a7 100644 (file)
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -159,7 +159,7 @@ Maintainers reserve the right to decline review or close pull requests for any r
  
  # Code maintenance
  
-- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file reponsible for:
+- Existing code should have designated collaborators and/or maintainers specified in the [CODEOWNERS](CODEOWNERS) file responsible for:
    - Reviewing and merging related PRs
    - Fixing related bugs
    - Providing developer guidance/support
diff --git a/common/arg.cpp b/common/arg.cpp

index 05f4a5244e73723942a9034a9b953ae4660b2ea9..0260d79fef02f04408b63db3b9c3f43f0394301f 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2399,7 +2399,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                  params.fit_params = false;
              } else {
                  throw std::runtime_error(
-                    string_format("error: unkown value for --fit: '%s'\n", value.c_str()));
+                    string_format("error: unknown value for --fit: '%s'\n", value.c_str()));
              }
          }
      ).set_env("LLAMA_ARG_FIT"));
diff --git a/common/common.h b/common/common.h

index c5a8037571374d63b56282e05d9435c02956091a..ae32d5053c5192b88872940ec0aedcfc884cedb9 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -869,7 +869,7 @@ std::string common_detokenize(
  // Embedding utils
  //
  
-// TODO: repace embd_norm with an enum
+// TODO: replace embd_norm with an enum
  void common_embd_normalize(const float * inp, float * out, int n, int embd_norm);
  
  float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);
diff --git a/common/debug.h b/common/debug.h

index 0c5596325865a2b135cf8a3eebbd55710adc2978..e563b40d68f2e211a60cd132372fc711e4d26041 100644 (file)
--- a/common/debug.h
+++ b/common/debug.h
@@ -18,7 +18,7 @@ template <bool abort_on_nan> void common_debug_print_tensor(uint8_t * data, ggml
  // prints tensors that are processed in the computation graph
  // by default prints all tensors, but can be configured by creating a `base_callback_data` instance with
  // non-empty filter_patterns. See examples/debug.ccp for possible usage patterns
-// The template parameter determins whether an error should be thrown whenever a NaN is encountered
+// The template parameter determines whether an error should be thrown whenever a NaN is encountered
  // in a tensor (useful for stopping debug sessions on first erroneous tensor)
  // The callback data will be passed as the third parameter (user_data)
  template <bool abort_on_nan> bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data);
diff --git a/common/jinja/README.md b/common/jinja/README.md

index 7059105ee398592651eb66b5623cf725bb33c4f4..8291240767e8564d194529b715246a1775e70e0d 100644 (file)
--- a/common/jinja/README.md
+++ b/common/jinja/README.md
@@ -63,7 +63,7 @@ The llama.cpp Jinja engine introduces `jinja::string` (see `jinja/string.h`), wh
    - **One-to-many** (e.g., split): result is marked `is_input` **only if ALL** input parts are marked `is_input`
    - **Many-to-one** (e.g., join): same as one-to-many
  
-For string concatenation, string parts will be appended to the new string as-is, while perserving the `is_input` flag.
+For string concatenation, string parts will be appended to the new string as-is, while preserving the `is_input` flag.
  
  **Enabling Input Marking:**
  
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index 09544173981afb424ce54e6463da0db003d72935..a6d259a640480b57c9e26ce4e99ff7dfdb1a7c49 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -4031,7 +4031,7 @@ class Qwen2VLVisionModel(MmprojModel):
                  # split Conv3D into Conv2Ds
                  c1, c2, kt, kh, kw = data_torch.shape
                  del c1, c2, kh, kw  # unused
-                assert kt == 2, "Current implmentation only support temporal_patch_size of 2"
+                assert kt == 2, "Current implementation only support temporal_patch_size of 2"
                  yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight"  , data_torch[:, :, 0, ...])
                  yield (gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_ENC_EMBD_PATCH] + ".weight.1", data_torch[:, :, 1, ...])
              else:
@@ -5404,7 +5404,7 @@ class KimiLinearModel(TextModel):
          # Get ssm_d_conv from linear_attn_config.short_conv_kernel_size or ssm_d_conv
          linear_attn_config = self.hparams["linear_attn_config"]
          # n_head == 0 for KDA layers, n_head > 0 for MLA layers
-        # full_attention_layers list will be used to distingush layer type
+        # full_attention_layers list will be used to distinguish layer type
          _num_kv_heads = list()
          _full_attn_layers = linear_attn_config["full_attn_layers"]
          for il in range(self.hparams["num_hidden_layers"]):
@@ -6505,7 +6505,7 @@ class Gemma3VisionModel(MmprojModel):
          super().set_gguf_parameters()
          hparams = self.hparams
          self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GEMMA3)
-        # default values below are taken from HF tranformers code
+        # default values below are taken from HF transformers code
          self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6))
          self.gguf_writer.add_vision_use_gelu(True)
          # calculate proj_scale_factor (used by tinygemma3 test model)
@@ -7097,7 +7097,7 @@ class Rwkv7Model(TextModel):
  
              if bid == 0 and "time_mix_a" in new_name:
                  # dummy v0/v1/v2 on first layer
-                # easist way to make llama happy
+                # easiest way to make llama happy
                  yield (new_name.replace("time_mix_a", "time_mix_v"), data_torch)
  
              yield (new_name, data_torch)
@@ -9596,7 +9596,7 @@ class GraniteHybridModel(Mamba2Model, GraniteMoeModel):
          # NOTE: Explicitly include hparam prefix prefix for d_model to
          #   disambiguate with top-level head_dim
          # NOTE 2: If needed for future models, this can be isolated in a method
-        #   to separate the prefix setting and teh keys used
+        #   to separate the prefix setting and the keys used
          self.d_model = self.find_hparam([f"{self.hparam_prefixes[0]}_head_dim", "hidden_size", "d_model"])
          self.n_group = self.find_hparam(["n_groups", "num_groups"])
          self.d_inner = self.find_hparam(["expand", "num_heads"]) * self.d_model
@@ -9743,7 +9743,7 @@ class NemotronHModel(GraniteHybridModel):
          self.gguf_writer.add_value_length(self.head_dim)
  
          # Set feed_forward_length
-        # NOTE: This will trigger an override warning. This is preferrable to
+        # NOTE: This will trigger an override warning. This is preferable to
          #   duplicating all the parent logic
          if not self.is_moe:
              n_ff = self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"])
diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md

index 23b6a627634984910b2a75e55257551c4c5497d7..51adaaf95f5df9a5736da843f1cbe4e906b8adce 100755 (executable)
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -20,7 +20,7 @@
  
  **Llama.cpp + CANN**
  
-The llama.cpp CANN backend is designed to support Ascend NPU. It utilize the ability of AscendC and ACLNN which are intergrated to CANN Toolkit and kernels to using Ascend NPU directly.
+The llama.cpp CANN backend is designed to support Ascend NPU. It utilizes the abilities of AscendC and ACLNN, which are integrated into the CANN Toolkit and kernels, to use the Ascend NPU directly.
  
  ## News
  
@@ -210,7 +210,7 @@ docker run --name llamacpp --device /dev/davinci0  --device /dev/davinci_manager
      # and install driver.
      sudo sh Ascend-hdk-910b-npu-firmware_x.x.x.x.X.run --full
      ```
-    If the following messaage appers, firmware is installed successfully.
+    If the following message appears, firmware is installed successfully.
      ```sh
      Firmware package installed successfully!
      ```
diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md

index 07c68be5cbd995b61e9c199442c33156e0522add..dd4c66dbe954f4266e313a3e9d10be6c10ec673a 100644 (file)
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -708,7 +708,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
  
    - Remove **build** folder or try a clean-build.
  
-- I can **not** see `[ext_oneapi_level_zero:gpu]` afer installing the GPU driver on Linux.
+- I can **not** see `[ext_oneapi_level_zero:gpu]` after installing the GPU driver on Linux.
  
    Please double-check with `sudo sycl-ls`.
  
diff --git a/docs/backend/snapdragon/README.md b/docs/backend/snapdragon/README.md

index 2c3f88e91a2747c7ed702b23c7ef380818a1f3e1..0783555ce8a63e12f921825614131f84e33f209a 100644 (file)
--- a/docs/backend/snapdragon/README.md
+++ b/docs/backend/snapdragon/README.md
@@ -116,7 +116,7 @@ Llama-3.2-1B-Instruct-Q4_0.gguf: 1 file pushed, 0 skipped. 38.3 MB/s (773025920
  ### Windows
  
  All artifacts are already installed in the `pkg-snapdragon` folder.
-To run, adapt below instructions to use Powershell scrits in `scripts/snapdragon/windows`.
+To run, adapt the instructions below to use the PowerShell scripts in `scripts/snapdragon/windows`.
  
  ## How to Run
  
diff --git a/docs/backend/snapdragon/windows.md b/docs/backend/snapdragon/windows.md

index e9346ccadf1f8d60134ffba9c5be004a2f955ba1..6307e1b69f1e04398c56c0df1e78941bcecc1249 100644 (file)
--- a/docs/backend/snapdragon/windows.md
+++ b/docs/backend/snapdragon/windows.md
@@ -144,7 +144,7 @@ Once the build is complete HTP ops libraries will be installed like this
  -a----         1/22/2026   6:01 PM           4139 libggml-htp.cat
  ```
  
-The .cat file, the signature and proper certicate installation can be verified with
+The .cat file, the signature and proper certificate installation can be verified with
  
  ```
  > signtool.exe verify /v /pa .\pkg-snapdragon\lib\libggml-htp.cat
diff --git a/docs/build.md b/docs/build.md

index e6f572c77f3caaaeae8f571734bdc8d6d7f3ddce..772731f6418472c926bfe3c419baf5faa984b967 100644 (file)
--- a/docs/build.md
+++ b/docs/build.md
@@ -595,7 +595,7 @@ You can verify that KleidiAI is being used by running
  ```bash
  ./build/bin/llama-cli -m PATH_TO_MODEL -p "What is a car?"
  ```
-If KleidiAI is enabled, the ouput will contain a line similar to:
+If KleidiAI is enabled, the output will contain a line similar to:
  ```
  load_tensors: CPU_KLEIDIAI model buffer size =  3474.00 MiB
  ```
@@ -699,7 +699,7 @@ To read documentation for how to build on Android, [click here](./android.md)
  
  ## WebGPU [In Progress]
  
-The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The currrent implementation is up-to-date with Dawn commit `bed1a61`.
+The WebGPU backend relies on [Dawn](https://dawn.googlesource.com/dawn). Follow the instructions [here](https://dawn.googlesource.com/dawn/+/refs/heads/main/docs/quickstart-cmake.md) to install Dawn locally so that llama.cpp can find it using CMake. The current implementation is up-to-date with Dawn commit `bed1a61`.
  
  In the llama.cpp directory, build with CMake:
  
diff --git a/docs/multimodal/MobileVLM.md b/docs/multimodal/MobileVLM.md

index 3bfab9f3d2291f2bfbdbb5a881fbd59406123381..6c17dbf902eafc6d262d6faa846ff5999fd6d8ab 100644 (file)
--- a/docs/multimodal/MobileVLM.md
+++ b/docs/multimodal/MobileVLM.md
@@ -281,7 +281,7 @@ llama_print_timings:       total time =    5990.25 ms /   202 tokens
  
  Just the same as above.
  
-**ouput**
+**output**
  ```sh
  encode_image_with_clip: image embedding created: 144 tokens
  
@@ -305,7 +305,7 @@ llama_print_timings:       total time =   15513.95 ms /   412 tokens
  ## Run on Intel(R) Core(TM) Ultra7 115H
  ### operation system
  Windows11
-### comiple
+### compile
  ```sh
  make -j32
  ```
diff --git a/examples/debug/README.md b/examples/debug/README.md

index 28e00c934270a7590d296c9a0e20ed7f8435f707..2ea716eb54303df423df0c81bcf1f68d0d3501f4 100644 (file)
--- a/examples/debug/README.md
+++ b/examples/debug/README.md
@@ -2,7 +2,7 @@
  
  This is a utility intended to help debug a model by registering a callback that
  logs GGML operations and tensor data. It can also store the generated logits or
-embeddings as well as the prompt and token ids for comparision with the original
+embeddings as well as the prompt and token ids for comparison with the original
  model.
  
  ### Usage
diff --git a/examples/diffusion/README.md b/examples/diffusion/README.md

index f71d2413193aa218980eddf0b619e1c9b503d107..b3942002147afde481ff47fe6f04121f1d74eb17 100644 (file)
--- a/examples/diffusion/README.md
+++ b/examples/diffusion/README.md
@@ -43,12 +43,12 @@ Choose one of the following scheduling methods:
  - `-b`: Batch size
  
  ### Examples
-#### Dream architechture:
+#### Dream architecture:
  ```
  llama-diffusion-cli -m dream7b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-eps 0.001 --diffusion-algorithm 3 --diffusion-steps 256 --diffusion-visual
  ```
  
-#### LLaDA architechture:
+#### LLaDA architecture:
  ```
  llama-diffusion-cli -m llada-8b.gguf -p "write code to train MNIST in pytorch" -ub 512 --diffusion-block-length 32 --diffusion-steps 256 --diffusion-visual
  ```
diff --git a/examples/llama.vim b/examples/llama.vim

index 736802d365541583cb806d27e0a6bdcc9809104c..23a281fc333259daf83138784048f7b6f6b0f89e 100644 (file)
--- a/examples/llama.vim
+++ b/examples/llama.vim
@@ -52,8 +52,8 @@ highlight llama_hl_info guifg=#77ff2f ctermfg=119
  "   n_prefix:         number of lines before the cursor location to include in the local prefix
  "   n_suffix:         number of lines after  the cursor location to include in the local suffix
  "   n_predict:        max number of tokens to predict
-"   t_max_prompt_ms:  max alloted time for the prompt processing (TODO: not yet supported)
-"   t_max_predict_ms: max alloted time for the prediction
+"   t_max_prompt_ms:  max allotted time for the prompt processing (TODO: not yet supported)
+"   t_max_predict_ms: max allotted time for the prediction
  "   show_info:        show extra info about the inference (0 - disabled, 1 - statusline, 2 - inline)
  "   auto_fim:         trigger FIM completion automatically on cursor movement
  "   max_line_suffix:  do not auto-trigger FIM completion if there are more than this number of characters to the right of the cursor
diff --git a/examples/model-conversion/README.md b/examples/model-conversion/README.md

index 637870a5c15cf92f3b9e8aeccd1b81b0db393561..c43e642fee7bd2ab6ae0476f245bd4536bc56863 100644 (file)
--- a/examples/model-conversion/README.md
+++ b/examples/model-conversion/README.md
@@ -69,7 +69,7 @@ Command line arguments take precedence over environment variables when both are
  
  In cases where the transformer implementation for the model has not been released
  yet it is possible to set the environment variable `UNRELEASED_MODEL_NAME` which
-will then cause the transformer implementation to be loaded explicitely and not
+will then cause the transformer implementation to be loaded explicitly and not
  use AutoModelForCausalLM:
  ```
  export UNRELEASED_MODEL_NAME=SomeNewModel
@@ -120,7 +120,7 @@ The converted model can be inspected using the following command:
  (venv) $ make causal-run-converted-model
  ```
  
-### Model logits verfication
+### Model logits verification
  The following target will run the original model and the converted model and
  compare the logits:
  ```console
@@ -235,7 +235,7 @@ new model the model can be converted to GGUF format using the following command:
  (venv) $ make embedding-run-converted-model
  ```
  
-### Model logits verfication
+### Model logits verification
  The following target will run the original model and the converted model (which
  was done manually in the previous steps) and compare the logits:
  ```console
@@ -335,7 +335,7 @@ $ make perplexity-run-full QUANTIZED_MODEL=~/path/to/quantized/model-Qxx.gguf LO
  
  ## HuggingFace utilities
  The following targets are useful for creating collections and model repositories
-on Hugging Face in the the ggml-org. These can be used when preparing a relase
+on Hugging Face in the ggml-org. These can be used when preparing a release
  to script the process for new model releases.
  
  For the following targets a `HF_TOKEN` environment variable is required.
@@ -347,7 +347,7 @@ For the following targets a `HF_TOKEN` environment variable is required.
  > $ unset HF_TOKEN
  
  ### Create a new Hugging Face Model (model repository)
-This will create a new model repsository on Hugging Face with the specified
+This will create a new model repository on Hugging Face with the specified
  model name.
  ```console
  (venv) $ make hf-create-model MODEL_NAME='TestModel' NAMESPACE="danbev" ORIGINAL_BASE_MODEL="some-base-model"
diff --git a/examples/sycl/README.md b/examples/sycl/README.md

index 8819d87f56ec2251447fecbf7da61b0c71fcffbf..29143dd6176f9be08d5501dd1711cbea2c6399d9 100644 (file)
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -6,11 +6,11 @@ This example program provides the tools for llama.cpp for SYCL on Intel GPU.
  
  |Tool Name| Function|Status|
  |-|-|-|
-|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+|llama-ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, etc.|Support|
  
  ### llama-ls-sycl-device
  
-List all SYCL devices with ID, compute capability, max work group size, ect.
+List all SYCL devices with ID, compute capability, max work group size, etc.
  
  1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.
  
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h

index a9d1778641e8ca2c0da88f158dea4ed1787b2cb0..9fd3f7f32a027353cbdccbc08d5cc5db04927347 100644 (file)
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -259,7 +259,7 @@ extern "C" {
        Example usage:
  
          // operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
-        // preferrably to run on the same backend as the buffer
+        // preferably to run on the same backend as the buffer
          ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
  
          sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);
diff --git a/ggml/include/ggml-opt.h b/ggml/include/ggml-opt.h

index 4703a05afe198cc41581efb31700932498b5e929..1c2ed79b77420e74fd3b81b83367109164380853 100644 (file)
--- a/ggml/include/ggml-opt.h
+++ b/ggml/include/ggml-opt.h
@@ -138,7 +138,7 @@ extern "C" {
      GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
      GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
  
-    // set gradients to zero, initilize loss, and optionally reset the optimizer
+    // set gradients to zero, initialize loss, and optionally reset the optimizer
      GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
  
      GGML_API bool ggml_opt_static_graphs(ggml_opt_context_t opt_ctx); // whether the graphs are allocated_statically
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h

index fcc51f1f71a4b5e72d7c88386fb8ccc3e3ba5071..784d69206b4a174972a0b79223dd39e1de728892 100644 (file)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2575,7 +2575,7 @@ extern "C" {
          struct ggml_tensor *  grad,
          struct ggml_tensor *  sgd_params); // alpha, weight decay
  
-    // build forward mutiple tensors and select one of them for computing
+    // build forward multiple tensors and select one of them for computing
      // this is useful for creating graphs that have constant topology but compute different things based on the input
      // ref: https://github.com/ggml-org/llama.cpp/pull/18550
      //
diff --git a/ggml/src/ggml-cpu/amx/mmq.cpp b/ggml/src/ggml-cpu/amx/mmq.cpp

index b5aca76633c6c5a258148b1715390cfe6c714fd5..93a6d397f79e9ee62b3d804d49a590b1c52be5cb 100644 (file)
--- a/ggml/src/ggml-cpu/amx/mmq.cpp
+++ b/ggml/src/ggml-cpu/amx/mmq.cpp
@@ -195,7 +195,7 @@ struct tile_config_t{
  // will be needed.
  //
  // Here another commonly used pattern 1-3-3 is skipped, as it is mostly used when m <=16;
-// and the sinlge batch gemm (m=1) has a special fast path with `avx512-vnni`.
+// and the single batch gemm (m=1) has a special fast path with `avx512-vnni`.
  //
  // ref: https://www.intel.com/content/www/us/en/developer/articles/code-sample/
  //    advanced-matrix-extensions-intrinsics-functions.html
@@ -1379,8 +1379,8 @@ struct tinygemm_kernel_vnni<block_q8_0, block_q4_0, float, BLOCK_M, BLOCK_N, BLO
          // sum of offsets, shared across COLS
          //
          // avx512-vnni does not have `_mm512_dpbssd_epi32`,
-        // need to transfrom ss to us:
-        //   a * (b - 8) is equavilent to b * a - 8 * a
+        // need to transform ss to us:
+        //   a * (b - 8) is equivalent to b * a - 8 * a
          //   s    u   u                   u   s   u   s
          //
          __m512i vcomp;
diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c

index b390ab61c7851afc4e30e7a813c5d6bbd5634979..a707d63985e0bd8fa31b89dc65987b99e817f7be 100644 (file)
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@@ -968,7 +968,7 @@ void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const voi
  
      const int vector_length = ggml_cpu_get_sve_cnt()*8;
  
-    //VLA Implemenation for SVE
+    //VLA Implementation for SVE
      switch (vector_length) {
          case 128:
              {
diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp

index 3eed0105bf1ed9e8b3ca025e3401b6929708cdba..80ff5ce549bd4b1e04fc0828c204658a1ffb4857 100644 (file)
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@@ -781,7 +781,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int                        n,
  
                  const uint8_t * q4_base = q4_ptr[b].qs + sb * QK_K;
  
-                // Load the 64 quants from q8K duplicated to use vecdots with the interelaved columns
+                // Load the 64 quants from q8K duplicated to use vecdots with the interleaved columns
                  // but still need the qs to use the low and hi bits from q4
                  const int8_t * q8_base = q8_ptr[b].qs + sb * 64;
                  int8x16_t      q8_qs[8];
@@ -3796,7 +3796,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int                        n,
  
                  for (int b = 0; b < nb; b++) {
                      // bsums pairs belongs to the same q8_k subblock
-                    // 64 elemnts loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
+                    // 64 elements loaded and made sum of 0-7 and 8-15 sum || 16-23 and 24 - 31 sum
                      const int16x8_t bsums[4]{
                          vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 0), vld1q_s16(q8_ptr[b].bsums + 16 * 0 + 8)),
                          vpaddq_s16(vld1q_s16(q8_ptr[b].bsums + 16 * 1), vld1q_s16(q8_ptr[b].bsums + 16 * 1 + 8)),
diff --git a/ggml/src/ggml-cpu/arch/x86/repack.cpp b/ggml/src/ggml-cpu/arch/x86/repack.cpp

index bd6906c4159e7243d9c9cce548931f1372607638..33c6cb650987d81d14ec21ecf17de445c559c209 100644 (file)
--- a/ggml/src/ggml-cpu/arch/x86/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/x86/repack.cpp
@@ -423,7 +423,7 @@ void ggml_quantize_mat_q8_K_4x8(const float * GGML_RESTRICT x, void * GGML_RESTR
              quants_interleaved[j] = i0;
          }
  
-        // Masks to shuffle the quants of corresonding sub blocks for rearraning quants for vectorized bsums computation
+        // Masks to shuffle the quants of corresponding sub blocks for rearranging quants for vectorized bsums computation
          __m256i shuffle_mask_sb2 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 0, 1, 4, 5, 6, 7, 8, 9, 8, 9, 12, 13, 14, 15));
          shuffle_mask_sb2 = _mm256_permute2f128_si256(shuffle_mask_sb2, shuffle_mask_sb2, 0);
          __m256i shuffle_mask_sb3 = _mm256_castsi128_si256(_mm_setr_epi8(0, 1, 2, 3, 0, 1, 6, 7, 8, 9, 10, 11, 8, 9, 14, 15));
@@ -625,7 +625,7 @@ static void gemv_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                  iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(rhs_vec_0123_3 ,_mm256_shuffle_epi32(rhs_vec_4567_3, 177), 170), _mm256_shuffle_epi32(lhs_vec_1, 170));
                  iacc = mul_sum_i8_pairs_acc_int32x8(iacc, _mm256_blend_epi32(_mm256_shuffle_epi32(rhs_vec_0123_3, 177) ,rhs_vec_4567_3, 170), _mm256_shuffle_epi32(lhs_vec_1, 255));
  
-                // Accumulated values multipled with appropriate scales
+                // Accumulated values multiplied with appropriate scales
                  acc_row = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc), _mm256_mul_ps(col_scale_f32, row_scale_f32), acc_row);
              }
  
@@ -868,7 +868,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                      const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptrs[rp][b].d), loadMask), 68);
                      const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
  
-                    // Multiply with appropiate scales and accumulate
+                    // Multiply with appropriate scales and accumulate
                      acc_rows[rp * 4]     = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[rp * 4]);
                      acc_rows[rp * 4 + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[rp * 4 + 1]);
                      acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -1076,7 +1076,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                  const __m128i row_scale_f16 = _mm_shuffle_epi32(_mm_maskload_epi32((int const*)(a_ptr[b].d), loadMask), 68);
                  const __m512 row_scale_f32 = GGML_F32Cx16_REPEAT_LOAD(row_scale_f16);
  
-                // Multiply with appropiate scales and accumulate
+                // Multiply with appropriate scales and accumulate
                  acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)),   acc_rows[0]);
                  acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)),  acc_rows[1]);
                  acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@@ -1257,7 +1257,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                      // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
                      const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
  
-                    // Multiply with appropiate scales and accumulate
+                    // Multiply with appropriate scales and accumulate
                      acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                      acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                      acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -1428,7 +1428,7 @@ static void gemm_q4_b32_8x8_q8_0_lut_avx(int n, float * GGML_RESTRICT s, size_t
                  // Load the scale(d) values for all the 4 Q8_0 blocks and repeat it across lanes
                  const __m256 row_scale_f32 = GGML_F32Cx8_REPEAT_LOAD(a_ptr[b].d, loadMask);
  
-                // Multiply with appropiate scales and accumulate
+                // Multiply with appropriate scales and accumulate
                  acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                  acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                  acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@@ -1612,7 +1612,7 @@ void ggml_gemv_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      lhs_vec_11 = _mm256_permute2f128_si256(lhs_vec_11, lhs_vec_11, 0);
  
                      // Dot product done within 32 bit lanes and accumulated in the same vector
-                    // First done for first sub block and thenn for second sub block in each sb
+                    // First done for first sub block and then for second sub block in each sb
                      // B0(0-3) B4(0-3) B1(0-3) B5(0-3) B2(0-3) B6(0-3) B3(0-3) B7(0-3) with A0(0-3)
                      // B0(4-7) B4(4-7) B1(4-7) B5(4-7) B2(4-7) B6(4-7) B3(4-7) B7(4-7) with A0(4-7)
                      // ...........................................................................
@@ -2422,7 +2422,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                          const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                          const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
  
-                        // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                          acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                          acc_rows[rp * 4  + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                          acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -2785,7 +2785,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                      const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
  
-                    // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                      acc_rows[0] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                      acc_rows[1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                      acc_rows[2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@@ -2802,7 +2802,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
                  }
              }
-            // Store accumlated values
+            // Store accumulated values
              for (int i = 0; i < 4; i++) {
                  _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
              }
@@ -3130,7 +3130,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                          const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
                          const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);//GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
  
-                        // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                          acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                          acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                          acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -3460,7 +3460,7 @@ void ggml_gemm_q4_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
                      const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse); //GGML_F32Cx8_REPEAT_LOAD(a_ptrs[rp][b].d, loadMask);
  
-                    // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                      acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                      acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                      acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
@@ -4268,7 +4268,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                          const __m256 row_scale_f32_ymm = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
                          const __m512 row_scale_f32 = _mm512_insertf32x8(_mm512_castps256_ps512(row_scale_f32_ymm), row_scale_f32_ymm, 1);
  
-                        // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                          acc_rows[rp * 4] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_0), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                          acc_rows[rp * 4  + 1] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_1), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                          acc_rows[rp * 4 + 2] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_2), _mm512_mul_ps(col_scale_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -5035,7 +5035,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      acc_min_rows[3] = _mm512_fmadd_ps(_mm512_cvtepi32_ps(iacc_row_min_3), _mm512_mul_ps(col_dmin_f32, _mm512_shuffle_ps(row_scale_f32, row_scale_f32, 255)), acc_min_rows[3]);
                  }
              }
-            // Store accumlated values
+            // Store accumulated values
              for (int i = 0; i < 4; i++) {
                  _mm512_storeu_ps((float * )(s + ((y * 4 + i) * bs + x * 8)), _mm512_sub_ps(acc_rows[i], acc_min_rows[i]));
              }
@@ -5677,7 +5677,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                          const __m128 row_scale_f32_sse = _mm_load_ps(a_ptrs[rp][b].d);
                          const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
  
-                        // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                        // Multiply with appropriate scales and accumulate (for both d and dmin) below
                          acc_rows[rp * 4] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[rp * 4]);
                          acc_rows[rp * 4 + 1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[rp * 4 + 1]);
                          acc_rows[rp * 4 + 2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[rp * 4 + 2]);
@@ -6349,7 +6349,7 @@ void ggml_gemm_q2_K_8x8_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const vo
                      const __m128 row_scale_f32_sse = _mm_load_ps(a_ptr[b].d);
                      const __m256 row_scale_f32 = _mm256_set_m128(row_scale_f32_sse, row_scale_f32_sse);
  
-                    // Multiply with appropiate scales and accumulate (for both d and dmin) below
+                    // Multiply with appropriate scales and accumulate (for both d and dmin) below
                      acc_rows[0] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_0), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 0)), acc_rows[0]);
                      acc_rows[1] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_1), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 85)), acc_rows[1]);
                      acc_rows[2] = _mm256_fmadd_ps(_mm256_cvtepi32_ps(iacc_row_2), _mm256_mul_ps(col_scale_f32, _mm256_shuffle_ps(row_scale_f32, row_scale_f32, 170)), acc_rows[2]);
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c

index 64eb01a4e18c90f884430830fb7c0f741ef9bbcd..7c4026fac4e067ca9aafb146a9760ff1dc337988 100644 (file)
--- a/ggml/src/ggml-cpu/ggml-cpu.c
+++ b/ggml/src/ggml-cpu/ggml-cpu.c
@@ -2477,7 +2477,7 @@ static bool ggml_thread_apply_priority(int32_t prio) {
  
      if (prio != GGML_SCHED_PRIO_LOW) {
          // Tell Windows that this thread should not be throttled (needs its own CPU core).
-        // Newer Windows 11 versions aggresively park (offline) CPU cores and often place
+        // Newer Windows 11 versions aggressively park (offline) CPU cores and often place
          // all our threads onto the first 4 cores which results in terrible performance with
          // n_threads > 4
          #if _WIN32_WINNT >= 0x0602
diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp

index da412fd009b90cfaae57823544280a2d13832a3f..5fd452a03d2cd093cfd6ee8c58d7c2ed90cba2d7 100644 (file)
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@@ -533,7 +533,7 @@ class tinyBLAS {
          if constexpr (RN > 1) {
              return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
          } else {
-            GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+            GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
              GGML_ASSERT(false); // we have miss something.
          }
      }
@@ -711,7 +711,7 @@ class tinyBLAS_RVV {
          if constexpr (RN > 1) {
              return mnpack<RM, RN-1, BM>(m, n, SIZE_N, BN);
          } else {
-            GGML_LOG_ERROR("mnpack<%d, %d> bloc size not supported\n", RM, (int)SIZE_N);
+            GGML_LOG_ERROR("mnpack<%d, %d> block size not supported\n", RM, (int)SIZE_N);
              GGML_ASSERT(false); // we have miss something.
          }
      }
diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp

index b7a70e06f1d032ae2c31ae13635e4dc04e4f5583..ca1b3059b8c0dcddb15380f86f81bb052743e00a 100644 (file)
--- a/ggml/src/ggml-cpu/ops.cpp
+++ b/ggml/src/ggml-cpu/ops.cpp
@@ -375,7 +375,7 @@ static void ggml_compute_forward_dup_bytes(
          const size_t rs = ne00 * type_size;
  
          if (nb00 == type_size) {
-            // src0 is contigous on first dimension, copy by rows
+            // src0 is contiguous on first dimension, copy by rows
              for (int64_t i03 = 0; i03 < ne03; i03++) {
                  for (int64_t i02 = 0; i02 < ne02; i02++) {
                      id += rs * ir0;
@@ -1795,7 +1795,7 @@ void ggml_compute_forward_repeat(
              {
                  ggml_compute_forward_repeat_f32(params, dst);
              } break;
-        // TODO: templateify the implemenation and support for I64
+        // TODO: templateify the implementation and support for I64
          //       ref https://github.com/ggml-org/llama.cpp/pull/14274#discussion_r2169492225
          //case GGML_TYPE_I64:
          //    {
diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp

index 5edba4212f617ee124d734f8d6f406c9557dc858..02c3cc3119ba273d0072724b08f19e1faedcfacc 100644 (file)
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@@ -3032,7 +3032,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
              case GGML_OP_MUL_MAT_ID:
                  {
                      size = ggml_row_size(PARAM_TYPE, ggml_nelements(op->src[1]));
-                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for next bloc.
+                    size = GGML_PAD(size, sizeof(int64_t)); // + padding for next block.
  
                      const int64_t ne02 = op->src[0]->ne[2]; // n_as, n_expert
                      const int64_t ne12 = op->src[1]->ne[2]; // n_tokens
@@ -3297,7 +3297,7 @@ template <typename BLOC_TYPE, int64_t INTER_SIZE, int64_t NB_COLS, ggml_type PAR
          auto * wdata          = (char *)params->wdata;
          auto * wdata_src1_end = (char *)wdata + GGML_PAD(nbw3, sizeof(int64_t));
  
-        // total of [n_as][ne12 + 1] elemets of type mmid_row_mapping (2*int32_t = int64_t)
+        // total of [n_as][ne12 + 1] elements of type mmid_row_mapping (2*int32_t = int64_t)
          auto * matrix_row_counts = (int64_t *) (wdata_src1_end);                                        // [n_as]
          struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *) (matrix_row_counts + n_as); // [n_as][ne12]
  
diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh

index beb7e32e4fc35a08c4d10244664821508ac19e9b..fff70c8eb89fbe42db8a8c2ea0693d667c4945bd 100644 (file)
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -1215,7 +1215,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
      }
  
      // If attention sinks are used, potentially re-scale if KQ_max is small.
-    // Also add the sink as a value to KQ_rowsum, this is done after synchonization of KQ_rowsum
+    // Also add the sink as a value to KQ_rowsum, this is done after synchronization of KQ_rowsum
      //     so it's being done unconditionally for every thread.
      if (!is_fixup && (np == 1 || threadIdx.y % np == 0) && sinks_f) {
          float KQ_max_scale[cols_per_thread];
diff --git a/ggml/src/ggml-cuda/fattn-vec.cuh b/ggml/src/ggml-cuda/fattn-vec.cuh

index 3f4a78cc6e5465a58d7d0ee1f2a353ada5c9706a..7cbe32633e5877cd1c68a83fde5719950c23363f 100644 (file)
--- a/ggml/src/ggml-cuda/fattn-vec.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec.cuh
@@ -10,7 +10,7 @@ static constexpr __device__ int ggml_cuda_fattn_vec_get_nthreads_device() {
      return 128;
  }
  
-// Currenlty llvm with the amdgcn target does not support unrolling loops
+// Currently llvm with the amdgcn target does not support unrolling loops
  // that contain a break that can not be resolved at compile time.
  #ifdef __clang__
  #pragma clang diagnostic push
diff --git a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh

index cd3bfd4051a40b4d5da00d3ec8f0a83aa8a0da9e..aaf711a618cb5549cc0d37ab6216c44b7e3a7e52 100644 (file)
--- a/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-wmma-f16.cuh
@@ -18,7 +18,7 @@
  #if defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
  #define GGML_USE_WMMA_FATTN
  #elif defined(RDNA4)
-#warning "rocwmma fattn is not suported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
+#warning "rocwmma fattn is not supported on RDNA4 on rocwmma < v2.0.0, expect degraded performance"
  #endif // defined(RDNA4) && ROCWMMA_VERSION_MAJOR > 1
  #endif // defined(GGML_HIP_ROCWMMA_FATTN)
  
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index 7e6d3303549a558a10c6408636f251c828cc2dc2..b56e3d50f58648b175feca07ed1c3515f598422a 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3330,7 +3330,7 @@ static bool ggml_cuda_can_fuse(const struct ggml_cgraph *                cgraph,
              return false;
          }
  
-        //rms_norm kernel assumes contigous rows
+        //rms_norm kernel assumes contiguous rows
          if (!ggml_is_contiguous_rows(mul->src[0]) || !ggml_is_contiguous_rows(mul->src[1])) {
              return false;
          }
diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu

index a8c68e44b16ee3f721e867af7f6662d8abc9133a..4300ffc148cff279265dfb2bb90092d560692fb0 100644 (file)
--- a/ggml/src/ggml-cuda/quantize.cu
+++ b/ggml/src/ggml-cuda/quantize.cu
@@ -235,7 +235,7 @@ static __global__ void quantize_mmq_q8_1(
      q.z = roundf(xi.z*d_inv);
      q.w = roundf(xi.w*d_inv);
  
-    // Write back 4 int8 values as a single 32 bit value for better memroy bandwidth:
+    // Write back 4 int8 values as a single 32 bit value for better memory bandwidth:
      char4 * yqs4 = (char4 *) y[ib].qs;
      yqs4[iqs/4] = q;
  
diff --git a/ggml/src/ggml-cuda/softmax.cu b/ggml/src/ggml-cuda/softmax.cu

index dc06d06930eff8bc883fe98826a1da5c1054c41f..285c0e9543a2a6a2e61682b7b98f482910bffbb5 100644 (file)
--- a/ggml/src/ggml-cuda/softmax.cu
+++ b/ggml/src/ggml-cuda/softmax.cu
@@ -46,7 +46,7 @@ struct soft_max_params {
  };
  
  // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
-// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
  #ifdef __clang__
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpass-failed"
diff --git a/ggml/src/ggml-cuda/solve_tri.cu b/ggml/src/ggml-cuda/solve_tri.cu

index 177ffc268f126d72b7daad7ce960727e45e28245..07ca33f513b0cf8b4757497bd0399b7d08059ecd 100644 (file)
--- a/ggml/src/ggml-cuda/solve_tri.cu
+++ b/ggml/src/ggml-cuda/solve_tri.cu
@@ -83,7 +83,7 @@ static void solve_tri_f32_cublas(ggml_backend_cuda_context & ctx,
  // ======================
  // When ncols_template == 0 the bounds for the loops in this function are not
  // known and can't be unrolled. As we want to keep pragma unroll for all other
-// cases we supress the clang transformation warning here.
+// cases we suppress the clang transformation warning here.
  #ifdef __clang__
  #    pragma clang diagnostic push
  #    pragma clang diagnostic ignored "-Wpass-failed"
diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp

index 7a44443a8a3e0cae98827f07ca27f295bc45e141..3006e217796ff01e2627d47bf1eb7473880d5fb5 100644 (file)
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -139,7 +139,7 @@ struct ggml_hexagon_session {
  };
  
  void ggml_hexagon_session::enqueue(struct htp_general_req &req, struct dspqueue_buffer *bufs, uint32_t n_bufs, bool sync) {
-    // Bump pending flag (cleared in the session::flush once we get the responce)
+    // Bump pending flag (cleared in the session::flush once we get the response)
      this->op_pending++;  // atomic inc
  
      int err = dspqueue_write(this->queue,
@@ -443,7 +443,7 @@ static void repack_row_q4x4x2(uint8_t * y, const block_q4_0 * x, int64_t k) {
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Repack the scales
          ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -503,7 +503,7 @@ static void unpack_row_q4x4x2(block_q4_0 * x, const uint8_t * y, int64_t k) {
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -552,7 +552,7 @@ static void init_row_q4x4x2(block_q4_0 * x, int64_t k) {
  
      // Init the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          x[i * 8 + 0].d = 0;
@@ -770,7 +770,7 @@ static void repack_row_q8x4x2(uint8_t * y, const block_q8_0 * x, int64_t k) {
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Repack the scales
          ggml_half * d = (ggml_half *) (y_d + i * dblk_size);
@@ -829,7 +829,7 @@ static void unpack_row_q8x4x2(block_q8_0 * x, const uint8_t * y, int64_t k) {
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          const ggml_half * d = (const ggml_half *) (y_d + i * dblk_size);
@@ -878,7 +878,7 @@ static void init_row_q8x4x2(block_q8_0 * x, int64_t k) {
  
      // Init the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_Q8_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          x[i * 8 + 0].d = 0;
@@ -1120,7 +1120,7 @@ static void repack_row_mxfp4x4x2(uint8_t * y, const block_mxfp4 * x, int64_t k)
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Repack the scales
          uint8_t * e = (uint8_t *) (y_e + i * eblk_size);
@@ -1180,7 +1180,7 @@ static void unpack_row_mxfp4x4x2(block_mxfp4 * x, const uint8_t * y, int64_t k)
  
      // Repack the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4_0x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          const uint8_t * e = (const uint8_t *) (y_e + i * eblk_size);
@@ -1229,7 +1229,7 @@ static void init_row_mxfp4x4x2(block_mxfp4 * x, int64_t k) {
  
      // Init the scales
      // Note: Do not combine with the loop above. For tensor sizes not multiple of 256 (QK_MXFP4x4x2)
-    // the last block is truncated and overriden by the scales.
+    // the last block is truncated and overridden by the scales.
      for (int i = 0; i < nb; i++) {
          // Unpack the scales
          x[i * 8 + 0].e = 0;
@@ -2670,7 +2670,7 @@ static std::vector<int> ggml_hexagon_graph_optimize_reorder(const std::vector<no
      // The main goal here is to stack the MUL_MAT ops with the same src1 input.
      // This allows use to reuse dynamically quantized src1 in VTCM.
  
-    // TODO: the current version might do incorrect reodering in cases where quantized src0
+    // TODO: the current version might do incorrect reordering in cases where quantized src0
      //       input is an output of another Op.
  
      for (int i0 = 0; i0 < n; i0++) {
diff --git a/ggml/src/ggml-hexagon/htp-drv.cpp b/ggml/src/ggml-hexagon/htp-drv.cpp

index 2530bb06d6cc352020a4da3e0d24e0081800e60d..4c376b5fc9187b2263be4977028f3776f0c6ac14 100644 (file)
--- a/ggml/src/ggml-hexagon/htp-drv.cpp
+++ b/ggml/src/ggml-hexagon/htp-drv.cpp
@@ -282,7 +282,7 @@ static std::string get_driver_path() {
      // Replace \SystemRoot with an absolute path from system ENV windir
      const std::wstring systemRootEnv = L"windir";
  
-    // Query the number of wide charactors this variable requires
+    // Query the number of wide characters this variable requires
      DWORD numWords = GetEnvironmentVariableW(systemRootEnv.c_str(), NULL, 0);
      if (numWords == 0) {
          GGML_LOG_ERROR("ggml-hex: Failed get systemRoot environment variable\n");
diff --git a/ggml/src/ggml-hexagon/htp/hvx-inverse.h b/ggml/src/ggml-hexagon/htp/hvx-inverse.h

index 49f3efabbccd6e89f984dd63ab3abfd0c6c0e2ae..53db94aae2bf764f3f85b91635d169a9ebef8185 100644 (file)
--- a/ggml/src/ggml-hexagon/htp/hvx-inverse.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-inverse.h
@@ -67,7 +67,7 @@ static inline HVX_Vector hvx_vec_inverse_f16(HVX_Vector vals) {
  
      HVX_Vector vcl0 = Q6_Vuh_vcl0_Vuh(rm);  //count leading zeros
  
-    // Get mantissa for 16-bit represenation
+    // Get mantissa for 16-bit representation
      HVX_Vector mant_recip = Q6_V_vand_VV(Q6_Vh_vasr_VhR(Q6_Vh_vasl_VhVh(rm, vcl0), 5), Q6_Vh_vsplat_R(0x03FF));
  
      //Compute Reciprocal Exponent
diff --git a/ggml/src/ggml-hexagon/htp/rope-ops.c b/ggml/src/ggml-hexagon/htp/rope-ops.c

index aa6a6c9008d1ef45d3d3069150ff3fe5f74caefc..9aeb80d0b8b7ae0fdcda4a628b4e226034a4ea3e 100644 (file)
--- a/ggml/src/ggml-hexagon/htp/rope-ops.c
+++ b/ggml/src/ggml-hexagon/htp/rope-ops.c
@@ -18,7 +18,7 @@
  #include "htp-msg.h"
  #include "htp-ops.h"
  
-// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we cant include ggml.h
+// Redefined the types GGML_ROPE_TYPE_NORMAL & GGML_ROPE_TYPE_NEOX as we can't include ggml.h
  #define HTP_ROPE_TYPE_NORMAL 0
  #define HTP_ROPE_TYPE_NEOX   2
  
diff --git a/ggml/src/ggml-hexagon/htp/worker-pool.c b/ggml/src/ggml-hexagon/htp/worker-pool.c

index 894815f46a548b386933b59b5818bfda235c38f0..172e28908ebb09e76b7ef2d0c018340f63bc9732 100644 (file)
--- a/ggml/src/ggml-hexagon/htp/worker-pool.c
+++ b/ggml/src/ggml-hexagon/htp/worker-pool.c
@@ -56,7 +56,7 @@ static void worker_pool_main(void * context) {
          unsigned int n = atomic_load(&pool->n_jobs);
          unsigned int i = atomic_fetch_add(&pool->next_job, 1);
          if (i >= n) {
-            // Spurios wakeup
+            // Spurious wakeup
              continue;
          }
  
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m

index 3db7f12629181ef1f63c6f03e49265aa608ae9b3..4cce414abfef44c1e75c48e4ddf0cf591301af5c 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -1281,7 +1281,7 @@ struct ggml_metal_buffer {
      bool use_residency_sets;
  
      // optional MTLResidencySet
-    // note: cannot use explicity "id<MTLResidencySet>" here because it is not available on certain OSes
+    // note: cannot use explicitly "id<MTLResidencySet>" here because it is not available on certain OSes
      id rset;
  
      // pointers to global device
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp

index 3d5db0b79f58525b5efb2f8be4c87da312e258a4..b3390352ffcfbf3bd335efb607d5092ef2d3534a 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -631,7 +631,7 @@ int ggml_metal_op_acc(ggml_metal_op_t ctx, int idx) {
      const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
  
      if (!inplace) {
-        // run a separete kernel to cpy src->dst
+        // run a separate kernel to cpy src->dst
          // not sure how to avoid this
          // TODO: make a simpler cpy_bytes kernel
  
@@ -1644,7 +1644,7 @@ int ggml_metal_op_set(ggml_metal_op_t ctx, int idx) {
      const bool inplace = (bool) ((const int32_t *) op->op_params)[4];
  
      if (!inplace) {
-        // run a separete kernel to cpy src->dst
+        // run a separate kernel to cpy src->dst
          // not sure how to avoid this
          // TODO: make a simpler cpy_bytes kernel
  
@@ -2005,7 +2005,7 @@ int ggml_metal_op_mul_mat(ggml_metal_op_t ctx, int idx) {
          const int16_t r0ptg  = nypsg*nsg;         // num src0 rows per threadgroup
                int16_t r1ptg  = 4;                 // num src1 rows per threadgroup
  
-        // note: not sure how optimal are those across all different hardware. there might be someting cleverer
+        // note: not sure how optimal are those across all different hardware. there might be something cleverer
          switch (ne11) {
              case 2:
                  r1ptg = 2; break;
diff --git a/ggml/src/ggml-metal/ggml-metal.cpp b/ggml/src/ggml-metal/ggml-metal.cpp

index 1c705362fb7a01dd393db0000c331209d27d20a5..9382ce53b36322e52920ee7952f406657c3a8613 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal.cpp
+++ b/ggml/src/ggml-metal/ggml-metal.cpp
@@ -14,7 +14,7 @@
  #define GGML_METAL_MAX_DEVICES 16
  
  // number of Metal devices
-// note: can be overriden with GGML_METAL_DEVICES env to simulate virtual devices
+// note: can be overridden with GGML_METAL_DEVICES env to simulate virtual devices
  static int g_devices = 1;
  
  ////////////////////////////////////////////////////////////////////////////////
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal

index 6c349aa0c9259d4169a3c2236726b45d73446a6b..a58e641ad86b834622366f1d329765d924e50394 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -4218,7 +4218,7 @@ kernel void kernel_im2col(
  template [[host_name("kernel_im2col_f32")]] kernel im2col_t kernel_im2col<float>;
  template [[host_name("kernel_im2col_f16")]] kernel im2col_t kernel_im2col<half>;
  
-// TODO: obolete -- remove
+// TODO: obsolete -- remove
  //typedef void (im2col_ext_t)(
  //        constant ggml_metal_kargs_im2col & args,
  //        device const float * x,
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp

index a4403a5c2736cf0cf6a2f88c14bab719d72d97ef..7af032ce0e177a07ee6e09a0345605b050743b3e 100644 (file)
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -313,7 +313,7 @@ struct ProfilingInfo {
      cl_ulong cmd_duration_ns;
      // The time for the kernel to complete - COMPLETE - END
      cl_ulong cmd_complete_duration_ns;
-    // Total time to finish the kernel - COMPELTE - QUEUED
+    // Total time to finish the kernel - COMPLETE - QUEUED
      cl_ulong cmd_total_duration_ns;
      // Global and local work sizes.
      size_t global_size[3];
@@ -2555,7 +2555,7 @@ static std::vector<ggml_backend_device> ggml_opencl_probe_devices(ggml_backend_r
  
      cl_platform_id platform_ids[NPLAT];
      if (clGetPlatformIDs(NPLAT, platform_ids, &n_platforms) != CL_SUCCESS) {
-        GGML_LOG_ERROR("ggml_opencl: plaform IDs not available.\n");
+        GGML_LOG_ERROR("ggml_opencl: platform IDs not available.\n");
          return found_devices;
      }
  
@@ -3339,7 +3339,7 @@ static void ggml_backend_opencl_synchronize(ggml_backend_t backend) {
      CL_CHECK(clReleaseEvent(evt));
  }
  
-// Syncronizes the 'backend_ctx's device with others so that commands
+// Synchronizes the 'backend_ctx's device with others so that commands
  // enqueued to it won't start until commands in the other devices have
  // completed.
  static void sync_with_other_backends(ggml_backend_opencl_context * backend_ctx) {
@@ -3997,7 +3997,7 @@ struct ggml_backend_opencl_buffer_context {
  
      // The buffer_context is initially created by ggml_backend_buft_alloc_buffer
      // before any tensor is initialized (at the beginning of alloc_tensor_range).
-    // Hence, there is alway a buffer object in this vector. When each tensor is
+    // Hence, there is always a buffer object in this vector. When each tensor is
      // being initialized, this original buffer object will be released if both
      // flattening and small allocation are enabled, and additional buffer
      // objects will be created in init_tensor to represent flattened quantized
@@ -4132,7 +4132,7 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
          //GGML_ASSERT(offset == 0);
  
          // We create subbuffers from the original tensor buffer for scales and
-        // quants - i.e., scales and quants are aliases into the buffer obejct
+        // quants - i.e., scales and quants are aliases into the buffer object
          // that backs the original tensor. This is a cleaner way to adapt to the
          // new memory management.
          // In the old code, we allocate new buffers for scales and quants
diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp

index 519638fd416cd9bdc7007059171338504d06ddb4..04c9e1d786452f68c33ebfd8554c41114a997bf7 100644 (file)
--- a/ggml/src/ggml-sycl/common.hpp
+++ b/ggml/src/ggml-sycl/common.hpp
@@ -76,10 +76,10 @@ extern int g_ggml_sycl_prioritize_dmmv;
  
  
  #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define VER_4VEC 610 // todo for hardward optimize.
-#define VER_GEN9 700 // todo for hardward optimize.
-#define VER_GEN12 1000000 // todo for hardward optimize.
-#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardward optimize.
+#define VER_4VEC 610 // todo for hardware optimize.
+#define VER_GEN9 700 // todo for hardware optimize.
+#define VER_GEN12 1000000 // todo for hardware optimize.
+#define VER_GEN13 (VER_GEN12 + 1030) // todo for hardware optimize.
  
  #define GGML_SYCL_MAX_NODES 8192 // TODO: adapt to hardwares
  
diff --git a/ggml/src/ggml-sycl/quants.hpp b/ggml/src/ggml-sycl/quants.hpp

index d0d5ac9a4e8027370d99f046003883a4d8b54d31..14490fea5be546ce675d499c30e8f91ec50f04dd 100644 (file)
--- a/ggml/src/ggml-sycl/quants.hpp
+++ b/ggml/src/ggml-sycl/quants.hpp
@@ -29,7 +29,7 @@ namespace ggml_sycl_reordered {
  // [qs0, qs1, qs2, ..., qsN]  [d0, d1, d2, ..., dN]
  //
  // Notes: out-of-bounds qs will run into d values
-// Aligment relies on the allocated size of qs
+// Alignment relies on the allocated size of qs
  
  template <ggml_type type> struct block_q_t;
  
diff --git a/ggml/src/ggml-sycl/softmax.cpp b/ggml/src/ggml-sycl/softmax.cpp

index b41124acc1399e655c112cec449f80bc60cb99cd..15d92e5e04cd459288b3ea61a2cd333a40a56b0f 100644 (file)
--- a/ggml/src/ggml-sycl/softmax.cpp
+++ b/ggml/src/ggml-sycl/softmax.cpp
@@ -37,7 +37,7 @@ struct soft_max_params {
  };
  
  // When ncols_template == 0 the bounds for the loops in this function are not known and can't be unrolled.
-// As we want to keep pragma unroll for all other cases we supress the clang transformation warning here.
+// As we want to keep pragma unroll for all other cases we suppress the clang transformation warning here.
  #ifdef __clang__
  #pragma clang diagnostic push
  #pragma clang diagnostic ignored "-Wpass-failed"
diff --git a/ggml/src/ggml-vulkan/CMakeLists.txt b/ggml/src/ggml-vulkan/CMakeLists.txt

index de01336cd3fd20f9066f5e4e7f67fe7256b2fbab..715a263a6d092c16e96dd5963238fc766a1e87e4 100644 (file)
--- a/ggml/src/ggml-vulkan/CMakeLists.txt
+++ b/ggml/src/ggml-vulkan/CMakeLists.txt
@@ -90,7 +90,7 @@ if (Vulkan_FOUND)
      target_include_directories(ggml-vulkan PRIVATE ${CMAKE_CURRENT_BINARY_DIR})
  
      # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
-    # Posssibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
+    # Possibly relevant: https://stackoverflow.com/questions/74748276/visual-studio-no-displays-the-correct-length-of-stdvector
      if (MSVC AND CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
          add_compile_definitions(_ITERATOR_DEBUG_LEVEL=0)
      endif()
diff --git a/gguf-py/gguf/metadata.py b/gguf-py/gguf/metadata.py

index e0d478ce95dd2b6c6444b7669ea2f66b144435f9..e954644e28f9b0a75122536a54ccb35788ca584c 100644 (file)
--- a/gguf-py/gguf/metadata.py
+++ b/gguf-py/gguf/metadata.py
@@ -186,7 +186,7 @@ class Metadata:
          # Quick hack to fix the Norway problem
          # https://hitchdev.com/strictyaml/why/implicit-typing-removed/
          yaml_content = yaml_content.replace("- no\n", "- \"no\"\n")
-        # yaml should use 2 spaces insted of tab
+        # yaml should use 2 spaces instead of tab
          # this issue has came up with the Qwen/Qwen3-235B-A22B-Instruct-2507 model card
          #    (I've also sent a pr tp fix the modelcard too)
          yaml_content = yaml_content.replace("\t", "  ")
diff --git a/gguf-py/tests/test_metadata.py b/gguf-py/tests/test_metadata.py

index 40d484f4eaa9d0c1270a5b70dc2e5153c6ed8d9f..b77c563ff25efddbd958744cf7fbc743a495a083 100755 (executable)
--- a/gguf-py/tests/test_metadata.py
+++ b/gguf-py/tests/test_metadata.py
@@ -164,7 +164,7 @@ class TestMetadataMethod(unittest.TestCase):
          self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B"),
                           ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration-LoRA', None, '8B'))
  
-        # Negative size --> output is a LoRA adaper --> prune "LoRA" out of the name to avoid redundancy with the suffix
+        # Negative size --> output is a LoRA adapter --> prune "LoRA" out of the name to avoid redundancy with the suffix
          self.assertEqual(gguf.Metadata.get_model_id_components("Llama-3-Instruct-abliteration-LoRA-8B", -1234),
                           ('Llama-3-Instruct-abliteration-LoRA-8B', None, 'Llama-3', 'Instruct-abliteration', None, '8B'))
  
diff --git a/include/llama.h b/include/llama.h

index 077f66dc651fc664034c809c6cc9391a89c69e3d..a84d56a885024abbf82c8c8a7e76896a48e8b3f3 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -973,7 +973,7 @@ extern "C" {
  
      // Logits for the ith token. For positive indices, Equivalent to:
      // llama_get_logits(ctx) + ctx->output_ids[i]*n_vocab
-    // Negative indicies can be used to access logits in reverse order, -1 is the last logit.
+    // Negative indices can be used to access logits in reverse order, -1 is the last logit.
      // returns NULL for invalid ids.
      LLAMA_API float * llama_get_logits_ith(struct llama_context * ctx, int32_t i);
  
@@ -988,7 +988,7 @@ extern "C" {
  
      // Get the embeddings for the ith token. For positive indices, Equivalent to:
      // llama_get_embeddings(ctx) + ctx->output_ids[i]*n_embd
-    // Negative indicies can be used to access embeddings in reverse order, -1 is the last embedding.
+    // Negative indices can be used to access embeddings in reverse order, -1 is the last embedding.
      // shape: [n_embd] (1-dimensional)
      // returns NULL for invalid ids.
      LLAMA_API float * llama_get_embeddings_ith(struct llama_context * ctx, int32_t i);
@@ -1008,9 +1008,9 @@ extern "C" {
      // Returns LLAMA_TOKEN_NULL if no token was sampled.
      LLAMA_API llama_token llama_get_sampled_token_ith(struct llama_context * ctx, int32_t i);
  
-    // Get the backend sampled probabilites for the ith token
+    // Get the backend sampled probabilities for the ith token
      // The index matches llama_get_sampled_token_ith().
-    // Returns NULL if no probabilites were generated.
+    // Returns NULL if no probabilities were generated.
      LLAMA_API float *  llama_get_sampled_probs_ith      (struct llama_context * ctx, int32_t i);
      LLAMA_API uint32_t llama_get_sampled_probs_count_ith(struct llama_context * ctx, int32_t i);
  
@@ -1337,7 +1337,7 @@ extern "C" {
                                 float   tau,
                                 float   eta);
  
-    /// @details Intializes a GBNF grammar, see grammars/README.md for details.
+    /// @details Initializes a GBNF grammar, see grammars/README.md for details.
      /// @param vocab The vocabulary that this grammar will be used with.
      /// @param grammar_str The production rules for the grammar, encoded as a string. Returns an empty grammar if empty. Returns NULL if parsing of grammar_str fails.
      /// @param grammar_root The name of the start symbol for the grammar.
diff --git a/scripts/pr2wt.sh b/scripts/pr2wt.sh

index 067f5d466b69de4fde5798563696d3f318db54c9..e028814c43154453631efef3f40caa8c2b4b2ea3 100755 (executable)
--- a/scripts/pr2wt.sh
+++ b/scripts/pr2wt.sh
@@ -1,6 +1,6 @@
  #!/usr/bin/env bash
  
-# intialize a new worktree from a PR number:
+# initialize a new worktree from a PR number:
  #
  # - creates a new remote using the fork's clone URL
  # - creates a local branch tracking the remote branch
diff --git a/scripts/server-bench.py b/scripts/server-bench.py

index dbbb0939ffef9058b7e9df51a0a3bd378811bda9..202c35a486bbbbdd8def283f3e0d7543180a5430 100755 (executable)
--- a/scripts/server-bench.py
+++ b/scripts/server-bench.py
@@ -292,6 +292,6 @@ if __name__ == "__main__":
          "--n_predict_min", type=int, default=1024,
          help="Min. number of tokens to predict per prompt (supported for synthetic prompts only)")
      parser.add_argument("--seed_offset", type=int, default=0, help="Offset for determining the seeds for pseudorandom prompt/generation lengths. "
-                        "Corelations between seeds can occur when set >= 1000. Negative values mean no seed.")
+                        "Correlations between seeds can occur when set >= 1000. Negative values mean no seed.")
      args = parser.parse_args()
      benchmark(**vars(args))
diff --git a/src/llama-context.cpp b/src/llama-context.cpp

index 98d055d34ef70a8b86aa8cec867a922a9f442328..eee9021296ec03c31ccac9ced6776cee34d82842 100644 (file)
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -158,7 +158,7 @@ llama_context::llama_context(
      cparams.op_offload = params.op_offload;
      cparams.kv_unified = params.kv_unified;
  
-    // intialized later
+    // initialized later
      cparams.pipeline_parallel = false;
  
      {
@@ -1981,7 +1981,7 @@ ggml_cgraph * llama_context::graph_reserve(
  
      ggml_backend_sched_reset(sched.get());
  
-    // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
+    // when the scheduler is reset, we cannot reuse the old graph, so we reset the previous graph result to prevent that
      gf_res_prev->reset();
  
      // store the n_outputs as it is, and restore it afterwards
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp

index 23a86ea2905fba1fb26354c159d0221bb8b99ff2..b8126ce50817d49512ab070b43f112985071e23f 100644 (file)
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1616,7 +1616,7 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
  ggml_tensor * llm_graph_context::build_inp_out_ids() const {
      // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
      //       but this would make the graph topology depend on the number of output tokens, which can interere with
-    //       features that require constant topology such as pipline parallelism
+    //       features that require constant topology such as pipeline parallelism
      //       ref: https://github.com/ggml-org/llama.cpp/pull/14275#issuecomment-2987424471
      //if (n_outputs < n_tokens) {
      //    return nullptr;
@@ -1779,7 +1779,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
          if (v_mla) {
  #if 0
              // v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
-            // However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
+            // However, the code is optimized for dimensions 0 and 1 being large, so this is inefficient.
              cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
              cur = ggml_mul_mat(ctx0, v_mla, cur);
  #else
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp

index 6b668ee9abdd683967a505a9f613a2440b551949..4031bafe9ecf101c858e57ae7cf5ea339ac604d2 100644 (file)
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -583,7 +583,7 @@ llama_kv_cache::slot_info_vec_t llama_kv_cache::prepare(const std::vector<llama_
              break;
          }
  
-        // remeber the position that we found
+        // remember the position that we found
          res.push_back(sinfo_new);
  
          // store the old state of the cells in the recovery stack
@@ -1293,7 +1293,7 @@ static void set_input_kq_mask_impl(const args_set_input_kq_mask & args, float *
      }
  
      for (uint32_t s = 0; s < n_stream; ++s) {
-        // bookeeping of the KQ mask cells that could change for other tokens of the same sequence
+        // bookkeeping of the KQ mask cells that could change for other tokens of the same sequence
          std::unordered_map<llama_seq_id, uint32_t>              seq_srct;
          std::unordered_map<llama_seq_id, std::vector<uint32_t>> seq_idxs;
  
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index dabf3b3086ef8827a09615532a090fbdd8d6d9c7..60b7cc6946f640ee1482a4782c952852b8aaf4e2 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1524,7 +1524,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                  }
  
                  switch (hparams.n_layer) {
-                    // TODO: Jamba layers are a bit heterogenous, so naming this is hard.
+                    // TODO: Jamba layers are a bit heterogeneous, so naming this is hard.
                      case 12: // 900M  8x???M
                      case 32: // 51B  16x?B
                      default: type = LLM_TYPE_UNKNOWN;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index 194eed238ec07dfb6a1403ac5a67639fb1864680..ce83361dc79a96f4a9f3e4a1c247fc2192be23f5 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1833,7 +1833,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
                  precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
  #if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-                // correct endiannes of data in precompiled_charsmap binary blob
+                // correct endianness of data in precompiled_charsmap binary blob
                  uint32_t * xcda_blob_size = (uint32_t *) &precompiled_charsmap[0];
                  *xcda_blob_size = __builtin_bswap32(*xcda_blob_size);
                  assert(*xcda_blob_size + sizeof(uint32_t) < n_precompiled_charsmap);
diff --git a/src/models/deepseek2.cpp b/src/models/deepseek2.cpp

index b608396e50ea73a2806ef4dc1c5d39e23136842e..be81709c50ba81abe3dc0726e18ab9cff2fee67b 100644 (file)
--- a/src/models/deepseek2.cpp
+++ b/src/models/deepseek2.cpp
@@ -146,7 +146,7 @@ llm_build_deepseek2::llm_build_deepseek2(const llama_model & model, const llm_gr
                      cb(Qcur, "Qcur_attn_temp_scaled", il);
                  }
  
-                // note: MLA with the absorption optimzation converts into MQA (ie: GQA with 1 group)
+                // note: MLA with the absorption optimization converts into MQA (ie: GQA with 1 group)
                  cur = build_attn(inp_attn_k,
                          model.layers[il].wo, NULL,
                          Qcur, Kcur, Vcur, nullptr, nullptr, model.layers[il].wv_b, kq_scale, il);
diff --git a/src/models/models.h b/src/models/models.h

index 0712d03d8d96b384a36a81705f5a6f6301873625..cf9ba04e7f7c148222846ccce3b696aa0eef2917 100644 (file)
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -3,7 +3,7 @@
  #include "llama-model.h"
  #include "llama-graph.h"
  
-// note: almost all graphs require atleast sqrtf, so include cmath globally
+// note: almost all graphs require at least sqrtf, so include cmath globally
  #include <cmath>
  
  //
diff --git a/src/unicode.cpp b/src/unicode.cpp

index 1475b53b6597400235204dda6c01b3f07aa494ec..122c8ca04a511f52eb03b5d51d2d772104ff42c5 100644 (file)
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -773,7 +773,7 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
          // tiny_aya digit grouping pattern from tokenizer.json:
          //   {"type": "Split", "pattern": {"Regex": "\\d{1,3}(?=(?:\\d{3})*\\b)"}, "behavior": "Isolated"}
          // Splits digits into groups of 3 from the right (e.g., 1234567 -> 1, 234, 567)
-        // TODO: Revisit this regex, incase there are any subtle tokenization differences with the original regex.
+        // TODO: Revisit this regex, in case there are any subtle tokenization differences with the original regex.
          bpe_offsets = unicode_regex_split_custom_afmoe(text, offsets);
      }
  
diff --git a/tests/test-alloc.cpp b/tests/test-alloc.cpp

index 95e09c97b02e31aec2923626a8a3e25d04018cd0..7ae739ad2eff290ba8684bb104356e510db12c80 100644 (file)
--- a/tests/test-alloc.cpp
+++ b/tests/test-alloc.cpp
@@ -285,7 +285,7 @@ static void test_max_size_too_many_tensors() {
      GGML_ASSERT(backend.context->allocated_total() <= 16 + 16);
  }
  
-// Scenario where there is some space left in the first buffer, but not enough to accomodate
+// Scenario where there is some space left in the first buffer, but not enough to accommodate
  // a larger tensor, so a second buffer is required
  static void test_max_size_tensor_too_large() {
      dummy_backend backend      = dummy_backend_init(32);
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 0ac21cdcf62c58bc0259c78d3dcca3e4fc5c4aac..7c6938d447bf6fd2b9aaf0df957dcc73b3e59619 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -1868,9 +1868,9 @@ struct test_case {
  };
  
  
-// ###################################
-// ## Section 2: GGML Op Defintions ##
-// ###################################
+// ####################################
+// ## Section 2: GGML Op Definitions ##
+// ####################################
  
  
  // The following is an example showing the bare minimum for creating a test for a GGML op.
@@ -6222,7 +6222,7 @@ struct test_flash_attn_ext : public test_case {
      void initialize_tensors(ggml_context * ctx) override {
          for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
              if (strcmp(t->name, "s") == 0) {
-                // make the sink values more noticable in order to trigger a test failure when the implementation is wrong
+                // make the sink values more noticeable in order to trigger a test failure when the implementation is wrong
                  init_tensor_uniform(t, -10.0f, 10.0f);
              } else if (strcmp(t->name, "m") == 0) {
                  init_tensor_kq_mask(t);
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp

index f3d19118b584760f5082364b22decfb352b1868e..46aec8395fa66e6d33f72e1be976778b82ff111f 100644 (file)
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -438,7 +438,7 @@ static void test_templates(const struct common_chat_templates * tmpls, const std
  }
  
  /**
- * Test if streaming=true is consistant with streaming=false for given partial parser
+ * Test if streaming=true is consistent with streaming=false for given partial parser
   * Also test if there is any problem with partial message
   */
  template <typename T>
diff --git a/tools/completion/README.md b/tools/completion/README.md

index bcc088765928cc47b6a63cf508fd03be9d7a0eb5..f868c2c7d7d8e07169b7a7fe335f72232908a231 100644 (file)
--- a/tools/completion/README.md
+++ b/tools/completion/README.md
@@ -480,7 +480,7 @@ Example usage: `--mirostat 2 --mirostat-lr 0.05 --mirostat-ent 3.0`
  
  Exclude Top Choices (XTC) is a unique sampler that is designed to remove top tokens from consideration and avoid more obvious and repetitive outputs. With a chance of `xtc-probability` it searches for tokens with probabilities of `xtc-threshold` and above, then removes all such tokens except the least probable one.
  
-By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
+By removing top tokens XTC can improve the variety of answers, break writing clichés and inhibit repetition, since clichés and repeated phrases are usually more likely to appear. By keeping the last token above the threshold, XTC ensures that the answer is still coherent. XTC is meant to be used for creative tasks, but feel free to experiment with different settings for different models.
  
  Being experimental and unique, XTC is disabled by default. The recommended combination of samplers is Min-P followed by XTC on its default settings: `--sampling-seq mx --min-p 0.02 --xtc-probability 0.5`.
  
@@ -531,7 +531,7 @@ These options help improve the performance and memory usage of the LLaMA models.
  
  ### NUMA support
  
--   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
+-   `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
  -   `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
  -   `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
  
diff --git a/tools/cvector-generator/cvector-generator.cpp b/tools/cvector-generator/cvector-generator.cpp

index 4c8ca61ec4f91e232b3bce030f0f47b6155fb998..dcce0e98418b320ce1032d4115b3f0ae78a27da4 100644 (file)
--- a/tools/cvector-generator/cvector-generator.cpp
+++ b/tools/cvector-generator/cvector-generator.cpp
@@ -110,7 +110,7 @@ struct callback_data {
              auto diff_filtered = filter_nonzero_rows(v_pos[il]);
              v_diff_filtered.push_back(diff_filtered);
          }
-        return v_diff_filtered; // for convinient, we return the result std::vector
+        return v_diff_filtered; // for convenience, we return the result std::vector
      }
  
      // delete zero rows from a given 2D tensor
diff --git a/tools/imatrix/README.md b/tools/imatrix/README.md

index 4505cb4ce8c7d2f6e211098b7cfcb90562b7fb94..4cbe4fd0cf75bac1724be5ced6bf07ea6fbb17e8 100644 (file)
--- a/tools/imatrix/README.md
+++ b/tools/imatrix/README.md
@@ -95,4 +95,4 @@ Weighted averages of Σ(Act²), ZD Score and CosSim are also calculated.
  #### Important note on the computed Statistics
  
  When using these statistics, please note that they are computed on the squared activations, **not on the actual (raw) activations**.
-Whilst the results are still useful, they're less realiable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
+Whilst the results are still useful, they're less reliable than using the raw values, and in the case of the cosine similarity, could be misleading if the tensor contains opposite vectors.
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h

index a30c32ed42bf7962c5bdee1dd37b3e5cf954ade5..0c3cf8670a42a5bb537572522fb5ee9d56814a8a 100644 (file)
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -68,7 +68,7 @@
  
  #define TN_POS_EMBD        "%s.position_embd.weight"
  #define TN_CLASS_EMBD      "v.class_embd"
-#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backwrad compat
+#define TN_PATCH_EMBD      "v.patch_embd.weight"  // not rename tensor with ".0" postfix for backward compat
  #define TN_PATCH_EMBD_1    "v.patch_embd.weight.1"
  #define TN_PATCH_BIAS      "v.patch_embd.bias"
  #define TN_NORM_EMBD       "v.norm_embd.%s"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h

index e0eb9b32c8f917cbab66d0cb6d009e1633cc50e4..eeb8da58e0829160c1e36ac00266a5977bdc5805 100644 (file)
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -46,7 +46,7 @@ struct clip_hparams {
      float image_std[3];
  
      // for models using dynamic image size, we need to have a smaller image size to warmup
-    // otherwise, user will get OOM everytime they load the model
+    // otherwise, user will get OOM every time they load the model
      int32_t warmup_image_size = 0;
      int32_t warmup_audio_size = 3000;
  
@@ -221,7 +221,7 @@ struct clip_model {
      // embeddings
      ggml_tensor * class_embedding = nullptr;
      ggml_tensor * patch_embeddings_0 = nullptr;
-    ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL)
+    ggml_tensor * patch_embeddings_1 = nullptr;  // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL)
      ggml_tensor * patch_bias = nullptr;
      ggml_tensor * position_embeddings = nullptr;
      ggml_tensor * norm_embd_w = nullptr;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index 607d4b837318675243869b7c6ab0d87e152b1c17..b70bad33b686e708cdda01cfed3a52be1726da21 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -2287,7 +2287,7 @@ static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32
      }
  }
  
-// set of tools to manupulate images
+// set of tools to manipulate images
  // in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
  struct img_tool {
      enum resize_algo {
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py

index 944037e703eb11100771d3fedf419cef037b97f5..1f563fbfc5998c3eefb9b5f253f35637cfb8024e 100644 (file)
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -186,7 +186,7 @@ def trunc_normal_tf_(
      best when :math:`a \\leq \text{mean} \\leq b`.
      NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
      bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
-    and the result is subsquently scaled and shifted by the mean and std args.
+    and the result is subsequently scaled and shifted by the mean and std args.
      Args:
          tensor: an n-dimensional `torch.Tensor`
          mean: the mean of the normal distribution
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp

index e8eef035ff57de528441f9632e1ce552958a7263..447f61aaa40b3d1126aeda4aac7d6a965f1661c3 100644 (file)
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -560,7 +560,7 @@ bool mtmd_audio_preprocessor_whisper::preprocess(const float *                 s
      for (size_t off = 0; off < (size_t) out_full.n_len; off += frames_per_chunk) {
          int n_len = std::min(frames_per_chunk, (size_t) out_full.n_len - off);
          if ((size_t) n_len < frames_per_chunk) {
-            break;  // last uncomplete chunk will always be a padded chunk, safe to ignore
+            break;  // last incomplete chunk will always be a padded chunk, safe to ignore
          }
  
          mtmd_audio_mel out_chunk;
diff --git a/tools/perplexity/README.md b/tools/perplexity/README.md

index eb3846072ea62c0035755f09b44f6e01e71b2816..f82d34c8a251f0c94a438081edbfdce06dca6228 100644 (file)
--- a/tools/perplexity/README.md
+++ b/tools/perplexity/README.md
@@ -27,10 +27,10 @@ In addition to the KL divergence the following statistics are calculated with `-
  * Ratio of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated. The logarithm of this metric is also calculated and printed, it is 0 if the logit distributions are the same.
  * Difference of mean FP16 PPL and quantized PPL. Uncertainty is estimated on logits, then propagated.
  * Mean change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse.
-* Pearson correlation coefficient of the "correct" token probabilites between models.
+* Pearson correlation coefficient of the "correct" token probabilities between models.
  * Percentiles of change in "correct" token probability. Positive values mean the model gets better at prediction, negative values mean it gets worse. Can be used to judge noise vs. quality loss from quantization. If the percentiles are symmetric then the quantization is essentially just adding noise. If the negative values are significantly larger than the positive values then this indicates that the model is actually becoming worse from the quantization.
  * The root mean square of the change in token probabilities. If you were to assume that the quantization simply causes Gaussian noise on the token probabilities then this would be the standard deviation of said noise. The uncertainty on the value is calculated that the change in token probabilities follows a Gaussian distribution. Related discussion: https://github.com/ggml-org/llama.cpp/discussions/2875 .
-* Same top p: Percentage of how often the token was assigned the highest probabilites by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
+* Same top p: Percentage of how often the token was assigned the highest probabilities by both models. The uncertainty is calculated from the Gaussian approximation of the binomial distribution.
  
  ## LLaMA 3 8b Scoreboard
  
diff --git a/tools/quantize/README.md b/tools/quantize/README.md

index 22f0710286723479d0702732efd20494f0c81a29..b8c225124b33ed3e392e21f229002ba9e1e0f9e2 100644 (file)
--- a/tools/quantize/README.md
+++ b/tools/quantize/README.md
@@ -100,7 +100,7 @@ Examples:
  ## Memory/Disk Requirements
  
  When running the larger models, make sure you have enough disk space to store all the intermediate files.
-As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For exmaple (Llama 3.1):
+As the models are currently fully loaded into memory, you will need adequate disk space to save them and sufficient RAM to load them. At the moment, memory and disk requirements are the same. For example (Llama 3.1):
  
  | Model | Original size | Quantized size (Q4_K_M) |
  | ----: | ------------: | ----------------------: |
diff --git a/tools/server/public_legacy/index-new.html b/tools/server/public_legacy/index-new.html

index e2f39d6687e5643aa762231a08b3023d3275a2ce..2cee7f3c3c4631b2dd2ce2ad2f6e0056686613ea 100644 (file)
--- a/tools/server/public_legacy/index-new.html
+++ b/tools/server/public_legacy/index-new.html
@@ -36,7 +36,7 @@
  
      const params = signal({
        n_predict: 358, // 358 is a nice number
-      temperature: 0.8, // adapt all following parameters to optimized min-p requierements. If for non-english, set to 0.6 or lower
+      temperature: 0.8, // adapt all following parameters to optimized min-p requirements. If for non-english, set to 0.6 or lower
        repeat_last_n: 0, // 0 = disable penalty, -1 = context size
        repeat_penalty: 1.0, // 1.0 = disabled
        dry_multiplier: 0.0, // 0.0 = disabled, 0.8 works well
@@ -108,7 +108,7 @@
      let importedTemplates = local_storage_getDataAsObject('user_templates')
  
      if (importedTemplates) {
-      // saved templates were successfuly imported.
+      // saved templates were successfully imported.
  
        console.log('Processing saved templates and updating default template')
        params.value = { ...params.value, image_data: [] };
@@ -129,7 +129,7 @@
      }
  
      function userTemplateResetToDefault() {
-      console.log('Reseting themplate to default')
+      console.log('Resetting template to default')
        selectedUserTemplate.value.name = 'default';
        selectedUserTemplate.value.data = savedUserTemplates.value['default'];
      }
diff --git a/tools/server/public_simplechat/datautils.mjs b/tools/server/public_simplechat/datautils.mjs

index 75159d6b1676bfeb0ab01014f5d04a8a1f5f0500..08ccc219bfd61263877c6ed7f5f3ce5b0afb532e 100644 (file)
--- a/tools/server/public_simplechat/datautils.mjs
+++ b/tools/server/public_simplechat/datautils.mjs
@@ -63,7 +63,7 @@ export function trim_repeat_garbage_at_end(sIn, maxSubL=10, maxMatchLenThreshold
  
  
  /**
- * Simple minded logic to help remove repeating garbage at end of the string, till it cant.
+ * Simple minded logic to help remove repeating garbage at end of the string, till it can't.
   * If its not able to trim, then it will try to skip a char at end and then trim, a few times.
   * This ensures that even if there are multiple runs of garbage with different patterns, the
   * logic still tries to munch through them.
diff --git a/tools/server/public_simplechat/readme.md b/tools/server/public_simplechat/readme.md

index 24e026d455b03f8691f6733dbed8f5eacbc14af0..cc86d62494cf57a963095c7bb3fb7a6597ec9e6c 100644 (file)
--- a/tools/server/public_simplechat/readme.md
+++ b/tools/server/public_simplechat/readme.md
@@ -30,7 +30,7 @@ The UI follows a responsive web design so that the layout can adapt to available
  enough manner, in general.
  
  Allows developer/end-user to control some of the behaviour by updating gMe members from browser's devel-tool
-console. Parallely some of the directly useful to end-user settings can also be changed using the provided
+console. Parallelly some of the directly useful to end-user settings can also be changed using the provided
  settings ui.
  
  NOTE: Current web service api doesnt expose the model context length directly, so client logic doesnt provide
@@ -38,7 +38,7 @@ any adaptive culling of old messages nor of replacing them with summary of their
  is a optional sliding window based chat logic, which provides a simple minded culling of old messages from
  the chat history before sending to the ai model.
  
-NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionaly stream for now.
+NOTE: Wrt options sent with the request, it mainly sets temperature, max_tokens and optionally stream for now.
  However if someone wants they can update the js file or equivalent member in gMe as needed.
  
  NOTE: One may be able to use this to chat with openai api web-service /chat/completions endpoint, in a very
@@ -88,7 +88,7 @@ Once inside
      then the end user needs to enter the same.
      This keeps the logic simple, while still giving flexibility to the end user to
      manage any templating/tagging requirement wrt their messages to the model.
-  * the logic doesnt insert newline at the begining and end wrt the prompt message generated.
+  * the logic doesnt insert newline at the beginning and end wrt the prompt message generated.
      However if the chat being sent to /completions end point has more than one role's message,
      then insert newline when moving from one role's message to the next role's message, so
      that it can be clearly identified/distinguished.
@@ -101,8 +101,8 @@ Once inside
    Normally Completion mode doesnt need system prompt, while Chat mode can generate better/interesting
    responses with a suitable system prompt.
    * if chat.add_system_begin is used
-    * you cant change the system prompt, after it is has been submitted once along with user query.
-    * you cant set a system prompt, after you have submitted any user query
+    * you can't change the system prompt, after it has been submitted once along with user query.
+    * you can't set a system prompt, after you have submitted any user query
    * if chat.add_system_anytime is used
      * one can change the system prompt any time during chat, by changing the contents of system prompt.
      * inturn the updated/changed system prompt will be inserted into the chat session.
@@ -129,7 +129,7 @@ Once inside
  
  ### Reason behind this
  
-The idea is to be easy enough to use for basic purposes, while also being simple and easily discernable
+The idea is to be easy enough to use for basic purposes, while also being simple and easily discernible
  by developers who may not be from web frontend background (so inturn may not be familiar with template /
  end-use-specific-language-extensions driven flows) so that they can use it to explore/experiment things.
  
@@ -167,7 +167,7 @@ It is attached to the document object. Some of these can also be updated using t
    messages that get inserted into prompt field wrt /Completion endpoint.
  
    bTrimGarbage - whether garbage repeatation at the end of the generated ai response, should be
-  trimmed or left as is. If enabled, it will be trimmed so that it wont be sent back as part of
+  trimmed or left as is. If enabled, it will be trimmed so that it won't be sent back as part of
    subsequent chat history. At the same time the actual trimmed text is shown to the user, once
    when it was generated, so user can check if any useful info/data was there in the response.
  
@@ -244,7 +244,7 @@ full chat history. This way if there is any response with garbage/repeatation, i
  mess with things beyond the next question/request/query, in some ways. The trim garbage
  option also tries to help avoid issues with garbage in the context to an extent.
  
-Set max_tokens to 1024, so that a relatively large previous reponse doesnt eat up the space
+Set max_tokens to 1024, so that a relatively large previous response doesnt eat up the space
  available wrt next query-response. However dont forget that the server when started should
  also be started with a model context size of 1k or more, to be on safe side.
  
diff --git a/tools/server/public_simplechat/simplechat.js b/tools/server/public_simplechat/simplechat.js

index 2fcd24a860bd4d9130d314cf12604ac9b3b60ea0..c67577d5ae781fea2579672afaff71c290506733 100644 (file)
--- a/tools/server/public_simplechat/simplechat.js
+++ b/tools/server/public_simplechat/simplechat.js
@@ -318,7 +318,7 @@ class SimpleChat {
      }
  
      /**
-     * Allow setting of system prompt, but only at begining.
+     * Allow setting of system prompt, but only at beginning.
       * @param {string} sysPrompt
       * @param {string} msgTag
       */
@@ -333,7 +333,7 @@ class SimpleChat {
                      console.error(`ERRR:SimpleChat:SC:${msgTag}:You need to specify system prompt before any user query, ignoring...`);
                  } else {
                      if (this.xchat[0].content !== sysPrompt) {
-                        console.error(`ERRR:SimpleChat:SC:${msgTag}:You cant change system prompt, mid way through, ignoring...`);
+                        console.error(`ERRR:SimpleChat:SC:${msgTag}:You can't change system prompt, mid way through, ignoring...`);
                      }
                  }
              }
diff --git a/tools/server/public_simplechat/ui.mjs b/tools/server/public_simplechat/ui.mjs

index b2d5b9aeab76c0724f9c8fa5238e3ce009aaa877..afa619a0663739c42679c72560400fda9a7708a6 100644 (file)
--- a/tools/server/public_simplechat/ui.mjs
+++ b/tools/server/public_simplechat/ui.mjs
@@ -44,7 +44,7 @@ export function el_create_button(id, callback, name=undefined, innerText=undefin
  }
  
  /**
- * Create a para and set it up. Optionaly append it to a passed parent.
+ * Create a para and set it up. Optionally append it to a passed parent.
   * @param {string} text
   * @param {HTMLElement | undefined} elParent
   * @param {string | undefined} id
@@ -111,7 +111,7 @@ export function el_creatediv_boolbutton(id, label, texts, defaultValue, cb, clas
  /**
   * Create a select ui element, with a set of options to select from.
   * * options: an object which contains name-value pairs
- * * defaultOption: the value whose name should be choosen, by default.
+ * * defaultOption: the value whose name should be chosen, by default.
   * * cb : the call back returns the name string of the option selected.
   *
   * @param {string} id
diff --git a/tools/server/tests/README.md b/tools/server/tests/README.md

index a60d3f8ea1a986e993949e776391712edc8f9f71..f566b43644b11333993c3122aaac309a172fd2b3 100644 (file)
--- a/tools/server/tests/README.md
+++ b/tools/server/tests/README.md
@@ -57,7 +57,7 @@ To run a single test:
  ./tests.sh unit/test_chat_completion.py::test_invalid_chat_completion_req
  ```
  
-Hint: You can compile and run test in single command, useful for local developement:
+Hint: You can compile and run test in single command, useful for local development:
  
  ```shell
  cmake --build build -j --target llama-server && ./tools/server/tests/tests.sh
author	Marcel Petrick <redacted>
	Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)
committer	GitHub <redacted>
	Thu, 5 Mar 2026 07:50:21 +0000 (08:50 +0100)
CONTRIBUTING.md		patch \| blob \| history
common/arg.cpp		patch \| blob \| history
common/common.h		patch \| blob \| history
common/debug.h		patch \| blob \| history
common/jinja/README.md		patch \| blob \| history
convert_hf_to_gguf.py		patch \| blob \| history
docs/backend/CANN.md		patch \| blob \| history
docs/backend/SYCL.md		patch \| blob \| history
docs/backend/snapdragon/README.md		patch \| blob \| history
docs/backend/snapdragon/windows.md		patch \| blob \| history
docs/build.md		patch \| blob \| history
docs/multimodal/MobileVLM.md		patch \| blob \| history
examples/debug/README.md		patch \| blob \| history
examples/diffusion/README.md		patch \| blob \| history
examples/llama.vim		patch \| blob \| history
examples/model-conversion/README.md		patch \| blob \| history
examples/sycl/README.md		patch \| blob \| history
ggml/include/ggml-backend.h		patch \| blob \| history
ggml/include/ggml-opt.h		patch \| blob \| history
ggml/include/ggml.h		patch \| blob \| history
ggml/src/ggml-cpu/amx/mmq.cpp		patch \| blob \| history
ggml/src/ggml-cpu/arch/arm/quants.c		patch \| blob \| history
ggml/src/ggml-cpu/arch/arm/repack.cpp		patch \| blob \| history
ggml/src/ggml-cpu/arch/x86/repack.cpp		patch \| blob \| history
ggml/src/ggml-cpu/ggml-cpu.c		patch \| blob \| history
ggml/src/ggml-cpu/llamafile/sgemm.cpp		patch \| blob \| history
ggml/src/ggml-cpu/ops.cpp		patch \| blob \| history
ggml/src/ggml-cpu/repack.cpp		patch \| blob \| history
ggml/src/ggml-cuda/fattn-mma-f16.cuh		patch \| blob \| history
ggml/src/ggml-cuda/fattn-vec.cuh		patch \| blob \| history
ggml/src/ggml-cuda/fattn-wmma-f16.cuh		patch \| blob \| history
ggml/src/ggml-cuda/ggml-cuda.cu		patch \| blob \| history
ggml/src/ggml-cuda/quantize.cu		patch \| blob \| history
ggml/src/ggml-cuda/softmax.cu		patch \| blob \| history
ggml/src/ggml-cuda/solve_tri.cu		patch \| blob \| history
ggml/src/ggml-hexagon/ggml-hexagon.cpp		patch \| blob \| history
ggml/src/ggml-hexagon/htp-drv.cpp		patch \| blob \| history
ggml/src/ggml-hexagon/htp/hvx-inverse.h		patch \| blob \| history
ggml/src/ggml-hexagon/htp/rope-ops.c		patch \| blob \| history
ggml/src/ggml-hexagon/htp/worker-pool.c		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal-device.m		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal-ops.cpp		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal.cpp		patch \| blob \| history
ggml/src/ggml-metal/ggml-metal.metal		patch \| blob \| history
ggml/src/ggml-opencl/ggml-opencl.cpp		patch \| blob \| history
ggml/src/ggml-sycl/common.hpp		patch \| blob \| history
ggml/src/ggml-sycl/quants.hpp		patch \| blob \| history
ggml/src/ggml-sycl/softmax.cpp		patch \| blob \| history
ggml/src/ggml-vulkan/CMakeLists.txt		patch \| blob \| history
gguf-py/gguf/metadata.py		patch \| blob \| history
gguf-py/tests/test_metadata.py		patch \| blob \| history
include/llama.h		patch \| blob \| history
scripts/pr2wt.sh		patch \| blob \| history
scripts/server-bench.py		patch \| blob \| history
src/llama-context.cpp		patch \| blob \| history
src/llama-graph.cpp		patch \| blob \| history
src/llama-kv-cache.cpp		patch \| blob \| history
src/llama-model.cpp		patch \| blob \| history
src/llama-vocab.cpp		patch \| blob \| history
src/models/deepseek2.cpp		patch \| blob \| history
src/models/models.h		patch \| blob \| history
src/unicode.cpp		patch \| blob \| history
tests/test-alloc.cpp		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history
tests/test-chat.cpp		patch \| blob \| history
tools/completion/README.md		patch \| blob \| history
tools/cvector-generator/cvector-generator.cpp		patch \| blob \| history
tools/imatrix/README.md		patch \| blob \| history
tools/mtmd/clip-impl.h		patch \| blob \| history
tools/mtmd/clip-model.h		patch \| blob \| history
tools/mtmd/clip.cpp		patch \| blob \| history
tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py		patch \| blob \| history
tools/mtmd/mtmd-audio.cpp		patch \| blob \| history
tools/perplexity/README.md		patch \| blob \| history
tools/quantize/README.md		patch \| blob \| history
tools/server/public_legacy/index-new.html		patch \| blob \| history
tools/server/public_simplechat/datautils.mjs		patch \| blob \| history
tools/server/public_simplechat/readme.md		patch \| blob \| history
tools/server/public_simplechat/simplechat.js		patch \| blob \| history
tools/server/public_simplechat/ui.mjs		patch \| blob \| history
tools/server/tests/README.md		patch \| blob \| history