git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
repo : update links to new url (#11886)
author     Georgi Gerganov <redacted>
           Sat, 15 Feb 2025 14:40:57 +0000 (16:40 +0200)
committer  GitHub <redacted>
           Sat, 15 Feb 2025 14:40:57 +0000 (16:40 +0200)
* repo : update links to new url

ggml-ci

* cont : more urls

ggml-ci
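
For reference, a bulk rename like this can be reproduced, or checked for leftover references, with a recursive search-and-replace over the working tree. The sketch below is a hypothetical reconstruction, assuming GNU sed and a clean checkout; it is not the exact procedure used for this commit:

```bash
# List tracked files that still reference the old organization.
git grep -l 'github.com/ggerganov/'

# Rewrite old URLs to the new organization in place,
# then review the result with `git diff` before committing.
git grep -l 'github.com/ggerganov/' \
    | xargs sed -i 's#github.com/ggerganov/#github.com/ggml-org/#g'
```

A blanket replace like this can also touch URLs that should stay as they are (for example user-profile links), so the resulting diff still needs a manual review, which is why a follow-up "cont : more urls" pass can be needed.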

66 files changed:
.devops/llama-cpp-cuda.srpm.spec
.devops/llama-cpp.srpm.spec
.devops/nix/package.nix
.devops/rocm.Dockerfile
.github/ISSUE_TEMPLATE/020-enhancement.yml
.github/ISSUE_TEMPLATE/030-research.yml
.github/ISSUE_TEMPLATE/040-refactor.yml
.github/ISSUE_TEMPLATE/config.yml
.github/pull_request_template.md
.github/workflows/bench.yml.disabled
.github/workflows/build.yml
.github/workflows/labeler.yml
CONTRIBUTING.md
Makefile
README.md
SECURITY.md
ci/README.md
common/arg.cpp
convert_hf_to_gguf.py
convert_hf_to_gguf_update.py
convert_lora_to_gguf.py
docs/android.md
docs/backend/OPENCL.md
docs/backend/SYCL.md
docs/build.md
docs/cuda-fedora.md
docs/development/HOWTO-add-model.md
docs/docker.md
docs/install.md
examples/cvector-generator/README.md
examples/imatrix/README.md
examples/imatrix/imatrix.cpp
examples/llama.android/llama/src/main/cpp/CMakeLists.txt
examples/llama.swiftui/README.md
examples/llama.vim
examples/llava/README-minicpmo2.6.md
examples/llava/README-minicpmv2.5.md
examples/lookahead/README.md
examples/lookup/README.md
examples/main/README.md
examples/passkey/README.md
examples/pydantic_models_to_grammar_examples.py
examples/quantize/README.md
examples/retrieval/README.md
examples/server/CMakeLists.txt
examples/server/README.md
examples/server/server.cpp
examples/server/utils.hpp
examples/simple-cmake-pkg/README.md
examples/speculative/README.md
flake.nix
ggml/include/ggml-cpu.h
ggml/include/ggml-metal.h
ggml/src/ggml-cpu/ggml-cpu.c
ggml/src/ggml-metal/ggml-metal.m
ggml/src/ggml-metal/ggml-metal.metal
gguf-py/README.md
gguf-py/gguf/scripts/gguf_dump.py
gguf-py/gguf/utility.py
gguf-py/gguf/vocab.py
gguf-py/pyproject.toml
grammars/README.md
include/llama.h
pyproject.toml
scripts/check-requirements.sh
src/unicode.cpp
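
Since the repository now lives under the `ggml-org` organization, existing clones that still fetch from the old path can be repointed. A minimal sketch; GitHub normally redirects the old URL after a transfer, so this step is optional:

```bash
# Show where the current clone fetches from.
git remote get-url origin

# Point an existing clone at the new canonical URL.
git remote set-url origin https://github.com/ggml-org/llama.cpp
```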

index 7425d3a9d7a4027fcff13077ffec3caf6e66d02f..3bbf4a4def2a59cae23c4d504fa1aff9d813e668 100644 (file)
@@ -17,10 +17,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git cuda-toolkit
 Requires:       cuda-toolkit
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp
 
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
index 4d5560089816c6679bd02b673dc57a2206ebfbe7..45902dcf896e0b239ce17560947c532b87379a0f 100644 (file)
@@ -18,10 +18,10 @@ Version:        %( date "+%%Y%%m%%d" )
 Release:        1%{?dist}
 Summary:        CPU Inference of LLaMA model in pure C/C++ (no CUDA/OpenCL)
 License:        MIT
-Source0:        https://github.com/ggerganov/llama.cpp/archive/refs/heads/master.tar.gz
+Source0:        https://github.com/ggml-org/llama.cpp/archive/refs/heads/master.tar.gz
 BuildRequires:  coreutils make gcc-c++ git libstdc++-devel
 Requires:       libstdc++
-URL:            https://github.com/ggerganov/llama.cpp
+URL:            https://github.com/ggml-org/llama.cpp
 
 %define debug_package %{nil}
 %define source_date_epoch_from_changelog 0
index 043c4364b956ac30127e089216be545a499a687c..6e8050a49963528323e074b1e9d8bf32d618ed4b 100644 (file)
@@ -133,12 +133,12 @@ effectiveStdenv.mkDerivation (finalAttrs: {
       --replace '[bundle pathForResource:@"default" ofType:@"metallib"];' "@\"$out/bin/default.metallib\";"
   '';
 
-  # With PR#6015 https://github.com/ggerganov/llama.cpp/pull/6015,
+  # With PR#6015 https://github.com/ggml-org/llama.cpp/pull/6015,
   # `default.metallib` may be compiled with Metal compiler from XCode
   # and we need to escape sandbox on MacOS to access Metal compiler.
   # `xcrun` is used find the path of the Metal compiler, which is varible
   # and not on $PATH
-  # see https://github.com/ggerganov/llama.cpp/pull/6118 for discussion
+  # see https://github.com/ggml-org/llama.cpp/pull/6118 for discussion
   __noChroot = effectiveStdenv.isDarwin && useMetalKit && precompileMetalShaders;
 
   nativeBuildInputs =
@@ -220,7 +220,7 @@ effectiveStdenv.mkDerivation (finalAttrs: {
     broken = (useMetalKit && !effectiveStdenv.isDarwin);
 
     description = "Inference of LLaMA model in pure C/C++${descriptionSuffix}";
-    homepage = "https://github.com/ggerganov/llama.cpp/";
+    homepage = "https://github.com/ggml-org/llama.cpp/";
     license = lib.licenses.mit;
 
     # Accommodates `nix run` and `lib.getExe`
index a8088ea00da5bb946f7a8a2a4830907290bdf7ff..48e7e6aaa5b77fac159f796682e174dc42aefde9 100644 (file)
@@ -11,7 +11,7 @@ ARG BASE_ROCM_DEV_CONTAINER=rocm/dev-ubuntu-${UBUNTU_VERSION}:${ROCM_VERSION}-co
 FROM ${BASE_ROCM_DEV_CONTAINER} AS build
 
 # Unless otherwise specified, we make a fat build.
-# List from https://github.com/ggerganov/llama.cpp/pull/1087#issuecomment-1682807878
+# List from https://github.com/ggml-org/llama.cpp/pull/1087#issuecomment-1682807878
 # This is mostly tied to rocBLAS supported archs.
 # gfx803, gfx900, gfx1032, gfx1101, gfx1102,not officialy supported
 # gfx906 is deprecated
index 02dd4f575a686802a44e0ff66c5725daf7c7934e..cee1446f5a097a643e444cf204d22f512a665ada 100644 (file)
@@ -6,7 +6,7 @@ body:
   - type: markdown
     attributes:
       value: |
-        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggerganov/llama.cpp/discussions/categories/ideas)
+        [Please post your idea first in Discussion if there is not yet a consensus for this enhancement request. This will help to keep this issue tracker focused on enhancements that the community has agreed needs to be implemented.](https://github.com/ggml-org/llama.cpp/discussions/categories/ideas)
 
   - type: checkboxes
     id: prerequisites
@@ -16,11 +16,11 @@ body:
       options:
         - label: I am running the latest code. Mention the version if possible as well.
           required: true
-        - label: I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md).
+        - label: I carefully followed the [README.md](https://github.com/ggml-org/llama.cpp/blob/master/README.md).
           required: true
         - label: I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
           required: true
-        - label: I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new and useful enhancement to share.
+        - label: I reviewed the [Discussions](https://github.com/ggml-org/llama.cpp/discussions), and have a new and useful enhancement to share.
           required: true
 
   - type: textarea
index 18975dbbfd0fe536b4aab5c7d36b965c732cee98..e774550d5908c38aeb7a6823b7dc38d8e49cd3df 100644 (file)
@@ -6,7 +6,7 @@ body:
   - type: markdown
     attributes:
       value: |
-        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
+        Don't forget to check for any [duplicate research issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3A%22research+%F0%9F%94%AC%22)
 
   - type: checkboxes
     id: research-stage
index b6e6ab36defd6383b4dcbe1ee8628a4692edf7aa..2fe94e26c6988a9577fa30d91e88ed993e526a92 100644 (file)
@@ -6,8 +6,8 @@ body:
   - type: markdown
     attributes:
       value: |
-        Don't forget to [check for existing refactor issue tickets](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
-        Also you may want to check [Pull request refactor label as well](https://github.com/ggerganov/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
+        Don't forget to [check for existing refactor issue tickets](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aopen+is%3Aissue+label%3Arefactoring) in case it's already covered.
+        Also you may want to check [Pull request refactor label as well](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Aopen+is%3Apr+label%3Arefactoring) for duplicates too.
 
   - type: textarea
     id: background-description
index eb8c4b472df4c4fb22e7190a320dbf800eea4807..0d246533c95158ed3eb574f6a58fab59f02e3a8a 100644 (file)
@@ -1,11 +1,11 @@
 blank_issues_enabled: true
 contact_links:
   - name: Got an idea?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/ideas
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/ideas
     about: Pop it there. It may then become an enhancement ticket.
   - name: Got a question?
-    url: https://github.com/ggerganov/llama.cpp/discussions/categories/q-a
+    url: https://github.com/ggml-org/llama.cpp/discussions/categories/q-a
     about: Ask a question there!
   - name: Want to contribute?
-    url: https://github.com/ggerganov/llama.cpp/wiki/contribute
+    url: https://github.com/ggml-org/llama.cpp/wiki/contribute
     about: Head to the contribution guide page of the wiki for areas you can help with
index d9f5bdc235a0024ba068e93b93b3c66255ccaae8..d0bdd73c4439c59506c94c6cfadeeb2b09375214 100644 (file)
@@ -1 +1 @@
-*Make sure to read the [contributing guidelines](https://github.com/ggerganov/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
+*Make sure to read the [contributing guidelines](https://github.com/ggml-org/llama.cpp/blob/master/CONTRIBUTING.md) before submitting a PR*
index 1c8787ef78f7e43f991cf9a0d1fb4df40e7e8021..0370c8943fa0e75cd07ab64f3eaa5d27b808d4d0 100644 (file)
@@ -1,5 +1,5 @@
 # TODO: there have been some issues with the workflow, so disabling for now
-#       https://github.com/ggerganov/llama.cpp/issues/7893
+#       https://github.com/ggml-org/llama.cpp/issues/7893
 #
 # Benchmark
 name: Benchmark
@@ -57,17 +57,7 @@ jobs:
 
     if: |
       inputs.gpu-series == 'Standard_NC4as_T4_v3'
-      || (
-        github.event_name == 'schedule'
-        && github.ref_name == 'master'
-        && github.repository_owner == 'ggerganov'
-      )
       || github.event_name == 'pull_request_target'
-      || (
-        github.event_name == 'push'
-        && github.event.ref == 'refs/heads/master'
-        && github.repository_owner == 'ggerganov'
-      )
     steps:
       - name: Clone
         id: checkout
index e632ddd8279c9ff768e3b94bf9f49b7602f61c56..e6893ddd3c9acaba16dbf2cc82cdc151178d5550 100644 (file)
@@ -129,7 +129,7 @@ jobs:
         run: |
           sysctl -a
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
-          # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
+          # https://github.com/ggml-org/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
           cmake -B build \
             -DCMAKE_BUILD_RPATH="@loader_path" \
             -DLLAMA_FATAL_WARNINGS=ON \
index 368dbdbe5dcccfff731c35b4ff1dc15bc9f85695..0b0f300aa402ad80634d1338eb698311e4dc3a9c 100644 (file)
@@ -11,7 +11,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4
       with:
-        repository: "ggerganov/llama.cpp"
+        repository: "ggml-org/llama.cpp"
     - uses: actions/labeler@v5
       with:
         configuration-path: '.github/labeler.yml'
index 8d411982b4379ee9c9be7686809d0448f15131dc..9d4e5a56fe48d929cc6bf9bfae08cfa2f374bfd3 100644 (file)
@@ -12,7 +12,7 @@
 
 - Squash-merge PRs
 - Use the following format for the squashed commit title: `<module> : <commit title> (#<issue_number>)`. For example: `utils : fix typo in utils.py (#1234)`
-- Optionally pick a `<module>` from here: https://github.com/ggerganov/llama.cpp/wiki/Modules
+- Optionally pick a `<module>` from here: https://github.com/ggml-org/llama.cpp/wiki/Modules
 - Consider adding yourself to [CODEOWNERS](CODEOWNERS)
 
 # Coding guidelines
 - Try to follow the existing patterns in the code (indentation, spaces, etc.). In case of doubt use `clang-format` to format the added code
 - For anything not covered in the current guidelines, refer to the [C++ Core Guidelines](https://isocpp.github.io/CppCoreGuidelines/CppCoreGuidelines)
 - Tensors store data in row-major order. We refer to dimension 0 as columns, 1 as rows, 2 as matrices
-- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggerganov/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
+- Matrix multiplication is unconventional: [`C = ggml_mul_mat(ctx, A, B)`](https://github.com/ggml-org/llama.cpp/blob/880e352277fc017df4d5794f0c21c44e1eae2b84/ggml.h#L1058-L1064) means $C^T = A B^T \Leftrightarrow C = B A^T.$
 
 ![matmul](media/matmul.png)
 
 # Naming guidelines
 
 - Use `snake_case` for function, variable and type names
-- Naming usually optimizes for longest common prefix (see https://github.com/ggerganov/ggml/pull/302#discussion_r1243240963)
+- Naming usually optimizes for longest common prefix (see https://github.com/ggml-org/ggml/pull/302#discussion_r1243240963)
 
     ```cpp
     // not OK
 
 The Github issues, PRs and discussions contain a lot of information that can be useful to get familiar with the codebase. For convenience, some of the more important information is referenced from Github projects:
 
-https://github.com/ggerganov/llama.cpp/projects
+https://github.com/ggml-org/llama.cpp/projects
index dc3de3cb14e44d01d0ffdaaded39e071e3c1ad37..662194086eaaf881e4f2a12286941f4251eb00b3 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 ifndef LLAMA_MAKEFILE
-$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)
+$(error The Makefile build is deprecated. Use the CMake build instead. For more details, see https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md)
 endif
 
 # Define the default target now so that it is always the first target
@@ -463,7 +463,7 @@ endif
 ifneq '' '$(findstring mingw,$(shell $(CC) -dumpmachine))'
        # The stack is only 16-byte aligned on Windows, so don't let gcc emit aligned moves.
        # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54412
-       # https://github.com/ggerganov/llama.cpp/issues/2922
+       # https://github.com/ggml-org/llama.cpp/issues/2922
        MK_CFLAGS   += -Xassembler -muse-unaligned-vector-move
        MK_CXXFLAGS += -Xassembler -muse-unaligned-vector-move
 
@@ -1078,8 +1078,8 @@ endif
 ifdef REMOVE_WARNING
 $(info !!! REMOVAL WARNING !!!)
 $(info The following LLAMA_ options have been removed and are no longer supported)
-$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggerganov/llama.cpp/pull/9418))
-$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggerganov/llama.cpp/pull/9418))
+$(info   - LLAMA_DISABLE_LOGS   (https://github.com/ggml-org/llama.cpp/pull/9418))
+$(info   - LLAMA_SERVER_VERBOSE (https://github.com/ggml-org/llama.cpp/pull/9418))
 $(info )
 endif
 
index 7629647d7e42059603a7f82ea4b51aa3bf4974f6..1764cad812c141842248b9de388e1da4c31a0a55 100644 (file)
--- a/README.md
+++ b/README.md
@@ -3,26 +3,26 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
+[![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)
 
-[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
+[Roadmap](https://github.com/users/ggml-org/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
 
 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++
 
 ## Recent API changes
 
-- [Changelog for `libllama` API](https://github.com/ggerganov/llama.cpp/issues/9289)
-- [Changelog for `llama-server` REST API](https://github.com/ggerganov/llama.cpp/issues/9291)
+- [Changelog for `libllama` API](https://github.com/ggml-org/llama.cpp/issues/9289)
+- [Changelog for `llama-server` REST API](https://github.com/ggml-org/llama.cpp/issues/9291)
 
 ## Hot topics
 
-- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggerganov/llama.cpp/pull/11427
+- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427
 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode
-- Universal tool call support in `llama-server`: https://github.com/ggerganov/llama.cpp/pull/9639
+- Universal tool call support in `llama-server`: https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
-- Introducing GGUF-my-LoRA https://github.com/ggerganov/llama.cpp/discussions/10123
-- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggerganov/llama.cpp/discussions/9669
-- Hugging Face GGUF editor: [discussion](https://github.com/ggerganov/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
+- Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123
+- Hugging Face Inference Endpoints now support GGUF out of the box! https://github.com/ggml-org/llama.cpp/discussions/9669
+- Hugging Face GGUF editor: [discussion](https://github.com/ggml-org/llama.cpp/discussions/9268) | [tool](https://huggingface.co/spaces/CISCai/gguf-editor)
 
 ----
 
@@ -39,7 +39,7 @@ range of hardware - locally and in the cloud.
 - Vulkan and SYCL backend support
 - CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity
 
-The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggerganov/ggml) library.
+The `llama.cpp` project is the main playground for developing new features for the [ggml](https://github.com/ggml-org/ggml) library.
 
 <details>
 <summary>Models</summary>
@@ -59,23 +59,23 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [X] [Falcon](https://huggingface.co/models?search=tiiuae/falcon)
 - [X] [Chinese LLaMA / Alpaca](https://github.com/ymcui/Chinese-LLaMA-Alpaca) and [Chinese LLaMA-2 / Alpaca-2](https://github.com/ymcui/Chinese-LLaMA-Alpaca-2)
 - [X] [Vigogne (French)](https://github.com/bofenghuang/vigogne)
-- [X] [BERT](https://github.com/ggerganov/llama.cpp/pull/5423)
+- [X] [BERT](https://github.com/ggml-org/llama.cpp/pull/5423)
 - [X] [Koala](https://bair.berkeley.edu/blog/2023/04/03/koala/)
 - [X] [Baichuan 1 & 2](https://huggingface.co/models?search=baichuan-inc/Baichuan) + [derivations](https://huggingface.co/hiyouga/baichuan-7b-sft)
 - [X] [Aquila 1 & 2](https://huggingface.co/models?search=BAAI/Aquila)
-- [X] [Starcoder models](https://github.com/ggerganov/llama.cpp/pull/3187)
+- [X] [Starcoder models](https://github.com/ggml-org/llama.cpp/pull/3187)
 - [X] [Refact](https://huggingface.co/smallcloudai/Refact-1_6B-fim)
-- [X] [MPT](https://github.com/ggerganov/llama.cpp/pull/3417)
-- [X] [Bloom](https://github.com/ggerganov/llama.cpp/pull/3553)
+- [X] [MPT](https://github.com/ggml-org/llama.cpp/pull/3417)
+- [X] [Bloom](https://github.com/ggml-org/llama.cpp/pull/3553)
 - [x] [Yi models](https://huggingface.co/models?search=01-ai/Yi)
 - [X] [StableLM models](https://huggingface.co/stabilityai)
 - [x] [Deepseek models](https://huggingface.co/models?search=deepseek-ai/deepseek)
 - [x] [Qwen models](https://huggingface.co/models?search=Qwen/Qwen)
-- [x] [PLaMo-13B](https://github.com/ggerganov/llama.cpp/pull/3557)
+- [x] [PLaMo-13B](https://github.com/ggml-org/llama.cpp/pull/3557)
 - [x] [Phi models](https://huggingface.co/models?search=microsoft/phi)
-- [x] [PhiMoE](https://github.com/ggerganov/llama.cpp/pull/11003)
+- [x] [PhiMoE](https://github.com/ggml-org/llama.cpp/pull/11003)
 - [x] [GPT-2](https://huggingface.co/gpt2)
-- [x] [Orion 14B](https://github.com/ggerganov/llama.cpp/pull/5118)
+- [x] [Orion 14B](https://github.com/ggml-org/llama.cpp/pull/5118)
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
@@ -146,7 +146,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - Zig: [deins/llama.cpp.zig](https://github.com/Deins/llama.cpp.zig)
 - Flutter/Dart: [netdur/llama_cpp_dart](https://github.com/netdur/llama_cpp_dart)
 - Flutter: [xuegao-tzx/Fllama](https://github.com/xuegao-tzx/Fllama)
-- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggerganov/llama.cpp/pull/6326)
+- PHP (API bindings and features built on top of llama.cpp): [distantmagic/resonance](https://github.com/distantmagic/resonance) [(more info)](https://github.com/ggml-org/llama.cpp/pull/6326)
 - Guile Scheme: [guile_llama_cpp](https://savannah.nongnu.org/projects/guile-llama-cpp)
 - Swift [srgtuszy/llama-cpp-swift](https://github.com/srgtuszy/llama-cpp-swift)
 - Swift [ShenghaiWang/SwiftLlama](https://github.com/ShenghaiWang/SwiftLlama)
@@ -245,7 +245,7 @@ The project also includes many example programs and tools using the `llama` libr
 - Clone this repository and build locally, see [how to build](docs/build.md)
 - On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
 - Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggerganov/llama.cpp/releases)
+- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
 
 ## Obtaining and quantizing models
 
@@ -258,14 +258,14 @@ You can either manually download the GGUF file or directly use any `llama.cpp`-c
 
 After downloading a model, use the CLI tools to run it locally - see below.
 
-`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggerganov/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
+`llama.cpp` requires the model to be stored in the [GGUF](https://github.com/ggml-org/ggml/blob/master/docs/gguf.md) file format. Models in other data formats can be converted to GGUF using the `convert_*.py` Python scripts in this repo.
 
 The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with `llama.cpp`:
 
 - Use the [GGUF-my-repo space](https://huggingface.co/spaces/ggml-org/gguf-my-repo) to convert to GGUF format and quantize model weights to smaller sizes
-- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggerganov/llama.cpp/discussions/10123)
-- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggerganov/llama.cpp/discussions/9268)
-- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggerganov/llama.cpp/discussions/9669)
+- Use the [GGUF-my-LoRA space](https://huggingface.co/spaces/ggml-org/gguf-my-lora) to convert LoRA adapters to GGUF format (more info: https://github.com/ggml-org/llama.cpp/discussions/10123)
+- Use the [GGUF-editor space](https://huggingface.co/spaces/CISCai/gguf-editor) to edit GGUF meta data in the browser (more info: https://github.com/ggml-org/llama.cpp/discussions/9268)
+- Use the [Inference Endpoints](https://ui.endpoints.huggingface.co/) to directly host `llama.cpp` in the cloud (more info: https://github.com/ggml-org/llama.cpp/discussions/9669)
 
 To learn more about model quantization, [read this documentation](examples/quantize/README.md)
 
@@ -488,9 +488,9 @@ To learn more about model quantization, [read this documentation](examples/quant
 - Collaborators can push to branches in the `llama.cpp` repo and merge PRs into the `master` branch
 - Collaborators will be invited based on contributions
 - Any help with managing issues, PRs and projects is very appreciated!
-- See [good first issues](https://github.com/ggerganov/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
+- See [good first issues](https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22) for tasks suitable for first contributions
 - Read the [CONTRIBUTING.md](CONTRIBUTING.md) for more information
-- Make sure to read this: [Inference at the edge](https://github.com/ggerganov/llama.cpp/discussions/205)
+- Make sure to read this: [Inference at the edge](https://github.com/ggml-org/llama.cpp/discussions/205)
 - A bit of backstory for those who are interested: [Changelog podcast](https://changelog.com/podcast/532)
 
 ## Other documentation
@@ -505,7 +505,7 @@ To learn more about model quantization, [read this documentation](examples/quant
 - [Running on Docker](docs/docker.md)
 - [Build on Android](docs/android.md)
 - [Performance troubleshooting](docs/development/token_generation_performance_tips.md)
-- [GGML tips & tricks](https://github.com/ggerganov/llama.cpp/wiki/GGML-Tips-&-Tricks)
+- [GGML tips & tricks](https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&-Tricks)
 
 #### Seminal papers and background on the models
 
index f4322c6ee4d18c3fb9f694fcc01f2be8640adb50..6a1bb6c32cd8ef8059232d803a6ad2c66f905661 100644 (file)
@@ -62,6 +62,6 @@ Beware that none of the topics under [Using llama.cpp securely](#using-llamacpp-
 <!-- normal version -->
 However, If you have discovered a security vulnerability in this project, please report it privately. **Do not disclose it as a public issue.** This gives us time to work with you to fix the issue before public exposure, reducing the chance that the exploit will be used before a patch is released.
 
-Please disclose it as a private [security advisory](https://github.com/ggerganov/llama.cpp/security/advisories/new).
+Please disclose it as a private [security advisory](https://github.com/ggml-org/llama.cpp/security/advisories/new).
 
 A team of volunteers on a reasonable-effort basis maintains this project. As such, please give us at least 90 days to work on a fix before public exposure.
index 4064705190697a7da430e1b85863afe3c71ea123..8245c9df65db82bacf2b839d3a12269bea4b16a1 100644 (file)
@@ -1,11 +1,11 @@
 # CI
 
-In addition to [Github Actions](https://github.com/ggerganov/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
+In addition to [Github Actions](https://github.com/ggml-org/llama.cpp/actions) `llama.cpp` uses a custom CI framework:
 
 https://github.com/ggml-org/ci
 
 It monitors the `master` branch for new commits and runs the
-[ci/run.sh](https://github.com/ggerganov/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
+[ci/run.sh](https://github.com/ggml-org/llama.cpp/blob/master/ci/run.sh) script on dedicated cloud instances. This allows us
 to execute heavier workloads compared to just using Github Actions. Also with time, the cloud instances will be scaled
 to cover various hardware architectures, including GPU and Apple Silicon instances.
 
index a4d65ad00f675597e93af9297e6cfbb525b29c2b..b016cce0808ee1635c26406694a1d156cc35fdf4 100644 (file)
@@ -1569,7 +1569,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         "- isolate: only spawn threads on CPUs on the node that execution started on\n"
         "- numactl: use the CPU map provided by numactl\n"
         "if run without this previously, it is recommended to drop the system page cache before using this\n"
-        "see https://github.com/ggerganov/llama.cpp/issues/1437",
+        "see https://github.com/ggml-org/llama.cpp/issues/1437",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
             else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
index 018a2a588ae9d4c94767114c3f25f63a9dff48db..8b7c75d85a6f557f78487c3482659286d1d083cc 100755 (executable)
@@ -558,7 +558,7 @@ class Model:
 
     # NOTE: this function is generated by convert_hf_to_gguf_update.py
     #       do not modify it manually!
-    # ref:  https://github.com/ggerganov/llama.cpp/pull/6920
+    # ref:  https://github.com/ggml-org/llama.cpp/pull/6920
     # Marker: Start get_vocab_base_pre
     def get_vocab_base_pre(self, tokenizer) -> str:
         # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
@@ -708,7 +708,7 @@ class Model:
             logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("**          - the pre-tokenization config has changed upstream")
             logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh:  {chkhsh}")
             logger.warning("**************************************************************************************")
@@ -2835,7 +2835,7 @@ class InternLM2Model(Model):
         if chat_eos_token_id is not None:
             # For the chat model, we replace the eos with '<|im_end|>'.
             # TODO: this is a hack, should be fixed
-            #       https://github.com/ggerganov/llama.cpp/pull/6745#issuecomment-2067687048
+            #       https://github.com/ggml-org/llama.cpp/pull/6745#issuecomment-2067687048
             special_vocab.special_token_ids["eos"] = chat_eos_token_id
             logger.warning(f"Replace eos:{old_eos} with a special token:{chat_eos_token_id}"
                            " in chat mode so that the conversation can end normally.")
index cea34413f441fac33f2b571bca84477689af1108..fa4989a80c5447c9b981fc5e1e6bcf3e06a7af4a 100755 (executable)
@@ -8,7 +8,7 @@
 # provide the necessary information to llama.cpp via the GGUF header in order to implement
 # the same pre-tokenizer.
 #
-# ref: https://github.com/ggerganov/llama.cpp/pull/6920
+# ref: https://github.com/ggml-org/llama.cpp/pull/6920
 #
 # Instructions:
 #
@@ -246,7 +246,7 @@ src_func = f"""
             logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
             logger.warning("**          - the pre-tokenization config has changed upstream")
             logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
-            logger.warning("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
+            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
             logger.warning("**")
             logger.warning(f"** chkhsh:  {{chkhsh}}")
             logger.warning("**************************************************************************************")
index 6dea14a2329e8a53aeccb6f97de2d0426432f904..bdc991533b4e02cf069ba8dc5de758f6f70b13a3 100755 (executable)
@@ -395,7 +395,7 @@ if __name__ == '__main__':
                         logger.error(f"Unexpected name '{name}': Not a lora_A or lora_B tensor")
                         if ".embed_tokens.weight" in name or ".lm_head.weight" in name:
                             logger.error("Embeddings is present in the adapter. This can be due to new tokens added during fine tuning")
-                            logger.error("Please refer to https://github.com/ggerganov/llama.cpp/pull/9948")
+                            logger.error("Please refer to https://github.com/ggml-org/llama.cpp/pull/9948")
                         sys.exit(1)
 
                     if base_name in tensor_map:
@@ -419,7 +419,7 @@ if __name__ == '__main__':
                 # some archs may have the same tensor for lm_head and output (tie word embeddings)
                 # in this case, adapters targeting lm_head will fail when using llama-export-lora
                 # therefore, we ignore them for now
-                # see: https://github.com/ggerganov/llama.cpp/issues/9065
+                # see: https://github.com/ggml-org/llama.cpp/issues/9065
                 if name == "lm_head.weight" and len(dest) == 0:
                     raise ValueError("lm_head is present in adapter, but is ignored in base model")
                 for dest_name, dest_data in dest:
index 47530c6c1d4786fae027aa838c9e47183cc810f4..d2a835653fe5d40c88d10e7504bf9f62bd4da1a9 100644 (file)
@@ -12,7 +12,7 @@ $ apt update && apt upgrade -y
 $ apt install git cmake
 ```
 
-Then, follow the [build instructions](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md), specifically for CMake.
+Then, follow the [build instructions](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md), specifically for CMake.
 
 Once the binaries are built, download your model of choice (e.g., from Hugging Face). It's recommended to place it in the `~/` directory for best performance:
 
index a604058cbeb97698839ad466287cfb05c3c6ffc7..2a946dc8df0ff90bd075a937a3c1f82e0f5d91d0 100644 (file)
@@ -122,7 +122,7 @@ cp libOpenCL.so ~/android-sdk/ndk/26.3.11579264/toolchains/llvm/prebuilt/linux-x
 ```sh
 cd ~/dev/llm
 
-git clone https://github.com/ggerganov/llama.cpp && \
+git clone https://github.com/ggml-org/llama.cpp && \
 cd llama.cpp && \
 mkdir build-android && cd build-android
 
@@ -182,7 +182,7 @@ cmake --build . --target install
 mkdir -p ~/dev/llm
 cd ~/dev/llm
 
-git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp
+git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp
 mkdir build && cd build
 
 cmake .. -G Ninja `
index 89ddbd669afa0e7cf0c5a26436e308e7c34e23dd..0cb39e7927cd6fd9cfa7236ba59edaf6247eca63 100644 (file)
@@ -36,8 +36,8 @@ The following release is verified with good quality:
 
 |Commit ID|Tag|Release|Verified  Platform| Update date|
 |-|-|-|-|-|
-|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
-|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggerganov/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
+|3bcd40b3c593d14261fb2abfabad3c0fb5b9e318|b4040 |[llama-b4040-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b4040/llama-b4040-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1| 2024-11-19|
+|fb76ec31a9914b7761c1727303ab30380fd4f05c|b3038 |[llama-b3038-bin-win-sycl-x64.zip](https://github.com/ggml-org/llama.cpp/releases/download/b3038/llama-b3038-bin-win-sycl-x64.zip) |Arc770/Linux/oneAPI 2024.1<br>MTL Arc GPU/Windows 11/oneAPI 2024.1||
 
 
 ## News
@@ -58,7 +58,7 @@ The following release is verified with good quality:
 - 2024.3
   - Release binary files of Windows.
   - A blog is published: **Run LLM on all Intel GPUs Using llama.cpp**: [intel.com](https://www.intel.com/content/www/us/en/developer/articles/technical/run-llm-on-all-gpus-using-llama-cpp-artical.html) or [medium.com](https://medium.com/@jianyu_neo/run-llm-on-all-intel-gpus-using-llama-cpp-fd2e2dcbd9bd).
-  - New base line is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437).
+  - New base line is ready: [tag b2437](https://github.com/ggml-org/llama.cpp/tree/b2437).
   - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
   - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
   - Support detecting all GPUs with level-zero and same top **Max compute units**.
index 8b812bc899b772119350d89c75481c056ee713a1..69480aa0849a5adffee20ae1b6e96e58e344188e 100644 (file)
@@ -3,7 +3,7 @@
 **To get the Code:**
 
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
index 9c88b769415cf7b4175ec84894c770602b283f32..75cd2b499d086fef53399c8ab0f997ee4d82eefa 100644 (file)
@@ -248,7 +248,7 @@ You have successfully set up CUDA on Fedora within a toolbox environment using t
 
 - **Building `llama.cpp`:**
 
-  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
+  - With CUDA installed, you can follow these [build instructions for `llama.cpp`](https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md) to compile it with CUDA support.
   - Ensure that any CUDA-specific build flags or paths are correctly set in your build configuration.
 
 - **Using the Toolbox Environment:**
index 8fcd7081130f25c29f8936a7b1e26b8101a33a82..78c6f76077a2b3d2e72c34b08cc2ad4f9aa066bb 100644 (file)
@@ -104,16 +104,16 @@ Note: to debug the inference graph: you can use [llama-eval-callback](/examples/
 
 ## GGUF specification
 
-https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
+https://github.com/ggml-org/ggml/blob/master/docs/gguf.md
 
 ## Resources
 
-- YaRN RoPE scaling https://github.com/ggerganov/llama.cpp/pull/2268
-- support Baichuan serial models https://github.com/ggerganov/llama.cpp/pull/3009
-- support attention bias https://github.com/ggerganov/llama.cpp/pull/4283
-- Mixtral support https://github.com/ggerganov/llama.cpp/pull/4406
-- BERT embeddings https://github.com/ggerganov/llama.cpp/pull/5423
-- Grok-1 support https://github.com/ggerganov/llama.cpp/pull/6204
-- Command R Plus support https://github.com/ggerganov/llama.cpp/pull/6491
-- support arch DBRX https://github.com/ggerganov/llama.cpp/pull/6515
-- How to convert HuggingFace model to GGUF format https://github.com/ggerganov/llama.cpp/discussions/2948
+- YaRN RoPE scaling https://github.com/ggml-org/llama.cpp/pull/2268
+- support Baichuan serial models https://github.com/ggml-org/llama.cpp/pull/3009
+- support attention bias https://github.com/ggml-org/llama.cpp/pull/4283
+- Mixtral support https://github.com/ggml-org/llama.cpp/pull/4406
+- BERT embeddings https://github.com/ggml-org/llama.cpp/pull/5423
+- Grok-1 support https://github.com/ggml-org/llama.cpp/pull/6204
+- Command R Plus support https://github.com/ggml-org/llama.cpp/pull/6491
+- support arch DBRX https://github.com/ggml-org/llama.cpp/pull/6515
+- How to convert HuggingFace model to GGUF format https://github.com/ggml-org/llama.cpp/discussions/2948
index 58b5d381d353a7a8de13091f66ac4fa8aed0d4c6..343146dbd214f494455821d6b6e6d8991f46f87e 100644 (file)
@@ -7,21 +7,21 @@
 ## Images
 We have three Docker images available for this project:
 
-1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
-2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
+1. `ghcr.io/ggml-org/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
+2. `ghcr.io/ggml-org/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggml-org/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)
 
 Additionally, there the following images, similar to the above:
 
-- `ghcr.io/ggerganov/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
-- `ghcr.io/ggerganov/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
-- `ghcr.io/ggerganov/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-cuda`: Same as `full` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-cuda`: Same as `light` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-cuda`: Same as `server` but compiled with CUDA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:full-rocm`: Same as `full` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:light-rocm`: Same as `light` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:server-rocm`: Same as `server` but compiled with ROCm support. (platforms: `linux/amd64`, `linux/arm64`)
+- `ghcr.io/ggml-org/llama.cpp:full-musa`: Same as `full` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:light-musa`: Same as `light` but compiled with MUSA support. (platforms: `linux/amd64`)
+- `ghcr.io/ggml-org/llama.cpp:server-musa`: Same as `server` but compiled with MUSA support. (platforms: `linux/amd64`)
 
 The GPU enabled images are not currently tested by CI beyond being built. They are not built with any variation from the ones in the Dockerfiles defined in [.devops/](../.devops/) and the GitHub Action defined in [.github/workflows/docker.yml](../.github/workflows/docker.yml). If you need different settings (for example, a different CUDA, ROCm or MUSA library, you'll need to build the images locally for now).
 
@@ -32,25 +32,25 @@ The easiest way to download the models, convert them to ggml and optimize them i
 Replace `/path/to/models` below with the actual path where you downloaded the models.
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-one "/models/" 7B
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --all-in-one "/models/" 7B
 ```
 
 On completion, you are ready to play!
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with a light image:
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with a server image:
 
 ```bash
-docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggerganov/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
+docker run -v /path/to/models:/models -p 8000:8000 ghcr.io/ggml-org/llama.cpp:server -m /models/7B/ggml-model-q4_0.gguf --port 8000 --host 0.0.0.0 -n 512
 ```
 
 ## Docker With CUDA
index 10a568506835beef9ba1c47253175e6007fd4bf0..0e23a2c9e7ae19e37d5285c412221e5de192a7e5 100644 (file)
@@ -7,7 +7,7 @@ On Mac and Linux, the homebrew package manager can be used via
 ```sh
 brew install llama.cpp
 ```
-The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggerganov/llama.cpp/discussions/7668
+The formula is automatically updated with new `llama.cpp` releases. More info: https://github.com/ggml-org/llama.cpp/discussions/7668
 
 ## Nix
 
index be4dd5250f15f85fb9ca024cdb337c6b2de216e3..6d5fd74ad8ca029c8517361feb207282042b2b5c 100644 (file)
@@ -3,9 +3,9 @@
 This example demonstrates how to generate a control vector using gguf models.
 
 Related PRs:
-- [Add support for control vectors](https://github.com/ggerganov/llama.cpp/pull/5970)
-- (Issue) [Generate control vector using llama.cpp](https://github.com/ggerganov/llama.cpp/issues/6880)
-- [Add cvector-generator example](https://github.com/ggerganov/llama.cpp/pull/7514)
+- [Add support for control vectors](https://github.com/ggml-org/llama.cpp/pull/5970)
+- (Issue) [Generate control vector using llama.cpp](https://github.com/ggml-org/llama.cpp/issues/6880)
+- [Add cvector-generator example](https://github.com/ggml-org/llama.cpp/pull/7514)
 
 ## Examples
 
index 9c056986b3ed6d0cb4c06da5485c2d4cbc4e1612..bdf248cd3e3334d561fef4858b0093c26def321a 100644 (file)
@@ -1,7 +1,7 @@
 # llama.cpp/examples/imatrix
 
 Compute an importance matrix for a model and given text dataset. Can be used during quantization to enchance the quality of the quantized models.
-More information is available here: https://github.com/ggerganov/llama.cpp/pull/4861
+More information is available here: https://github.com/ggml-org/llama.cpp/pull/4861
 
 ## Usage
 
index 395e2aa47c3019a115479fa2bce90f6e0d83cf9a..4edc0bfacf1259bddcd58396430de90be7d3206e 100644 (file)
@@ -100,7 +100,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
     const float * data = is_host ? (const float *) src1->data : m_src1_data.data();
 
     // this has been adapted to the new format of storing merged experts in a single 3d tensor
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6387
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
     if (t->op == GGML_OP_MUL_MAT_ID) {
         //   ids  -> [n_experts_used, n_tokens]
         //   src1 -> [cols, n_expert_used, n_tokens]
index 2de496574f54abfc54eddf3e4c47ab2f09d9231b..6119fe09b0cb622a66d940c9275a65a8b0784f28 100644 (file)
@@ -14,7 +14,7 @@ project("llama-android")
 #include(FetchContent)
 #FetchContent_Declare(
 #        llama
-#        GIT_REPOSITORY https://github.com/ggerganov/llama.cpp
+#        GIT_REPOSITORY https://github.com/ggml-org/llama.cpp
 #        GIT_TAG        master
 #)
 
index 96cf743d48202f1f537dec6cb692647b9af46577..f717886d661ce5d52f438e4204fb81df53b1ac66 100644 (file)
@@ -3,9 +3,9 @@
 Local inference of llama.cpp on an iPhone. This is a sample app that can be used as a starting
 point for more advanced projects.
 
-For usage instructions and performance stats, check the following discussion: https://github.com/ggerganov/llama.cpp/discussions/4508
+For usage instructions and performance stats, check the following discussion: https://github.com/ggml-org/llama.cpp/discussions/4508
 
-![image](https://github.com/ggerganov/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
+![image](https://github.com/ggml-org/llama.cpp/assets/1991296/2b40284f-8421-47a2-b634-74eece09a299)
 
 Video demonstration:
 
index 57eb2a9772d51db960b9e4f739ba61c4d63c8ee5..af3fd3935d765d4cac760dbcbe361bf1f2abcfaa 100644 (file)
@@ -39,7 +39,7 @@
 "
 "   :call llama#init()
 "
-" more info: https://github.com/ggerganov/llama.cpp/pull/9787
+" more info: https://github.com/ggml-org/llama.cpp/pull/9787
 "
 
 " colors (adjust to your liking)
index 8713a43d64fd82f8356e982535436cbecdc17d7d..8f591506dbbb099efcdfa3c77f0cd2d4f8db8087 100644 (file)
@@ -26,7 +26,7 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 ```
 
 Build llama.cpp using `CMake`:
-https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md
+https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md
 
 ```bash
 cmake -B build
index 1c8498ff9e151cf974b162610517111592463d99..b0e72a0fa7a78bf5fcf643d8ede7218b8754aac7 100644 (file)
@@ -6,7 +6,7 @@ Download [MiniCPM-Llama3-V-2_5](https://huggingface.co/openbmb/MiniCPM-Llama3-V-
 
 Clone llama.cpp:
 ```bash
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 ```
 
index a69a471b47d397c2318787df77081066903a87d6..aab3cd0ca49b9fae016121526c9e80f68389a7ce 100644 (file)
@@ -4,4 +4,4 @@ Demonstration of lookahead decoding technique:
 
 https://lmsys.org/blog/2023-11-21-lookahead-decoding/
 
-More info: https://github.com/ggerganov/llama.cpp/pull/4207
+More info: https://github.com/ggml-org/llama.cpp/pull/4207
index 71c345c037a2fbfb7ee00640f7844f1d53143152..07d73849b06018bc156b3b2a92f68774d29c08dd 100644 (file)
@@ -8,5 +8,5 @@ The key parameters for lookup decoding are `ngram_min`, `ngram_max` and `n_draft
 
 More info:
 
-https://github.com/ggerganov/llama.cpp/pull/4484
-https://github.com/ggerganov/llama.cpp/issues/4226
+https://github.com/ggml-org/llama.cpp/pull/4484
+https://github.com/ggml-org/llama.cpp/issues/4226
index ea71591bd939d571562a00f0ae8034a584344bb7..f7c2497294ab55002ac51ffc57e1e3e9fcf5260a 100644 (file)
@@ -1,6 +1,6 @@
 # llama.cpp/examples/main
 
-This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggerganov/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
+This example program allows you to use various LLaMA language models easily and efficiently. It is specifically designed to work with the [llama.cpp](https://github.com/ggml-org/llama.cpp) project, which provides a plain C/C++ implementation with optional 4-bit quantization support for faster, lower memory inference, and is optimized for desktop CPUs. This program can be used to perform various inference tasks with LLaMA models, including generating text based on user-provided prompts and chat-like interactions with reverse prompts.
 
 ## Table of Contents
 
@@ -121,7 +121,7 @@ When --in-prefix or --in-suffix options are enabled the chat template ( --chat-t
 
 ### Chat templates
 
- `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name.  Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
+ `--chat-template JINJA_TEMPLATE`: This option sets a custom jinja chat template. It accepts a string, not a file name.  Default: template taken from model's metadata. Llama.cpp only supports [some pre-defined templates](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template). These include llama2, llama3, gemma, monarch, chatml, orion, vicuna, vicuna-orca, deepseek, command-r, zephyr. When --in-prefix or --in-suffix options are enabled the chat template ( --chat-template ) is disabled.
 
  Example usage: `--chat-template gemma`
 
index 2b8e910f9658d66a8623f8146e1cd7f772e539ea..2f19597c48d7f2bc7f0e955cfe51b558cfd6956c 100644 (file)
@@ -5,8 +5,8 @@ models ability to recall information from long contexts.
 
 See the following PRs for more info:
 
-- https://github.com/ggerganov/llama.cpp/pull/3856
-- https://github.com/ggerganov/llama.cpp/pull/4810
+- https://github.com/ggml-org/llama.cpp/pull/3856
+- https://github.com/ggml-org/llama.cpp/pull/4810
 
 ### Usage
 
index eb000d5ccba24b4fc304599ffcd6b36016074e58..f94b82ca47570ddfa050d046fd12d041f55e00ef 100755 (executable)
@@ -23,7 +23,7 @@ def create_completion(host, prompt, gbnf_grammar):
     """Calls the /completion API on llama-server.
 
     See
-    https://github.com/ggerganov/llama.cpp/tree/HEAD/examples/server#api-endpoints
+    https://github.com/ggml-org/llama.cpp/tree/HEAD/examples/server#api-endpoints
     """
     print(f"  Request:\n    Grammar:\n{textwrap.indent(gbnf_grammar, '      ')}\n    Prompt:\n{textwrap.indent(prompt.rstrip(), '      ')}")
     headers = {"Content-Type": "application/json"}
index f9cce7b2133347500222d441ddfca46f4bfa72e7..992d00e21b4feb0cb47bb2117510d1fc4b58ae08 100644 (file)
@@ -69,22 +69,22 @@ Several quantization methods are supported. They differ in the resulting model d
 |   13B | ms/tok @ 8th |      - |     73 |     82 |     98 |    105 |    128 |
 |   13B | bits/weight  |   16.0 |    4.5 |    5.0 |    5.5 |    6.0 |    8.5 |
 
-- [k-quants](https://github.com/ggerganov/llama.cpp/pull/1684)
+- [k-quants](https://github.com/ggml-org/llama.cpp/pull/1684)
 - recent k-quants improvements and new i-quants
-  - [#2707](https://github.com/ggerganov/llama.cpp/pull/2707)
-  - [#2807](https://github.com/ggerganov/llama.cpp/pull/2807)
-  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4773)
-  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggerganov/llama.cpp/pull/4856)
-  - [#4861 - importance matrix](https://github.com/ggerganov/llama.cpp/pull/4861)
-  - [#4872 - MoE models](https://github.com/ggerganov/llama.cpp/pull/4872)
-  - [#4897 - 2-bit quantization](https://github.com/ggerganov/llama.cpp/pull/4897)
-  - [#4930 - imatrix for all k-quants](https://github.com/ggerganov/llama.cpp/pull/4930)
-  - [#4951 - imatrix on the GPU](https://github.com/ggerganov/llama.cpp/pull/4957)
-  - [#4969 - imatrix for legacy quants](https://github.com/ggerganov/llama.cpp/pull/4969)
-  - [#4996 - k-quants tuning](https://github.com/ggerganov/llama.cpp/pull/4996)
-  - [#5060 - Q3_K_XS](https://github.com/ggerganov/llama.cpp/pull/5060)
-  - [#5196 - 3-bit i-quants](https://github.com/ggerganov/llama.cpp/pull/5196)
-  - [quantization tuning](https://github.com/ggerganov/llama.cpp/pull/5320), [another one](https://github.com/ggerganov/llama.cpp/pull/5334), and [another one](https://github.com/ggerganov/llama.cpp/pull/5361)
+  - [#2707](https://github.com/ggml-org/llama.cpp/pull/2707)
+  - [#2807](https://github.com/ggml-org/llama.cpp/pull/2807)
+  - [#4773 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4773)
+  - [#4856 - 2-bit i-quants (inference)](https://github.com/ggml-org/llama.cpp/pull/4856)
+  - [#4861 - importance matrix](https://github.com/ggml-org/llama.cpp/pull/4861)
+  - [#4872 - MoE models](https://github.com/ggml-org/llama.cpp/pull/4872)
+  - [#4897 - 2-bit quantization](https://github.com/ggml-org/llama.cpp/pull/4897)
+  - [#4930 - imatrix for all k-quants](https://github.com/ggml-org/llama.cpp/pull/4930)
+  - [#4951 - imatrix on the GPU](https://github.com/ggml-org/llama.cpp/pull/4957)
+  - [#4969 - imatrix for legacy quants](https://github.com/ggml-org/llama.cpp/pull/4969)
+  - [#4996 - k-quants tuning](https://github.com/ggml-org/llama.cpp/pull/4996)
+  - [#5060 - Q3_K_XS](https://github.com/ggml-org/llama.cpp/pull/5060)
+  - [#5196 - 3-bit i-quants](https://github.com/ggml-org/llama.cpp/pull/5196)
+  - [quantization tuning](https://github.com/ggml-org/llama.cpp/pull/5320), [another one](https://github.com/ggml-org/llama.cpp/pull/5334), and [another one](https://github.com/ggml-org/llama.cpp/pull/5361)
 
 **Llama 2 7B**
 
index bc5f22e2ff15662a8232725670d1eaaf79384d9f..6938a1e96ee359a0942ba0cb3a6dbdb391770543 100644 (file)
@@ -3,7 +3,7 @@
 Demonstration of a simple retrieval technique based on cosine similarity
 
 More info:
-https://github.com/ggerganov/llama.cpp/pull/6193
+https://github.com/ggml-org/llama.cpp/pull/6193
 
 ### How to use
 
index 1b7cc8c1328e463b1c332085439fa250c9f384e8..aee90388e4fb3f788e202598c7f5329fc1e2e89f 100644 (file)
@@ -5,7 +5,7 @@ option(LLAMA_SERVER_SSL "Build SSL support for the server" OFF)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR})
 
 if (MINGW)
-    # fix: https://github.com/ggerganov/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
+    # fix: https://github.com/ggml-org/llama.cpp/actions/runs/9651004652/job/26617901362?pr=8006
     add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
 endif()
 
index 751d4db9ee9d79410d07c1890678729772460962..a2ae614d7c4892c7c1e5e8dce1204ab2e77170ee 100644 (file)
@@ -7,14 +7,14 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
 **Features:**
  * LLM inference of F16 and quantized models on GPU and CPU
  * [OpenAI API](https://github.com/openai/openai-openapi) compatible chat completions and embeddings routes
- * Reranking endoint (WIP: https://github.com/ggerganov/llama.cpp/pull/9510)
+ * Reranking endpoint (WIP: https://github.com/ggml-org/llama.cpp/pull/9510)
  * Parallel decoding with multi-user support
  * Continuous batching
  * Multimodal (wip)
  * Monitoring endpoints
  * Schema-constrained JSON response format
 
-The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggerganov/llama.cpp/issues/4216).
+The project is under active development, and we are [looking for feedback and contributors](https://github.com/ggml-org/llama.cpp/issues/4216).
 
 ## Usage
 
@@ -65,7 +65,7 @@ The project is under active development, and we are [looking for feedback and co
 | `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
 | `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
 | `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
-| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggerganov/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
+| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
 | `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
 | `--list-devices` | print list of available devices and exit |
 | `-ngl, --gpu-layers, --n-gpu-layers N` | number of layers to store in VRAM<br/>(env: LLAMA_ARG_N_GPU_LAYERS) |
@@ -178,7 +178,7 @@ Example usage of docker compose with environment variables:
 ```yml
 services:
   llamacpp-server:
-    image: ghcr.io/ggerganov/llama.cpp:server
+    image: ghcr.io/ggml-org/llama.cpp:server
     ports:
       - 8080:8080
     volumes:
@@ -273,10 +273,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
 ### Docker
 
 ```bash
-docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggml-org/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
 
 # or, with CUDA:
-docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggml-org/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
 ```
 
 ## Testing with CURL
@@ -1066,7 +1066,7 @@ print(completion.choices[0].text)
 
 ### POST `/v1/chat/completions`: OpenAI-compatible Chat Completions API
 
-Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
+Given a ChatML-formatted JSON description in `messages`, it returns the predicted completion. Both synchronous and streaming modes are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with the OpenAI API spec are being made, in our experience it suffices to support many apps. Only models with a [supported chat template](https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) can be used optimally with this endpoint. By default, the ChatML template will be used.
 
 *Options:*
 
@@ -1120,7 +1120,7 @@ curl http://localhost:8080/v1/chat/completions \
 
 *Tool call support*
 
-[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggerganov/llama.cpp/pull/9639):
+[Function calling](https://platform.openai.com/docs/guides/function-calling) is supported for all models (see https://github.com/ggml-org/llama.cpp/pull/9639):
 
 - Requires `--jinja` flag
 - Native tool call formats supported:
@@ -1599,7 +1599,7 @@ Apart from error types supported by OAI, we also have custom types that are spec
 
 ### Legacy completion web UI
 
-A new chat-based UI has replaced the old completion-based since [this PR](https://github.com/ggerganov/llama.cpp/pull/10175). If you want to use the old completion, start the server with `--path ./examples/server/public_legacy`
+A new chat-based UI has replaced the old completion-based one since [this PR](https://github.com/ggml-org/llama.cpp/pull/10175). If you want to use the old completion UI, start the server with `--path ./examples/server/public_legacy`
 
 For example:
 
index 71151183b81dab50770e67a7640c23dc160b5a9f..9ffec0a64ad94e362b7f4445a684d82279d0ab31 100644 (file)
@@ -42,7 +42,7 @@ enum stop_type {
     STOP_TYPE_LIMIT,
 };
 
-// state diagram: https://github.com/ggerganov/llama.cpp/pull/9283
+// state diagram: https://github.com/ggml-org/llama.cpp/pull/9283
 enum slot_state {
     SLOT_STATE_IDLE,
     SLOT_STATE_STARTED, // TODO: this state is only used for setting up the initial prompt processing; maybe merge it with launch_slot_with_task in the future
index 86de0e6d789770c2b2036e00fb0fa9eb860eb8be..b5aebebba4ac7b62ccd57c4f6c3d7d00d39ffdb9 100644 (file)
@@ -367,10 +367,10 @@ inline std::string format_chat(const common_chat_template & tmpl, const std::vec
                     }
                 }
             } else {
-                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+                throw std::runtime_error("Invalid 'content' type (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
             }
         } else {
-            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggerganov/llama.cpp/issues/8367)");
+            throw std::runtime_error("Missing 'content' (ref: https://github.com/ggml-org/llama.cpp/issues/8367)");
         }
 
         chat.push_back({role, content, /* tool_calls= */ {}});
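
The checks above are what reject chat requests whose messages lack a usable `content` field; a minimal request that passes them might look like the following sketch (the server address, port, and message text are placeholder assumptions).

```bash
# Hypothetical request against a locally running llama-server on port 8080.
# Every message carries both a "role" and a string "content", so format_chat accepts it.
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user",   "content": "Hello!"}
          ]
        }'
```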
index 8b30049e247cc8e965ddfbdf779de5f786193a33..d7430cc9c2083622b6c760a38adba48a5edd1c1d 100644 (file)
@@ -1,6 +1,6 @@
 # llama.cpp/example/simple-cmake-pkg
 
-This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggerganov/llama.cpp) in projects which live outside of the source tree.
+This program builds [simple](../simple) using a relocatable CMake package. It serves as an example of using the `find_package()` CMake command to conveniently include [llama.cpp](https://github.com/ggml-org/llama.cpp) in projects which live outside of the source tree.
 
 ## Building
 
@@ -13,7 +13,7 @@ When hardware acceleration libraries are used (e.g. CUDA, Metal, Vulkan, etc.),
 ### Build llama.cpp and install to llama.cpp/inst
 
 ```sh
-git clone https://github.com/ggerganov/llama.cpp
+git clone https://github.com/ggml-org/llama.cpp
 cd llama.cpp
 cmake -S . -B build
 cmake --build build
index a6608c5fe8e3adf7fdaa2fda4e822df2631b033f..36ab3708629d2f0a59960e6f2c8764a7530b542b 100644 (file)
@@ -4,6 +4,6 @@ Demonstration of speculative decoding and tree-based speculative decoding techni
 
 More info:
 
-- https://github.com/ggerganov/llama.cpp/pull/2926
-- https://github.com/ggerganov/llama.cpp/pull/3624
-- https://github.com/ggerganov/llama.cpp/pull/5625
+- https://github.com/ggml-org/llama.cpp/pull/2926
+- https://github.com/ggml-org/llama.cpp/pull/3624
+- https://github.com/ggml-org/llama.cpp/pull/5625
index 26a2588169101f58b8dab85051f8dd213d4285aa..0b5edf911fd066be7a53f0e83636c9d2cf97efb5 100644 (file)
--- a/flake.nix
+++ b/flake.nix
@@ -36,7 +36,7 @@
   # ```
   # nixConfig = {
   #   extra-substituters = [
-  #     # Populated by the CI in ggerganov/llama.cpp
+  #     # Populated by the CI in ggml-org/llama.cpp
   #     "https://llama-cpp.cachix.org"
   #
   #     # A development cache for nixpkgs imported with `config.cudaSupport = true`.
   # };
   # ```
 
-  # For inspection, use `nix flake show github:ggerganov/llama.cpp` or the nix repl:
+  # For inspection, use `nix flake show github:ggml-org/llama.cpp` or the nix repl:
   #
   # ```bash
   # ❯ nix repl
-  # nix-repl> :lf github:ggerganov/llama.cpp
+  # nix-repl> :lf github:ggml-org/llama.cpp
   # Added 13 variables.
   # nix-repl> outputs.apps.x86_64-linux.quantize
   # { program = "/nix/store/00000000000000000000000000000000-llama.cpp/bin/llama-quantize"; type = "app"; }
             #
             # We could test all outputs e.g. as `checks = config.packages`.
             #
-            # TODO: Build more once https://github.com/ggerganov/llama.cpp/issues/6346 has been addressed
+            # TODO: Build more once https://github.com/ggml-org/llama.cpp/issues/6346 has been addressed
             checks = {
               inherit (config.packages) default vulkan;
             };
index 3aa71badb5fb083ffea5e8ef8b61f49848cb75df..d23c6b262e202159ac6c890a5706fc1c01582250 100644 (file)
@@ -8,7 +8,7 @@ extern "C" {
 #endif
 
     // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
+    // since https://github.com/ggml-org/ggml/issues/287
     struct ggml_cplan {
         size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
         uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
index 669c1f84ae6e33c7e14520a10b1efc931f7d772b..a610694423483a2735ae8213fa690a52eaf08073 100644 (file)
@@ -45,7 +45,7 @@ GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
 
 GGML_DEPRECATED(
         GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
-        "obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
+        "obsoleted by the new device interface - https://github.com/ggml-org/llama.cpp/pull/9713");
 
 GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
 
index 0cbf8318bedb90c663cf374eb447ccf86dd74bfa..dbef5df2111c6c4ca8588fee5cce9f184022f3c2 100644 (file)
@@ -1816,7 +1816,7 @@ inline static float ggml_silu_f32(float x) {
 
 #if __FINITE_MATH_ONLY__
 #error "some routines in ggml.c require non-finite math arithmetics -- pass -fno-finite-math-only to the compiler to fix"
-#error "ref: https://github.com/ggerganov/llama.cpp/pull/7154#issuecomment-2143844461"
+#error "ref: https://github.com/ggml-org/llama.cpp/pull/7154#issuecomment-2143844461"
 #endif
 
 #if defined(__ARM_NEON) && defined(__aarch64__)
@@ -7574,7 +7574,7 @@ UseGgmlGemm2:;
     int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;
 
     // If the chunking is poor for the number of threads on this setup, scrap the whole plan.  Re-chunk it by thread.
-    //   Also, chunking by thread was measured to have perform better on NUMA systems.  See https://github.com/ggerganov/llama.cpp/pull/6915
+    //   Also, chunking by thread was measured to perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915
     //   In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that.
     if (nchunk0 * nchunk1 < nth * 4 || ggml_is_numa()) {
         // distribute the thread work across the inner or outer loop based on which one is larger
index 944d90af3443204eee8e28b78e5d98182289c921..0add6b51a406d9e3148584e25f6ff2891e875bff 100644 (file)
@@ -1983,7 +1983,7 @@ static void ggml_metal_encode_node(
                 const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
 
                 // TODO: add ggml_metal_kargs struct
-                // TODO: optimize (see https://github.com/ggerganov/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
+                // TODO: optimize (see https://github.com/ggml-org/llama.cpp/pull/10238/commits/7941b6b9ec29a2866fec6fa6c51612515ca509f6)
                 [encoder setComputePipelineState:pipeline];
                 [encoder setBuffer:id_src0 offset:offs_src0   atIndex:0];
                 if (id_src1) {
index 44f04c909bfb2d59c4e17ac9b938275467e79ed7..da415184b173ca286980474111de937bc2be3d3a 100644 (file)
@@ -1058,7 +1058,7 @@ kernel void kernel_soft_max(
     }
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
@@ -1163,7 +1163,7 @@ kernel void kernel_soft_max_4(
     const float lsum = lsum4[0] + lsum4[1] + lsum4[2] + lsum4[3];
 
     // This barrier fixes a failing test
-    // ref: https://github.com/ggerganov/ggml/pull/621#discussion_r1425156335
+    // ref: https://github.com/ggml-org/ggml/pull/621#discussion_r1425156335
     threadgroup_barrier(mem_flags::mem_none);
 
     float sum = simd_sum(lsum);
index 2e513633d1c5abbd2c9089d36dc09cbd9f474658..dd4ab7bde763a437ad486ac82b7410485621f027 100644 (file)
@@ -1,9 +1,9 @@
 ## gguf
 
-This is a Python package for writing binary files in the [GGUF](https://github.com/ggerganov/ggml/pull/302)
+This is a Python package for writing binary files in the [GGUF](https://github.com/ggml-org/ggml/pull/302)
 (GGML Universal File) format.
 
-See [convert_hf_to_gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)
+See [convert_hf_to_gguf.py](https://github.com/ggml-org/llama.cpp/blob/master/convert_hf_to_gguf.py)
 as an example for its usage.
 
 ## Installation
@@ -13,17 +13,17 @@ pip install gguf
 
 ## API Examples/Simple Tools
 
-[examples/writer.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
+[examples/writer.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/writer.py) — Generates `example.gguf` in the current directory to demonstrate generating a GGUF file. Note that this file cannot be used as a model.
 
-[examples/reader.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format.
+[examples/reader.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/examples/reader.py) — Extracts and displays key-value pairs and tensor details from a GGUF file in a readable format.
 
-[gguf/scripts/gguf_dump.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
+[gguf/scripts/gguf_dump.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_dump.py) — Dumps a GGUF file's metadata to the console.
 
-[gguf/scripts/gguf_set_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
+[gguf/scripts/gguf_set_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_set_metadata.py) — Allows changing simple metadata values in a GGUF file by key.
 
-[gguf/scripts/gguf_convert_endian.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
+[gguf/scripts/gguf_convert_endian.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_convert_endian.py) — Allows converting the endianness of GGUF files.
 
-[gguf/scripts/gguf_new_metadata.py](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
+[gguf/scripts/gguf_new_metadata.py](https://github.com/ggml-org/llama.cpp/blob/master/gguf-py/gguf/scripts/gguf_new_metadata.py) — Copies a GGUF file with added/modified/removed metadata values.
 
 ## Development
 Maintainers who participate in development of this package are advised to install it in editable mode:
index f95b4fd4827c69dcc28829c67babf4ba71b5ca51..20f23d729f4b925b532b0fcd91f50dfc721b93f0 100755 (executable)
@@ -181,7 +181,7 @@ def element_count_rounded_notation(count: int) -> str:
 def translate_tensor_name(name):
     words = name.split(".")
 
-    # Source: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#standardized-tensor-names
+    # Source: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#standardized-tensor-names
     abbreviation_dictionary = {
         'token_embd': 'Token embedding',
         'pos_embd': 'Position embedding',
index 40d59b75ee04ec6b46d219ea3be0b3a8fb8b3f35..ae92d786a4068eaaa92b351ec169960566fb9e6f 100644 (file)
@@ -47,7 +47,7 @@ def size_label(total_params: int, shared_params: int, expert_params: int, expert
 
 
 def naming_convention(model_name: str | None, base_name: str | None, finetune_string: str | None, version_string: str | None, size_label: str | None, output_type: str | None, model_type: Literal['vocab', 'LoRA'] | None = None) -> str:
-    # Reference: https://github.com/ggerganov/ggml/blob/master/docs/gguf.md#gguf-naming-convention
+    # Reference: https://github.com/ggml-org/ggml/blob/master/docs/gguf.md#gguf-naming-convention
 
     if base_name is not None:
         name = base_name.strip().replace(' ', '-').replace('/', '-')
index f2645f92101dbd118172ac30a749add9ae4a394a..2ef7d14ab15c0a3beb300f17651b6f8b88ae8542 100644 (file)
@@ -127,7 +127,7 @@ class SpecialVocab:
                         self.merges = merges
                     elif isinstance(merges[0], list) and len(merges[0]) == 2 and isinstance(merges[0][0], str):
                         # New format since transformers 4.45 to support spaces in merges
-                        # ref: https://github.com/ggerganov/llama.cpp/issues/9692
+                        # ref: https://github.com/ggml-org/llama.cpp/issues/9692
                         # TODO: internally store as the new format instead of converting to old
                         if any(' ' in s for pair in merges for s in pair):
                             logger.warning(f'Spaces in merges detected, encoding as {chr(ord(" ") + 256)!r}')
index 78c6baa64a3656ea6f08d75de060024a701a4b0a..b4a47333dd15f85869cb7e5f2bfb92243fcd9ada 100644 (file)
@@ -9,7 +9,7 @@ packages = [
 ]
 readme = "README.md"
 homepage = "https://ggml.ai"
-repository = "https://github.com/ggerganov/llama.cpp"
+repository = "https://github.com/ggml-org/llama.cpp"
 keywords = ["ggml", "gguf", "llama.cpp"]
 classifiers = [
     "Programming Language :: Python :: 3",
index 9769540919f98d972b6644b55e1ca81ce323cb2c..935213f5c1849c66c28cf07da009074d5b8f37ad 100644 (file)
@@ -98,7 +98,7 @@ This guide provides a brief overview. Check out the GBNF files in this directory
 
 ## Troubleshooting
 
-Grammars currently have performance gotchas (see https://github.com/ggerganov/llama.cpp/issues/4218).
+Grammars currently have performance gotchas (see https://github.com/ggml-org/llama.cpp/issues/4218).
 
 ### Efficient optional repetitions
 
@@ -126,7 +126,7 @@ You can use GBNF grammars:
     - in CLI, with [examples/json_schema_to_grammar.py](../examples/json_schema_to_grammar.py)
     - in JavaScript with [json-schema-to-grammar.mjs](../examples/server/public_legacy/json-schema-to-grammar.mjs) (this is used by the [server](../examples/server)'s Web UI)
 
-Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggerganov/llama.cpp/pull/5978, https://github.com/ggerganov/llama.cpp/pull/6659 & https://github.com/ggerganov/llama.cpp/pull/6555).
+Take a look at [tests](../tests/test-json-schema-to-grammar.cpp) to see which features are likely supported (you'll also find usage examples in https://github.com/ggml-org/llama.cpp/pull/5978, https://github.com/ggml-org/llama.cpp/pull/6659 & https://github.com/ggml-org/llama.cpp/pull/6555).
 
 ```bash
 llama-cli \
@@ -185,10 +185,10 @@ Here is also a list of known limitations (contributions welcome):
 - `additionalProperties` defaults to `false` (produces faster grammars + reduces hallucinations).
 - `"additionalProperties": true` may produce keys that contain unescaped newlines.
 - Unsupported features are skipped silently. It is currently advised to use the command-line Python converter (see above) to see any warnings, and to inspect the resulting grammar / test it w/ [llama-gbnf-validator](../examples/gbnf-validator/gbnf-validator.cpp).
-- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggerganov/llama.cpp/issues/7703)
+- Can't mix `properties` w/ `anyOf` / `oneOf` in the same type (https://github.com/ggml-org/llama.cpp/issues/7703)
 - [prefixItems](https://json-schema.org/draft/2020-12/json-schema-core#name-prefixitems) is broken (but [items](https://json-schema.org/draft/2020-12/json-schema-core#name-items) works)
 - `minimum`, `exclusiveMinimum`, `maximum`, `exclusiveMaximum`: only supported for `"type": "integer"` for now, not `number`
-- Nested `$ref`s are broken (https://github.com/ggerganov/llama.cpp/issues/8073)
+- Nested `$ref`s are broken (https://github.com/ggml-org/llama.cpp/issues/8073)
 - [pattern](https://json-schema.org/draft/2020-12/json-schema-validation#name-pattern)s must start with `^` and end with `$`
 - Remote `$ref`s not supported in the C++ version (Python & JavaScript versions fetch https refs)
 - `string` [formats](https://json-schema.org/draft/2020-12/json-schema-validation#name-defined-formats) lack `uri`, `email`
index 1f5f3a09b311e0d6af0746e1414a7b8f98bb9a6f..b0726cbe63ea6b1480ab4008a6c97d85c18c4a0e 100644 (file)
@@ -213,7 +213,7 @@ extern "C" {
         LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
-    // TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
+    // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id; // token id
         float logit;    // log-odds of the token
@@ -307,7 +307,7 @@ extern "C" {
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
-    //       https://github.com/ggerganov/llama.cpp/pull/7544
+    //       https://github.com/ggml-org/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t n_ctx;             // text context, 0 = from model
         uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
@@ -320,7 +320,7 @@ extern "C" {
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
 
-        // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+        // ref: https://github.com/ggml-org/llama.cpp/pull/2054
         float    rope_freq_base;   // RoPE base frequency, 0 = from model
         float    rope_freq_scale;  // RoPE frequency scaling factor, 0 = from model
         float    yarn_ext_factor;  // YaRN extrapolation mix factor, negative = from model
@@ -385,7 +385,7 @@ extern "C" {
     struct llama_adapter_lora;
 
     // Helpers for getting default parameters
-    // TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
+    // TODO: update API to start accepting pointers to params structs (https://github.com/ggml-org/llama.cpp/discussions/9172)
     LLAMA_API struct llama_model_params          llama_model_default_params(void);
     LLAMA_API struct llama_context_params        llama_context_default_params(void);
     LLAMA_API struct llama_sampler_chain_params  llama_sampler_chain_default_params(void);
@@ -1040,7 +1040,7 @@ extern "C" {
 
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
-    /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
+    /// NOTE: This function does not use a Jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggml-org/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
     /// @param tmpl A Jinja template to use for this chat. If this is nullptr, the model’s default chat template will be used instead.
     /// @param chat Pointer to a list of multiple llama_chat_message
     /// @param n_msg Number of llama_chat_message in this chat
@@ -1149,7 +1149,7 @@ extern "C" {
     /// @details Sorts candidate tokens by their logits in descending order and calculates probabilities based on logits.
     /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
     DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax    (void),
-        "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
+        "will be removed in the future (see https://github.com/ggml-org/llama.cpp/pull/9896#discussion_r1800920915)");
 
     /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_k      (int32_t k);
@@ -1157,7 +1157,7 @@ extern "C" {
     /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
     LLAMA_API struct llama_sampler * llama_sampler_init_top_p      (float   p, size_t min_keep);
 
-    /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
+    /// @details Minimum P sampling as described in https://github.com/ggml-org/llama.cpp/pull/3841
     LLAMA_API struct llama_sampler * llama_sampler_init_min_p      (float   p, size_t min_keep);
 
     /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
@@ -1203,7 +1203,7 @@ extern "C" {
                           const char * grammar_str,
                           const char * grammar_root);
 
-    /// @details Lazy grammar sampler, introduced in https://github.com/ggerganov/llama.cpp/pull/9639
+    /// @details Lazy grammar sampler, introduced in https://github.com/ggml-org/llama.cpp/pull/9639
     /// @param trigger_words A list of words that will trigger the grammar sampler. This may be updated to a loose regex syntax (w/ ^) in the near future.
     /// @param trigger_tokens A list of tokens that will trigger the grammar sampler.
     LLAMA_API struct llama_sampler * llama_sampler_init_grammar_lazy(
index 84e71de6def3854717c580f72a9efb7f710f3aae..ed62264ba62dbdb7603c4e9f15ade9159cf5081e 100644 (file)
@@ -5,7 +5,7 @@ description = "Scripts that ship with llama.cpp"
 authors = ["GGML <ggml@ggml.ai>"]
 readme = "README.md"
 homepage = "https://ggml.ai"
-repository = "https://github.com/ggerganov/llama.cpp"
+repository = "https://github.com/ggml-org/llama.cpp"
 keywords = ["ggml", "gguf", "llama.cpp"]
 packages = [{ include = "*.py", from = "." }]
 classifiers = [
index d3bbded130daff611ae52b3354571c3c9a61faf0..4c3b05f68b7ba6d49ccfd91e8c6ddb6c932154a9 100755 (executable)
@@ -170,7 +170,7 @@ check_convert_script examples/convert_legacy_llama.py
 for py in convert_*.py; do
     # skip convert_hf_to_gguf_update.py
     # TODO: the check is failing for some reason:
-    #       https://github.com/ggerganov/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
+    #       https://github.com/ggml-org/llama.cpp/actions/runs/8875330981/job/24364557177?pr=6920
     [[ $py == convert_hf_to_gguf_update.py ]] && continue
 
     check_convert_script "$py"
index a32ae6d0824f2e3718a4d5483d585781dc386544..e63bb4ab085d68a88861cd9e3c7e9da367b6516b 100644 (file)
@@ -708,7 +708,7 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
     const auto cpts = unicode_cpts_from_utf8(text);
 
     // generate a "collapsed" representation of the text, where all codepoints are replaced by a single byte
-    // ref: https://github.com/ggerganov/llama.cpp/pull/6920#issuecomment-2081479935
+    // ref: https://github.com/ggml-org/llama.cpp/pull/6920#issuecomment-2081479935
     std::string text_collapsed;
     if (need_collapse) {
         // collapse all unicode categories