ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
-FROM intel/hpckit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
COPY . .
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
- cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
- cmake --build . --config Release --target main server
+ if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+ echo "LLAMA_SYCL_F16 is set" && \
+ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+ fi && \
+ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+ cmake --build . --config Release --target main
-FROM ubuntu:$UBUNTU_VERSION as runtime
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/main /main
-COPY --from=build /app/build/bin/server /server
ENV LC_ALL=C.utf8
--- /dev/null
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+ apt update -y && \
+ apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN mkdir build && \
+ cd build && \
+ cmake .. -DLLAMA_VULKAN=1 && \
+ cmake --build . --config Release --target main
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/main /main && \
+ rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/main" ]
ARG ONEAPI_VERSION=2024.0.1-devel-ubuntu22.04
-ARG UBUNTU_VERSION=22.04
-FROM intel/hpckit:$ONEAPI_VERSION as build
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as build
+ARG LLAMA_SYCL_F16=OFF
RUN apt-get update && \
apt-get install -y git
COPY . .
-# for some reasons, "-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DLLAMA_NATIVE=ON" give worse performance
RUN mkdir build && \
cd build && \
- cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx && \
- cmake --build . --config Release --target main server
-
-FROM ubuntu:$UBUNTU_VERSION as runtime
+ if [ "${LLAMA_SYCL_F16}" = "ON" ]; then \
+ echo "LLAMA_SYCL_F16 is set" && \
+ export OPT_SYCL_F16="-DLLAMA_SYCL_F16=ON"; \
+ fi && \
+ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16} && \
+ cmake --build . --config Release --target server
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION as runtime
COPY --from=build /app/build/bin/server /server
--- /dev/null
+ARG UBUNTU_VERSION=jammy
+
+FROM ubuntu:$UBUNTU_VERSION as build
+
+# Install build tools
+RUN apt update && apt install -y git build-essential cmake wget
+
+# Install Vulkan SDK
+RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
+ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
+ apt update -y && \
+ apt-get install -y vulkan-sdk
+
+# Build it
+WORKDIR /app
+COPY . .
+RUN mkdir build && \
+ cd build && \
+ cmake .. -DLLAMA_VULKAN=1 && \
+ cmake --build . --config Release --target server
+
+# Clean up
+WORKDIR /
+RUN cp /app/build/bin/server /server && \
+ rm -rf /app
+
+ENV LC_ALL=C.utf8
+
+ENTRYPOINT [ "/server" ]
# llama.cpp for SYCL
-[Background](#background)
-
-[OS](#os)
-
-[Intel GPU](#intel-gpu)
-
-[Linux](#linux)
-
-[Windows](#windows)
-
-[Environment Variable](#environment-variable)
-
-[Known Issue](#known-issue)
-
-[Q&A](#q&a)
-
-[Todo](#todo)
+- [Background](#background)
+- [OS](#os)
+- [Intel GPU](#intel-gpu)
+- [Docker](#docker)
+- [Linux](#linux)
+- [Windows](#windows)
+- [Environment Variable](#environment-variable)
+- [Known Issue](#known-issue)
+- [Q&A](#qa)
+- [Todo](#todo)
## Background
|OS|Status|Verified|
|-|-|-|
-|Linux|Support|Ubuntu 22.04|
+|Linux|Support|Ubuntu 22.04, Fedora Silverblue 39|
|Windows|Support|Windows 11|
|Intel Data Center Flex Series| Support| Flex 170|
|Intel Arc Series| Support| Arc 770, 730M|
|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1260P, i7-1165G7|
Note: If the iGPU has fewer than 80 EUs (Execution Units), the inference speed will be too slow to be practical.
For dGPUs, please make sure there is enough device memory. For llama-2-7b.Q4_0, at least 4GB of device memory is recommended.
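+
+A quick way to check the EU count and device memory (a sketch, assuming `clinfo` is installed as described in the [Linux](#linux) section):
+
+```sh
+# "Max compute units" roughly corresponds to the EU count on Intel GPUs,
+# and "Global memory size" shows the available device memory.
+clinfo | grep -iE 'max compute units|global memory size'
+```
+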
+## Docker
+
+Note:
+- Only Docker on Linux is tested; Docker on WSL may not work.
+- You may need to install the Intel GPU driver on the host machine (see the [Linux](#linux) section for how to do that).
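+
+A quick host-side sanity check (a sketch, assuming the standard `i915` kernel driver is used for the Intel GPU):
+
+```sh
+lsmod | grep i915   # the Intel GPU kernel driver should be loaded
+ls /dev/dri         # render nodes such as renderD128 should be present
+```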
+
+### Build the image
+
+You can choose between **F16** and **F32** builds. F16 is faster for long-prompt inference.
+
+```sh
+# For F16:
+#docker build -t llama-cpp-sycl --build-arg="LLAMA_SYCL_F16=ON" -f .devops/main-intel.Dockerfile .
+
+# Or, for F32:
+docker build -t llama-cpp-sycl -f .devops/main-intel.Dockerfile .
+
+# Note: you can also use the ".devops/main-server.Dockerfile", which compiles the "server" example
+```
+
+### Run
+
+```sh
+# Firstly, find all the DRI cards:
+ls -la /dev/dri
+# Then, pick the card that you want to use.
+
+# For example with "/dev/dri/card1"
+docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-sycl -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+
## Linux
### Setup Environment
b. Add user to group: video, render.
-```
+```sh
sudo usermod -aG render username
sudo usermod -aG video username
```
c. Check
-```
+```sh
sudo apt install clinfo
sudo clinfo -l
```
2. Install Intel® oneAPI Base toolkit.
-
a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
We recommend installing to the default folder: **/opt/intel/oneapi**.
b. Check
-```
+```sh
source /opt/intel/oneapi/setvars.sh
sycl-ls
2. Build locally:
-```
+Note:
+- You can choose between **F16** and **F32** builds. F16 is faster for long-prompt inference.
+- By default, all binaries are built, which takes more time. To reduce build time, we recommend building **example/main** only.
+
+```sh
mkdir -p build
cd build
source /opt/intel/oneapi/setvars.sh
-#for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+# For FP16:
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
-#for FP32
+# Or, for FP32:
cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-#build example/main only
+# Build example/main only
#cmake --build . --config Release --target main
-#build all binary
+# Or, build all binaries
cmake --build . --config Release -v
cd ..
or
-```
+```sh
./examples/sycl/build.sh
```
-Note:
-
-- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
-
### Run
1. Put the model file into the **models** folder.
Run without parameter:
-```
+```sh
./build/bin/ls-sycl-device
-or
+# or running the "main" executable and look at the output log:
./build/bin/main
```
Set the device ID to 0 with **GGML_SYCL_DEVICE=0**:
-```
+```sh
GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
```
or run by script:
-```
-./examples/sycl/run-llama2.sh
+```sh
+./examples/sycl/run_llama2.sh
```
Note:
Check [BLIS.md](docs/BLIS.md) for more information.
+- #### SYCL
+ SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+ llama.cpp based on SYCL is used to **support Intel GPU** (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+ For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
+
- #### Intel oneMKL
+    Building through oneAPI compilers will make the avx_vnni instruction set available for Intel processors that do not support avx512 and avx512_vnni (a quick way to check which VNNI flags your CPU reports is shown at the end of this subsection). Please note that this build config **does not support Intel GPU**. For Intel GPU support, please refer to [llama.cpp for SYCL](./README-sycl.md).
+
- Using manual oneAPI installation:
By default, `LLAMA_BLAS_VENDOR` is set to `Generic`, so if you have already sourced the Intel environment script and pass `-DLLAMA_BLAS=ON` to cmake, the MKL version of BLAS will be selected automatically. Otherwise, please install oneAPI and follow the steps below:
```bash
mkdir build
cd build
- source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-runtime docker image, only required for manual installation
+ source /opt/intel/oneapi/setvars.sh # You can skip this step if in oneapi-basekit docker image, only required for manual installation
cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
cmake --build . --config Release
```
- Using oneAPI docker image:
- If you do not want to source the environment vars and install oneAPI manually, you can also build the code using intel docker container: [oneAPI-runtime](https://hub.docker.com/r/intel/oneapi-runtime)
-
- ```bash
- mkdir build
- cd build
- cmake .. -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=Intel10_64lp -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_NATIVE=ON
- cmake --build . --config Release
- ```
-
- Building through oneAPI compilers will make avx_vnni instruction set available for intel processors that do not support avx512 and avx512_vnni.
+    If you do not want to source the environment variables and install oneAPI manually, you can also build the code using the Intel Docker container [oneAPI-basekit](https://hub.docker.com/r/intel/oneapi-basekit) and then use the commands given above.
Check [Optimizing and Running LLaMA2 on Intel® CPU](https://www.intel.com/content/www/us/en/content-details/791610/optimizing-and-running-llama2-on-intel-cpu.html) for more information.
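+
+    A quick, optional check of which VNNI variants the CPU reports (a sketch, assuming a Linux host with GNU grep):
+
+    ```sh
+    # Prints avx_vnni and/or avx512_vnni if the CPU exposes them.
+    grep -o 'avx512_vnni\|avx_vnni' /proc/cpuinfo | sort -u
+    ```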
You can get a list of platforms and devices from the `clinfo -l` command, etc.
-- #### SYCL
+- #### Vulkan
- SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+ **With docker**:
- llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+ You don't need to install Vulkan SDK. It will be installed inside the container.
- For detailed info, please refer to [llama.cpp for SYCL](README-sycl.md).
+ ```sh
+ # Build the image
+ docker build -t llama-cpp-vulkan -f .devops/main-vulkan.Dockerfile .
+
+ # Then, use it:
+ docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+ ```
+
+ **Without docker**:
+
+  First, make sure you have installed the [Vulkan SDK](https://vulkan.lunarg.com/doc/view/latest/linux/getting_started_ubuntu.html).
+ For example, on Ubuntu 22.04 (jammy), use the command below:
+
+ ```bash
+ wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add -
+ wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+ apt update -y
+ apt-get install -y vulkan-sdk
+ # To verify the installation, use the command below:
+ vulkaninfo
+ ```
+
+ Then, build llama.cpp using the cmake command below:
+
+ ```bash
+ mkdir -p build
+ cd build
+ cmake .. -DLLAMA_VULKAN=1
+ cmake --build . --config Release
+ # Test the output binary (with "-ngl 33" to offload all layers to GPU)
+ ./bin/main -m "PATH_TO_MODEL" -p "Hi you how are you" -n 50 -e -ngl 33 -t 4
+
+ # You should see in the output, ggml_vulkan detected your GPU. For example:
+ # ggml_vulkan: Using Intel(R) Graphics (ADL GT2) | uma: 1 | fp16: 1 | warp size: 32
+ ```
### Prepare Data & Run