[FEATURE] Add SYCL feature flags to rllm-llamacpp build (To add support for Intel GPUs) #96

Open · wants to merge 5 commits into main
3 changes: 3 additions & 0 deletions rllm/llama-cpp-low/Cargo.toml
@@ -16,3 +16,6 @@ cmake = "0.1.50"
 [features]
 default = []
 cuda = []
+sycl = []
+sycl_fp16 = []
+sycl_nvidia = []
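Cargo exposes each enabled feature to a crate's build script as a CARGO_FEATURE_<NAME> environment variable set to "1" (name uppercased, dashes mapped to underscores), which is what the new flag checks in build.rs below key off. A minimal sketch of driving these flags directly, assuming the crate name is llama_cpp_low as the feature forwarding in rllm-llamacpp suggests:

    # build the low-level llama.cpp wrapper with SYCL fp16 support
    cargo build -p llama_cpp_low --release --features sycl,sycl_fp16
    # cargo then sets CARGO_FEATURE_SYCL=1 and CARGO_FEATURE_SYCL_FP16=1
    # in the environment seen by build.rs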
64 changes: 61 additions & 3 deletions rllm/llama-cpp-low/build.rs
@@ -5,7 +5,14 @@ const SUBMODULE_DIR: &str = concat!(env!("CARGO_MANIFEST_DIR"), "/llama.cpp");
 
 fn main() {
     let ccache = true;
-    let cuda = std::env::var("CARGO_FEATURE_CUDA").unwrap_or(String::new());
+    let flag_cuda = env::var("CARGO_FEATURE_CUDA").unwrap_or(String::new()) == "1";
+    let flag_sycl = env::var("CARGO_FEATURE_SYCL").unwrap_or(String::new()) == "1";
+    let flag_sycl_fp16 = env::var("CARGO_FEATURE_SYCL_FP16").unwrap_or(String::new()) == "1";
+    let flag_sycl_nvidia = env::var("CARGO_FEATURE_SYCL_NVIDIA").unwrap_or(String::new()) == "1";
+
+    // oneAPI environment variables
+    let mkl_root = env::var("MKLROOT");
+    let cmplr_root = env::var("CMPLR_ROOT");
 
     let submodule_dir = &PathBuf::from(SUBMODULE_DIR);
     let header_path = submodule_dir.join("llama.h");
@@ -29,15 +36,66 @@ fn main() {
             .configure_arg("-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache");
     }
 
-    if cuda == "1" {
+    if flag_cuda && flag_sycl {
+        panic!("Only one of the cuda and sycl features can be enabled at a time!");
+    }
+    if flag_cuda {
         cmake.configure_arg("-DLLAMA_CUBLAS=ON");
         println!("cargo:rustc-link-search=/usr/local/cuda/lib64");
         println!("cargo:rustc-link-lib=cuda");
         println!("cargo:rustc-link-lib=cudart");
         println!("cargo:rustc-link-lib=cublas");
         println!("cargo:rustc-link-lib=cupti");
-    }
+    } else if flag_sycl {
+        assert!(mkl_root.is_ok(), "MKLROOT is not set (please `source /opt/intel/oneapi/setvars.sh` if oneAPI is installed)");
+        assert!(cmplr_root.is_ok(), "CMPLR_ROOT is not set");
+        let mkl_root_str = mkl_root.unwrap();
+        //let cmplr_root_str = cmplr_root.unwrap();
+
+        cmake
+            .define("LLAMA_SYCL", "ON")
+            .define("CMAKE_C_COMPILER", "icx")
+            .define("CMAKE_CXX_COMPILER", "icpx");
+
+        println!("cargo:rustc-link-arg=-fiopenmp");
+        println!("cargo:rustc-link-arg=-fopenmp-targets=spir64_gen");
+        println!("cargo:rustc-link-arg=-fsycl");
+        println!("cargo:rustc-link-arg=-Wl,--no-as-needed");
+        println!("cargo:rustc-link-arg=-Wno-narrowing");
+        println!("cargo:rustc-link-arg=-O3");
+        //println!("cargo:rustc-link-search=native={}/lib", cmplr_root_str);
+        println!("cargo:rustc-link-search=native={}/lib", mkl_root_str);
+        println!("cargo:rustc-link-lib=svml");
+        println!("cargo:rustc-link-lib=mkl_sycl_blas");
+        println!("cargo:rustc-link-lib=mkl_sycl_lapack");
+        println!("cargo:rustc-link-lib=mkl_sycl_dft");
+        println!("cargo:rustc-link-lib=mkl_sycl_sparse");
+        println!("cargo:rustc-link-lib=mkl_sycl_vm");
+        println!("cargo:rustc-link-lib=mkl_sycl_rng");
+        println!("cargo:rustc-link-lib=mkl_sycl_stats");
+        println!("cargo:rustc-link-lib=mkl_sycl_data_fitting");
+        println!("cargo:rustc-link-lib=mkl_intel_ilp64");
+        println!("cargo:rustc-link-lib=mkl_intel_thread");
+        println!("cargo:rustc-link-lib=mkl_tbb_thread");
+        println!("cargo:rustc-link-lib=mkl_core");
+        println!("cargo:rustc-link-lib=iomp5");
+        println!("cargo:rustc-link-lib=sycl");
+        println!("cargo:rustc-link-lib=pthread");
+        println!("cargo:rustc-link-lib=m");
+        println!("cargo:rustc-link-lib=dl");
+        println!("cargo:rustc-link-lib=intlc");
+        println!("cargo:rustc-link-lib=imf");
+        //println!("cargo:rustc-link-lib=static=ggml_sycl");
+    }
+    if flag_sycl_fp16 {
+        cmake.configure_arg("-DLLAMA_SYCL_F16=ON");
+    }
+    if flag_sycl_nvidia {
+        cmake.configure_arg("-DLLAMA_SYCL_TARGET=NVIDIA");
+    }
     cmake.very_verbose(true);
 
     let dst = cmake.build();
 
     println!("cargo:rustc-link-search=native={}/lib", dst.display());
48 changes: 44 additions & 4 deletions rllm/rllm-cuda/server.sh
@@ -41,6 +41,42 @@ while [ "$1" != "" ] ; do
             exit 1
         fi
         ;;
+    --sycl )
+        if [ "$CPP" = 1 ] ; then
+            VER="$VER --features sycl"
+            ADD_ARGS="--gpu-layers 1000"
+        else
+            echo "--sycl only valid for llama.cpp"
+            exit 1
+        fi
+        ;;
+    --sycl-fp16 )
+        if [ "$CPP" = 1 ] ; then
+            VER="$VER --features sycl,sycl_fp16"
+            ADD_ARGS="--gpu-layers 1000"
+        else
+            echo "--sycl-fp16 only valid for llama.cpp"
+            exit 1
+        fi
+        ;;
+    --sycl-nvidia )
+        if [ "$CPP" = 1 ] ; then
+            VER="$VER --features sycl,sycl_nvidia"
+            ADD_ARGS="--gpu-layers 1000"
+        else
+            echo "--sycl-nvidia only valid for llama.cpp"
+            exit 1
+        fi
+        ;;
+    --sycl-nvidia-fp16 )
+        if [ "$CPP" = 1 ] ; then
+            VER="$VER --features sycl,sycl_nvidia,sycl_fp16"
+            ADD_ARGS="--gpu-layers 1000"
+        else
+            echo "--sycl-nvidia-fp16 only valid for llama.cpp"
+            exit 1
+        fi
+        ;;
     --trace )
         R_LOG=info,tokenizers=error,rllm=trace,aicirt=info,llama_cpp_low=trace
         ;;
@@ -84,7 +120,7 @@ if [ "$CPP" = 1 ] ; then
     * )
         SELF="server.sh"
         cat <<EOF
-usage: $SELF [--loop] [--cuda] [--debug] [model_name] [rllm_args...]
+usage: $SELF [--loop] [--cuda] [--sycl] [--sycl-fp16] [--sycl-nvidia] [--sycl-nvidia-fp16] [--debug] [model_name] [rllm_args...]
 
 model_name can be a HuggingFace URL pointing to a .gguf file, or one of the following:
@@ -96,9 +132,13 @@ model_name can be a HuggingFace URL pointing to a .gguf file, or one of the following:
 
 Additionally, "$SELF build" will just build the server, and not run a model.
 
-  --cuda    try to build llama.cpp against installed CUDA
-  --loop    restart server when it crashes and store logs in ./logs
-  --debug   don't build in --release mode
+  --cuda              try to build llama.cpp against installed CUDA
+  --sycl              try to build llama.cpp against SYCL with fp32 support (make sure the required SYCL environment variables are set)
+  --sycl-fp16         try to build llama.cpp against SYCL with fp16 support
+  --sycl-nvidia       try to build llama.cpp against SYCL with NVIDIA support
+  --sycl-nvidia-fp16  try to build llama.cpp against SYCL with fp16 and NVIDIA support
+  --loop              restart server when it crashes and store logs in ./logs
+  --debug             don't build in --release mode
 
 Try $SELF phi2 --help to see available rllm_args
 EOF
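A usage sketch for the new switches (phi2 is one of the model aliases the help text mentions; the script has to be running in its llama.cpp mode, i.e. the CPP=1 path checked above):

    # SYCL build with fp32 kernels, e.g. for an Intel GPU
    ./server.sh --sycl phi2
    # SYCL build targeting an NVIDIA GPU with fp16 kernels
    ./server.sh --sycl-nvidia-fp16 phi2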
3 changes: 3 additions & 0 deletions rllm/rllm-llamacpp/Cargo.toml
@@ -21,3 +21,6 @@ path = "src/rllm-llamacpp.rs"
 [features]
 default = []
 cuda = ["llama_cpp_low/cuda"]
+sycl = ["llama_cpp_low/sycl"]
+sycl_fp16 = ["llama_cpp_low/sycl_fp16"]
+sycl_nvidia = ["llama_cpp_low/sycl_nvidia"]
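Because these entries simply forward to the corresponding llama_cpp_low features, the server crate can also be built with cargo directly instead of going through server.sh; a sketch, assuming the package name matches the rllm-llamacpp directory:

    # build the rllm-llamacpp server binary with SYCL enabled
    cargo build --release -p rllm-llamacpp --features sycl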