From 638d13a6d746179ecb001e91af8207c42c610c6a Mon Sep 17 00:00:00 2001 From: Wang Zheng Date: Wed, 8 Jan 2025 22:34:54 +0800 Subject: [PATCH] llama factory --- redhat/ocp4/4.16/2024.12.llama.factory.md | 41 ++++++++++++++++--- .../ray.notebook/2.5_run.llama.factory.ipynb | 6 ++- 2 files changed, 39 insertions(+), 8 deletions(-) diff --git a/redhat/ocp4/4.16/2024.12.llama.factory.md b/redhat/ocp4/4.16/2024.12.llama.factory.md index 45681945..e427260e 100644 --- a/redhat/ocp4/4.16/2024.12.llama.factory.md +++ b/redhat/ocp4/4.16/2024.12.llama.factory.md @@ -50,7 +50,7 @@ done mkdir -p /data/git cd /data/git -git clone -b wzh https://github.com/wangzheng422/LLaMA-Factory +git clone -b wzh-stable https://github.com/wangzheng422/LLaMA-Factory cd /data/git/LLaMA-Factory @@ -106,7 +106,7 @@ podman run --rm -it --pod llama-factory-pod \ quay.io/wangzheng422/qimgs:llama-factory-20241225-v01 \ /bin/bash -FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 OMPI_MCA_btl=tcp,self OMPI_MCA_btl_tcp_if_include=eth0 llamafactory-cli train wzh/tinyllama_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml @@ -121,7 +121,7 @@ quay.io/wangzheng422/qimgs:llama-factory-20241225-v01 \ /bin/bash -FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 OMPI_MCA_btl=tcp,self OMPI_MCA_btl_tcp_if_include=eth0 llamafactory-cli train wzh/tinyllama_lora_sft.yaml +FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml ``` @@ -139,13 +139,13 @@ But this will not stop us, our target is to run the multiple node task using `ra cd /data/git/LLaMA-Factory -podman build -t quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v05 -f wzh/ray.dockerfile . +podman build -t quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v07 -f wzh/ray.dockerfile . podman run --rm -it quay.io/wangzheng422/qimgs:llama-factory-ray-20250102-v02 /bin/bash -podman push quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v05 +podman push quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v07 ``` @@ -168,7 +168,6 @@ podman push quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v06 ```bash - # first instance podman run --rm -it --pod llama-factory-pod \ @@ -196,6 +195,36 @@ quay.io/wangzheng422/qimgs:llama-factory-ray-20241226-v01 \ FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml +``` + +## try to run in deepspeed + +```bash + +# first instance + +podman run --rm -it --pod llama-factory-pod \ +--network llama-factory-network \ +--ip 10.5.0.3 \ +quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v07 \ +/bin/bash + +FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=0 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft_dp.yaml + + + + +# 2nd instance + +podman run --rm -it \ +--pod llama-factory-pod \ +--network llama-factory-network \ +quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v07 \ +/bin/bash + + +FORCE_TORCHRUN=1 NNODES=2 NODE_RANK=1 MASTER_ADDR=10.5.0.3 MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft_dp.yaml + ``` diff --git a/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb b/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb index 4aa1f45c..e3064fa7 100644 --- a/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb +++ b/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb @@ -374,11 +374,13 @@ " except subprocess.CalledProcessError as e:\n", " return f\"Error getting IP address: {e}\"\n", " def execute_short_command(self, ip_address, nnodes, node_rank):\n", - " command = f'source /opt/py_env/bin/activate; cd /app; llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", + " # command = f'source /opt/py_env/bin/activate; cd /app; llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", + " command = f'llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", " return self._run_command_in_host_env(command)\n", "\n", " def execute_command(self, ip_address, nnodes, node_rank):\n", - " command = f'source /opt/py_env/bin/activate; cd /app; FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", + " # command = f'source /opt/py_env/bin/activate; cd /app; FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", + " command = f'FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", " return self._run_command_in_host_env(command)\n", "\n", " def _run_command_in_host_env(self, command):\n",