diff --git a/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb b/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb
index 136ce14e..4aa1f45c 100644
--- a/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb
+++ b/redhat/ocp4/4.16/files/ray.notebook/2.5_run.llama.factory.ipynb
@@ -10,31 +10,64 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 17,
    "id": "b55bc3ea-4ce3-49bf-bb1f-e209de8ca47a",
    "metadata": {},
    "outputs": [],
    "source": [
     "# Import pieces from codeflare-sdk\n",
-    "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication\n",
-    "\n",
-    "# import os\n",
-    "\n",
-    "# %env RAY_TLS_VERIFY=0"
+    "from codeflare_sdk import Cluster, ClusterConfiguration, TokenAuthentication"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 18,
+   "id": "abe0a3d8-1c53-4714-9c48-19a9158f4bd0",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0.23.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import codeflare_sdk\n",
+    "print(codeflare_sdk.__version__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
    "id": "614daa0c",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Insecure request warnings have been disabled\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'Logged into https://api.demo-01-rhsys.wzhlab.top:6443'"
+      ]
+     },
+     "execution_count": 19,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
     "# Create authentication object for user permissions\n",
     "# IF unused, SDK will automatically check for default kubeconfig, then in-cluster config\n",
     "# KubeConfigFileAuthentication can also be used to specify kubeconfig path manually\n",
     "auth = TokenAuthentication(\n",
-    "    token = \"sha256~DVnZ1tbr11kPu9ltluH0M_Xa3O_6dEKFZScmvHuio1Y\",\n",
+    "    token = \"sha256~oMdcvXQ6WV5ZxlRwCniQXn9_VzyIGbrs05b8ccrDvbQ\",\n",
     "    server = \"https://api.demo-01-rhsys.wzhlab.top:6443\",\n",
     "    skip_tls= True\n",
     ")\n",
@@ -58,10 +91,47 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 20,
    "id": "0f4bc870-091f-4e11-9642-cba145710159",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: TLS verification has been disabled - Endpoint checks will be bypassed\n",
+      "Yaml resources loaded for llama-factory-test\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "fde461846e9d47ef9b85453c043506d8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "VBox(children=(HBox(children=(Button(description='Cluster Up', icon='play', style=ButtonStyle(), tooltip='Crea…"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "46990e1ca6c44602a3fe011552af88f5",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Output()"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "# Create and configure our cluster object\n",
     "# The SDK will try to find the name of your default local queue based on the annotation \"kueue.x-k8s.io/default-queue\": \"true\" unless you specify the local queue manually below\n",
@@ -75,8 +145,8 @@
     "    head_extended_resource_requests={'nvidia.com/gpu':0}, # For GPU enabled workloads set the head_extended_resource_requests and worker_extended_resource_requests\n",
     "    worker_extended_resource_requests={'nvidia.com/gpu':0},\n",
     "    num_workers=2,\n",
-    "    worker_cpu_requests='250m',\n",
-    "    worker_cpu_limits=1,\n",
+    "    worker_cpu_requests='2',\n",
+    "    worker_cpu_limits=8,\n",
     "    worker_memory_requests=4,\n",
     "    worker_memory_limits=6,\n",
     "    image=\"quay.io/wangzheng422/qimgs:llama-factory-ray-20250106-v06\", # Optional Field \n",
@@ -88,10 +158,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 21,
    "id": "f0884bbc-c224-4ca0-98a0-02dfa09c2200",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Ray Cluster: 'llama-factory-test' has successfully been created\n",
+      "Waiting for requested resources to be set up...\n",
+      "Requested cluster is up and running!\n",
+      "Dashboard is ready!\n"
+     ]
+    }
+   ],
    "source": [
     "# Bring up the cluster\n",
     "cluster.up()\n",
@@ -100,10 +181,68 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 22,
    "id": "df71c1ed",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "                           🚀 CodeFlare Cluster Details 🚀                           \n",
+       "                                                                                     \n",
+       " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n",
+       " โ”‚   Name                                                                          โ”‚ \n",
+       " โ”‚   llama-factory-test                                                Active โœ…   โ”‚ \n",
+       " โ”‚                                                                                 โ”‚ \n",
+       " โ”‚   URI: ray://llama-factory-test-head-svc.rhods-notebooks.svc:10001              โ”‚ \n",
+       " โ”‚                                                                                 โ”‚ \n",
+       " โ”‚   Dashboard๐Ÿ”—                                                                   โ”‚ \n",
+       " โ”‚                                                                                 โ”‚ \n",
+       " โ”‚                       Cluster Resources                                         โ”‚ \n",
+       " โ”‚   โ•ญโ”€โ”€ Workers โ”€โ”€โ•ฎ  โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ                     โ”‚ \n",
+       " โ”‚   โ”‚  # Workers  โ”‚  โ”‚  Memory      CPU         GPU         โ”‚                     โ”‚ \n",
+       " โ”‚   โ”‚             โ”‚  โ”‚                                      โ”‚                     โ”‚ \n",
+       " โ”‚   โ”‚  2          โ”‚  โ”‚  4G~6G       2~8         0           โ”‚                     โ”‚ \n",
+       " โ”‚   โ”‚             โ”‚  โ”‚                                      โ”‚                     โ”‚ \n",
+       " โ”‚   โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ  โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ                     โ”‚ \n",
+       " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n",
+       "
\n" + ], + "text/plain": [ + "\u001b[3m \u001b[0m\u001b[1;3m ๐Ÿš€ CodeFlare Cluster Details ๐Ÿš€\u001b[0m\u001b[3m \u001b[0m\n", + "\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\n", + " โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ \n", + " โ”‚ \u001b[1;37;42mName\u001b[0m โ”‚ \n", + " โ”‚ \u001b[1;4mllama-factory-test\u001b[0m Active โœ… โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[1mURI:\u001b[0m ray://llama-factory-test-head-svc.rhods-notebooks.svc:10001 โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b]8;id=948038;https://ray-dashboard-llama-factory-test-rhods-notebooks.apps.demo-01-rhsys.wzhlab.top\u001b\\\u001b[4;34mDashboard๐Ÿ”—\u001b[0m\u001b]8;;\u001b\\ โ”‚ \n", + " โ”‚ โ”‚ \n", + " โ”‚ \u001b[3m Cluster Resources \u001b[0m โ”‚ \n", + " โ”‚ โ•ญโ”€โ”€ Workers โ”€โ”€โ•ฎ โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ Worker specs(each) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ โ”‚ \n", + " โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1m# Workers\u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \u001b[1m \u001b[0m\u001b[1mMemory \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mCPU \u001b[0m\u001b[1m \u001b[0m\u001b[1m \u001b[0m\u001b[1mGPU \u001b[0m\u001b[1m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m2 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m4G~6G \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m2~8 \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m0 \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ”‚ \u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[36m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m\u001b[35m \u001b[0m โ”‚ โ”‚ \n", + " โ”‚ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ โ”‚ \n", + " โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ \n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "RayCluster(name='llama-factory-test', status=, head_cpu_requests=1, head_cpu_limits=1, head_mem_requests='6G', head_mem_limits='6G', num_workers=2, worker_mem_requests='4G', worker_mem_limits='6G', worker_cpu_requests='2', worker_cpu_limits=8, namespace='rhods-notebooks', dashboard='https://ray-dashboard-llama-factory-test-rhods-notebooks.apps.demo-01-rhsys.wzhlab.top', worker_extended_resources={'nvidia.com/gpu': 0}, head_extended_resources={'nvidia.com/gpu': 0})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "cluster.details()" ] @@ -120,10 +259,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "id": "c1719bca", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "https://ray-dashboard-llama-factory-test-rhods-notebooks.apps.demo-01-rhsys.wzhlab.top\n", + "ray://llama-factory-test-head-svc.rhods-notebooks.svc:10001\n" + ] + } + ], "source": [ "ray_dashboard_uri = cluster.cluster_dashboard_uri()\n", "ray_cluster_uri = cluster.cluster_uri()\n", @@ -141,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "id": "c9436436", "metadata": {}, "outputs": [], @@ -154,18 +302,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "id": "300146dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2025-01-07 16:29:14,056\tINFO client_builder.py:244 -- Passing the following kwargs to ray.init() on the server: ignore_reinit_error\n", + "SIGTERM handler is not set because current thread is not the main thread.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ray cluster is up and running: True\n" + ] + } + ], "source": [ "# before proceeding make sure the cluster exists and the uri is not empty\n", "assert ray_cluster_uri, \"Ray cluster needs to be started and set before proceeding\"\n", "\n", "import ray\n", - "import os\n", - "\n", - "%env RAY_TLS_VERIFY=0\n", "\n", "# reset the ray context in case there's already one. \n", "ray.shutdown()\n", @@ -176,7 +337,7 @@ "runtime_env = {}\n", "# NOTE: This will work for in-cluster notebook servers (RHODS/ODH), but not for local machines\n", "# To see how to connect from your laptop, go to demo-notebooks/additional-demos/local_interactive.ipynb\n", - "ray.init(address=ray_cluster_uri, runtime_env=runtime_env, ignore_reinit_error=True, _ssl_verify=False)\n", + "ray.init(address=ray_cluster_uri, runtime_env=runtime_env, ignore_reinit_error=True)\n", "\n", "print(\"Ray cluster is up and running: \", ray.is_initialized())" ] @@ -191,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "1b36e0d9", "metadata": {}, "outputs": [], @@ -218,11 +379,7 @@ "\n", " def execute_command(self, ip_address, nnodes, node_rank):\n", " command = f'source /opt/py_env/bin/activate; cd /app; FORCE_TORCHRUN=1 NNODES={nnodes} NODE_RANK={node_rank} MASTER_ADDR={ip_address} MASTER_PORT=29500 NPROC_PER_NODE=1 llamafactory-cli train wzh/tinyllama_lora_sft.yaml'\n", - " try:\n", - " os.system(command)\n", - " return \"Command executed successfully\"\n", - " except Exception as e:\n", - " return f\"Error executing command: {e}\"\n", + " return self._run_command_in_host_env(command)\n", "\n", " def _run_command_in_host_env(self, command):\n", " try:\n", @@ -246,10 +403,86 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "id": "5901d958", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Actor 1 IP: 10.132.0.152\n", + "Actor 2 IP: 10.132.0.153\n", + "Actor 1 command result: [2025-01-07 16:29:35,112] [WARNING] [real_accelerator.py:162:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.\n", + "[2025-01-07 16:29:35,124] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)\n", + "[INFO|2025-01-07 16:29:39] llamafactory.cli:157 >> Initializing distributed tasks at: 10.132.0.152:29500\n", + "[2025-01-07 16:29:50,342] [WARNING] [real_accelerator.py:162:get_accelerator] Setting accelerator to CPU. 
If you have GPU or other accelerator, we were unable to detect it.\n", + "[2025-01-07 16:29:50,354] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)\n", + "[INFO|2025-01-07 16:29:52] llamafactory.hparams.parser:355 >> Process rank: 0, device: cpu:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16\n", + "[INFO|2025-01-07 16:29:52] llamafactory.data.template:157 >> Add pad token: \n", + "[INFO|2025-01-07 16:29:52] llamafactory.data.loader:157 >> Loading dataset identity.json...\n", + "[INFO|2025-01-07 16:29:53] llamafactory.data.loader:157 >> Loading dataset alpaca_en_demo.json...\n", + "training example:\n", + "input_ids:\n", + "[1, 836, 1389, 2313, 31908, 23980, 836, 31873, 1389, 2313, 31908, 16644, 31905, 312, 705, 16717, 3227, 28035, 363, 7421, 8825, 3321, 417, 16717, 10935, 2338, 31843, 1035, 473, 312, 2803, 365, 31822, 31824, 16346, 31902, 2]\n", + "inputs:\n", + " [INST] hi [/INST] Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?\n", + "label_ids:\n", + "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 16644, 31905, 312, 705, 16717, 3227, 28035, 363, 7421, 8825, 3321, 417, 16717, 10935, 2338, 31843, 1035, 473, 312, 2803, 365, 31822, 31824, 16346, 31902, 2]\n", + "labels:\n", + "Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.model_utils.checkpointing:157 >> Gradient checkpointing enabled.\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.model_utils.attention:157 >> Using torch SDPA for faster training and inference.\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.adapter:157 >> Upcasting trainable params to float32.\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.adapter:157 >> Fine-tuning method: LoRA\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.model_utils.misc:157 >> Found linear modules: down_proj,v_proj,k_proj,gate_proj,q_proj,up_proj,o_proj\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.loader:157 >> trainable params: 94,208 || all params: 4,715,584 || trainable%: 1.9978\n", + "{'train_runtime': 11.9649, 'train_samples_per_second': 4.513, 'train_steps_per_second': 0.251, 'train_loss': 12.782352447509766, 'epoch': 2.67}\n", + "***** train metrics *****\n", + " epoch = 2.6667\n", + " total_flos = 93GF\n", + " train_loss = 12.7824\n", + " train_runtime = 0:00:11.96\n", + " train_samples_per_second = 4.513\n", + " train_steps_per_second = 0.251\n", + "[WARNING|2025-01-07 16:30:14] llamafactory.extras.ploting:162 >> No metric loss to plot.\n", + "[WARNING|2025-01-07 16:30:14] llamafactory.extras.ploting:162 >> No metric eval_loss to plot.\n", + "[WARNING|2025-01-07 16:30:14] llamafactory.extras.ploting:162 >> No metric eval_accuracy to plot.\n", + "***** eval metrics *****\n", + " epoch = 2.6667\n", + " eval_loss = 7.5598\n", + " eval_runtime = 0:00:00.17\n", + " eval_samples_per_second = 11.622\n", + " eval_steps_per_second = 5.811\n", + "\n", + "Actor 2 command result: [2025-01-07 16:29:34,217] [WARNING] [real_accelerator.py:162:get_accelerator] Setting accelerator to CPU. 
If you have GPU or other accelerator, we were unable to detect it.\n", + "[2025-01-07 16:29:34,228] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)\n", + "[INFO|2025-01-07 16:29:37] llamafactory.cli:157 >> Initializing distributed tasks at: 10.132.0.152:29500\n", + "[2025-01-07 16:29:48,299] [WARNING] [real_accelerator.py:162:get_accelerator] Setting accelerator to CPU. If you have GPU or other accelerator, we were unable to detect it.\n", + "[2025-01-07 16:29:48,310] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cpu (auto detect)\n", + "[INFO|2025-01-07 16:29:52] llamafactory.hparams.parser:355 >> Process rank: 0, device: cpu:0, n_gpu: 1, distributed training: True, compute dtype: torch.bfloat16\n", + "[INFO|2025-01-07 16:29:52] llamafactory.data.template:157 >> Add pad token: \n", + "[INFO|2025-01-07 16:29:52] llamafactory.data.loader:157 >> Loading dataset identity.json...\n", + "[INFO|2025-01-07 16:29:52] llamafactory.data.loader:157 >> Loading dataset alpaca_en_demo.json...\n", + "training example:\n", + "input_ids:\n", + "[1, 836, 1389, 2313, 31908, 23980, 836, 31873, 1389, 2313, 31908, 16644, 31905, 312, 705, 16717, 3227, 28035, 363, 7421, 8825, 3321, 417, 16717, 10935, 2338, 31843, 1035, 473, 312, 2803, 365, 31822, 31824, 16346, 31902, 2]\n", + "inputs:\n", + " [INST] hi [/INST] Hello! I am {{name}}, an AI assistant developed by {{author}}. How can I assist you today?\n", + "label_ids:\n", + "[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 16644, 31905, 312, 705, 16717, 3227, 28035, 363, 7421, 8825, 3321, 417, 16717, 10935, 2338, 31843, 1035, 473, 312, 2803, 365, 31822, 31824, 16346, 31902, 2]\n", + "labels:\n", + "Hello! I am {{name}}, an AI assistant developed by {{author}}. 
How can I assist you today?\n", + "[INFO|2025-01-07 16:30:00] llamafactory.model.model_utils.checkpointing:157 >> Gradient checkpointing enabled.\n", + "[INFO|2025-01-07 16:30:00] llamafactory.model.model_utils.attention:157 >> Using torch SDPA for faster training and inference.\n", + "[INFO|2025-01-07 16:30:00] llamafactory.model.adapter:157 >> Upcasting trainable params to float32.\n", + "[INFO|2025-01-07 16:30:00] llamafactory.model.adapter:157 >> Fine-tuning method: LoRA\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.model_utils.misc:157 >> Found linear modules: o_proj,q_proj,v_proj,up_proj,down_proj,gate_proj,k_proj\n", + "[INFO|2025-01-07 16:30:01] llamafactory.model.loader:157 >> trainable params: 94,208 || all params: 4,715,584 || trainable%: 1.9978\n", + "\n" + ] + } + ], "source": [ "#call the above cell as a remote ray function\n", "actor1 = NetworkCommandActor.remote()\n", @@ -288,23 +521,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "5f36db0f-31f6-4373-9503-dc3c1c4c3f57", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ray Cluster: 'llama-factory-test' has successfully been deleted\n" + ] + } + ], "source": [ "cluster.down()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "0d41b90e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'Successfully logged out of https://api.demo-01-rhsys.wzhlab.top:6443'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "auth.logout()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1029b8b8-56b9-4caf-9396-396ec54f4877", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {