From 6d9e4e6e65e4878c2772349e35abd70c9e8742a2 Mon Sep 17 00:00:00 2001
From: Less Wright
Date: Tue, 27 Feb 2024 10:41:40 -0800
Subject: [PATCH] Add color to console output if local logging, auto avoid
 color logging on slurm (#93)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds the ability to produce colored console output in order to
highlight the training data outputs. It also adds a check to skip this
color formatting on slurm, where the raw escape codes would otherwise be
written to the log as literal text (e.g. `33=` fragments) instead of
rendering as color.

Note that I've just added some color to highlight the main training data.
Users who fork/clone can use it to enhance their outputs as desired.

[Screenshot 2024-02-26 at 10 20 15 PM: colored training output in a local terminal]

Note that on slurm it remains plain:

[Screenshot 2024-02-26 at 10 46 24 PM: plain training output under slurm]

If you don't check for slurm, the output would instead look like this
(this does not happen with this PR; shown only to illustrate the unguarded
case, and credit to Yifu for noting this would be an issue):

[Screenshot 2024-02-26 at 10 39 23 PM: raw escape codes leaking into the slurm log]
---
 torchtrain/utils.py | 35 +++++++++++++++++++++++++++++++++++
 train.py            | 39 +++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/torchtrain/utils.py b/torchtrain/utils.py
index 9ae71cae..823e8843 100644
--- a/torchtrain/utils.py
+++ b/torchtrain/utils.py
@@ -1,6 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
+from dataclasses import dataclass
 from typing import Union
 
 import torch
@@ -17,3 +18,37 @@ def dist_max(x: Union[int, float], mesh: DeviceMesh) -> float:
 def dist_mean(x: Union[int, float], mesh: DeviceMesh) -> float:
     tensor = torch.tensor(x).cuda()
     return funcol.all_reduce(tensor, reduceOp=c10d.ReduceOp.AVG.name, group=mesh)
+
+
+@dataclass
+class Color:
+    black = "\033[30m"
+    red = "\033[31m"
+    green = "\033[32m"
+    yellow = "\033[33m"
+    blue = "\033[34m"
+    magenta = "\033[35m"
+    cyan = "\033[36m"
+    white = "\033[37m"
+    reset = "\033[39m"
+
+
+@dataclass
+class Background:
+    black = "\033[40m"
+    red = "\033[41m"
+    green = "\033[42m"
+    yellow = "\033[43m"
+    blue = "\033[44m"
+    magenta = "\033[45m"
+    cyan = "\033[46m"
+    white = "\033[47m"
+    reset = "\033[49m"
+
+
+@dataclass
+class Style:
+    bright = "\033[1m"
+    dim = "\033[2m"
+    normal = "\033[22m"
+    reset = "\033[0m"
diff --git a/train.py b/train.py
index 5ce5de37..95d42226 100644
--- a/train.py
+++ b/train.py
@@ -2,6 +2,7 @@
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
 import os
+
 from dataclasses import dataclass, field
 from timeit import default_timer as timer
 from typing import Any, Dict, List
@@ -27,7 +28,11 @@
 from torchtrain.parallelisms import models_parallelize_fns, ParallelDims
 from torchtrain.profiling import maybe_run_profiler
-from torchtrain.utils import dist_max, dist_mean
+from torchtrain.utils import Color, dist_max, dist_mean
+
+_is_local_logging = True
+if "SLURM_JOB_ID" in os.environ:
+    _is_local_logging = False
 
 
 @dataclass
@@ -119,9 +124,16 @@ def main(job_config: JobConfig):
 
     # log model size
     model_param_count = get_num_params(model)
-    rank0_log(
-        f"Model {model_name} {job_config.model.flavor} size: {model_param_count:,} total parameters"
-    )
+    if _is_local_logging:
+        rank0_log(
+            f"{Color.blue}Model {model_name} {job_config.model.flavor} {Color.red}size: {model_param_count:,}"
+            f" total parameters{Color.reset}"
+        )
+    else:
+        rank0_log(
+            f"{model_name} {job_config.model.flavor} size: {model_param_count:,} total parameters"
+        )
+
     gpu_metrics = GPUMemoryMonitor("cuda")
     rank0_log(f"GPU memory usage: {gpu_metrics}")
@@ -268,10 +280,21 @@ def main(job_config: JobConfig):
             nwords_since_last_log = 0
             time_last_log = timer()
-            rank0_log(
-                f"step: {train_state.step:>2} loss: {round(train_state.current_loss,4):>7}"
-                f" iter: {curr_iter_time:>7} data: {data_load_time:>5} lr: {round(float(scheduler.get_last_lr()[0]), 8):<6}"
-            )
+            if _is_local_logging:
+                rank0_log(
+                    f"{Color.cyan}step: {train_state.step:>2} {Color.green}loss: {round(train_state.current_loss,4):>7}"
+                    f" {Color.reset}iter: {Color.blue}{curr_iter_time:>7}{Color.reset}"
+                    f" data: {Color.blue}{data_load_time:>5} {Color.reset}"
+                    f"lr: {Color.yellow}{round(float(scheduler.get_last_lr()[0]), 8):<6}{Color.reset}"
+                )
+            else:
+                rank0_log(
+                    f"step: {train_state.step:>2} loss: {round(train_state.current_loss,4):>7}"
+                    f" iter: {curr_iter_time:>7}"
+                    f" data: {data_load_time:>5} "
+                    f"lr: {round(float(scheduler.get_last_lr()[0]), 8):<6}"
+                )
+
         scheduler.step()
 
         checkpoint.save(
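
---

For readers who want to try the pattern outside torchtrain, here is a minimal, self-contained sketch of the same idea: ANSI escape codes for color, disabled when the output is not a local interactive terminal. This is an illustration, not code from the repo; the names `AnsiColor`, `NoColor`, and `pick_palette` are invented here, and the `isatty()` guard is an extra assumption beyond this patch, which only checks for `SLURM_JOB_ID`.

```python
import os
import sys
from dataclasses import dataclass


@dataclass(frozen=True)
class AnsiColor:
    # Standard ANSI foreground escape codes, as in torchtrain.utils.Color.
    cyan: str = "\033[36m"
    green: str = "\033[32m"
    yellow: str = "\033[33m"
    reset: str = "\033[39m"  # resets the foreground color only


@dataclass(frozen=True)
class NoColor:
    # Same attribute names, all empty strings: call sites need no branching.
    cyan: str = ""
    green: str = ""
    yellow: str = ""
    reset: str = ""


def pick_palette():
    # Mirrors the patch's heuristic: SLURM_JOB_ID means output goes to a
    # slurm log file, where raw escape codes would appear as literal text.
    # The isatty() check (not in the patch) also covers piped/redirected output.
    if "SLURM_JOB_ID" in os.environ or not sys.stdout.isatty():
        return NoColor()
    return AnsiColor()


color = pick_palette()
step, loss = 10, 2.4613
print(f"{color.cyan}step: {step:>2} {color.green}loss: {loss:>7}{color.reset}")
```

Swapping in an all-empty palette keeps every call site free of `if _is_local_logging:` branches; the patch above instead uses an explicit if/else at each log site, which lets the plain-text format be tuned independently of the colored one.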