Merge branch 'pytorch:main' into add_miniPile

pytorch · Feb 27, 2024 · d1bf5d6 · d1bf5d6
2 parents 65ee4de + 5a1689f
commit d1bf5d6
Show file tree

Hide file tree

Showing 6 changed files with 152 additions and 55 deletions.
diff --git a/test/test_job_config.py b/test/test_job_config.py
@@ -9,12 +9,12 @@ class TestJobConfig:
     def test_command_line_args(self):
         config = JobConfig()
         config.parse_args([])
-        assert config.model.name == "llama"
+        assert config.training.steps == -1
 
     def test_job_config_file(self):
         config = JobConfig()
         config.parse_args(["--job.config_file", "./train_configs/debug_model.toml"])
-        assert config.model.name == "llama"
+        assert config.training.steps == 10
 
     def test_job_file_does_not_exist(self):
         with pytest.raises(FileNotFoundError):

diff --git a/torchtrain/config_manager.py b/torchtrain/config_manager.py
@@ -1,3 +1,6 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # All rights reserved.
 import argparse
@@ -17,16 +20,16 @@ class JobConfig:
     Semantics:
     - Default config is loaded from a toml file. If no toml file is provided,
     then the default config is loaded from argparse defaults.
+    - Sections absent from the toml file fall back to argparse defaults (shallow merge).
     """
 
     def parse_args(self, args_list: list = sys.argv[1:]):
         args = JobConfig.init_args_from_command_line(args_list)
         config_file = getattr(args, "job.config_file", None)
-        if config_file is None:
-            args_dict = self._args_to_two_level_dict(args)
-        else:
+        args_dict = self._args_to_two_level_dict(args)
+        if config_file is not None:
             with open(config_file, "rb") as f:
-                args_dict = tomllib.load(f)
+                args_dict |= tomllib.load(f)
         for k, v in args_dict.items():
             class_type = type(k.title(), (), v)
             setattr(self, k, class_type())

diff --git a/torchtrain/datasets/alpaca.py b/torchtrain/datasets/alpaca.py
@@ -20,6 +20,7 @@ class AlpacaDataset(IterableDataset):
         seq_len (int): max sequence length
         world_size (int): number of data parallel processes participating in training
         rank (int): rank of the current data parallel process
+        infinite (bool): whether to loop infinitely over the dataset
 
     Data input format:
     {
@@ -43,38 +44,47 @@ def __init__(
         seq_len: int = 2048,
         world_size: int = 1,
         rank: int = 0,
+        infinite: bool = False,
         **kwargs
     ) -> None:
         # TODO: This is a temporary solution for small datasets like Alpaca.
         #       For larger datasets we need to use a more scalable approach.
         # Setting `streaming=True` works for large dataset, but the speed is slow.
         ds = load_dataset("tatsu-lab/alpaca", split="train")
-        self.data_iterator = iter(split_dataset_by_node(ds, rank, world_size))
+        self._data = split_dataset_by_node(ds, rank, world_size)
         self._tokenizer = tokenizer
         self.seq_len = seq_len
+        self.infinite = infinite
 
     def __iter__(self):
         max_buffer_token_len = 1 + self.seq_len
         all_tokens: List[int] = []
 
-        for sample in self.data_iterator:
-            sample_text = sample["text"]
-            sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True)
-            all_tokens.extend(sample_tokens)
+        while True:
+            for sample in iter(self._data):
+                sample_text = sample["text"]
+                sample_tokens = self._tokenizer.encode(sample_text, bos=True, eos=True)
+                all_tokens.extend(sample_tokens)
 
-            while len(all_tokens) >= max_buffer_token_len:
-                x = torch.LongTensor(all_tokens[:max_buffer_token_len])
-                # batched_x = x.reshape(self.batch_size, -1)
-                # update tokens to the remaining tokens
-                all_tokens = all_tokens[max_buffer_token_len:]
-                input = x[:-1]
-                label = x[1:]
-                yield input, label
+                while len(all_tokens) >= max_buffer_token_len:
+                    x = torch.LongTensor(all_tokens[:max_buffer_token_len])
+                    # update tokens to the remaining tokens
+                    all_tokens = all_tokens[max_buffer_token_len:]
+                    input = x[:-1]
+                    label = x[1:]
+                    yield input, label
+            if not self.infinite:
+                break
 
 
 def build_alpaca_data_loader(
-    tokenizer: TokenizerIf, batch_size: int, seq_len: int, world_size, rank
+    tokenizer: TokenizerIf,
+    batch_size: int,
+    seq_len: int,
+    world_size: int,
+    rank: int,
+    infinite: bool = True,
 ):
-    alpaca_ds = AlpacaDataset(tokenizer, seq_len, world_size, rank)
+    alpaca_ds = AlpacaDataset(tokenizer, seq_len, world_size, rank, infinite)
 
     return DataLoader(alpaca_ds, batch_size=batch_size)
diff --git a/torchtrain/metrics.py b/torchtrain/metrics.py
@@ -16,19 +16,20 @@
 
 from torchtrain.logging_utils import rank0_log
 
-_gb_in_bytes = 1024 * 1024 * 1024
-_mb_in_bytes = 1024 * 1024
+# note that GiB (gibibyte) is 1024**3 bytes, whereas GB (gigabyte) is 1000**3 bytes
+_gib_in_bytes = 1024 * 1024 * 1024
+_mib_in_bytes = 1024 * 1024
 
 
-def format_to_gb(item, precision=4):
-    """quick function to format numbers to gigabyte and round to (default) 4 digit precision"""
-    metric_num = item / _gb_in_bytes
+def _format_to_gib(item, precision=4):
+    """Convert a byte count to gibibytes (GiB), rounded to `precision` digits (default 4)."""
+    metric_num = item / _gib_in_bytes
     metric_num = round(metric_num, ndigits=precision)
     return metric_num
 
 
-def convert_to_gpu_pct(value, total_gpu_memory):
-    return round(100 * (value / total_gpu_memory), 2)
+def _convert_to_gpu_pct(value, total_gpu_memory, precision=4):
+    return round(100 * (value / total_gpu_memory), precision)
 
 
 # named tuple for passing memory stats (as % of device capacity) for Tensorboard logging
@@ -58,7 +59,7 @@ def __init__(self, device: str = "cuda:0"):
         self.device_capacity = torch.cuda.get_device_properties(
             self.device
         ).total_memory
-        self.device_capacity_gb = format_to_gb(self.device_capacity)
+        self.device_capacity_gib = _format_to_gib(self.device_capacity)
         self.num_retries = 0
         self.num_ooms = 0
         self.peak_active_memory = 0
@@ -67,17 +68,17 @@ def __init__(self, device: str = "cuda:0"):
         self.curr_reserved_memory = 0
 
         self.device_reserved_memory_usage = 0
-        self.device_reserved_memory_gb = 0
+        self.device_reserved_memory_gib = 0
         self.device_reserved_memory_pct = 0
 
         self.device_active_memory_usage = 0
-        self.device_active_memory_gb = 0
+        self.device_active_memory_gib = 0
         self.device_active_memory_pct = 0
 
         # current stats
         self.device_alloc_memory_usage = torch.cuda.memory_allocated(self.device)
-        self.device_alloc_memory_gb = format_to_gb(self.device_alloc_memory_usage)
-        self.device_alloc_memory_pct = convert_to_gpu_pct(
+        self.device_alloc_memory_gib = _format_to_gib(self.device_alloc_memory_usage)
+        self.device_alloc_memory_pct = _convert_to_gpu_pct(
             self.device_alloc_memory_usage, self.device_capacity
         )
 
@@ -90,10 +91,8 @@ def get_pct_memory(self, memory_num):
         pct_memory = round(100 * (pct_memory), 2)
         return pct_memory
 
-    def get_gb_memory(self, memory_num):
-        gb_memory = memory_num / _gb_in_bytes
-        gb_memory = round(gb_memory, 2)
-        return gb_memory
+    def get_gib_memory(self, memory_num):
+        return _format_to_gib(memory_num, precision=2)
 
     def get_current_stats(self, return_data: bool = False):
         """
@@ -104,21 +103,23 @@ def get_current_stats(self, return_data: bool = False):
         curr_mem = torch.cuda.memory_stats(self.device)
 
         self.device_alloc_memory_usage = curr_mem["allocated_bytes.all.current"]
-        self.device_alloc_memory_gb = format_to_gb(self.device_alloc_memory_usage)
-        self.device_alloc_memory_pct = convert_to_gpu_pct(
+        self.device_alloc_memory_gib = _format_to_gib(self.device_alloc_memory_usage)
+        self.device_alloc_memory_pct = _convert_to_gpu_pct(
             self.device_alloc_memory_usage, self.device_capacity
         )
 
         self.device_reserved_memory_usage = curr_mem["reserved_bytes.all.current"]
-        self.device_reserved_memory_gb = format_to_gb(self.device_reserved_memory_usage)
-        self.device_reserved_memory_pct = convert_to_gpu_pct(
+        self.device_reserved_memory_gib = _format_to_gib(
+            self.device_reserved_memory_usage
+        )
+        self.device_reserved_memory_pct = _convert_to_gpu_pct(
             self.device_reserved_memory_usage, self.device_capacity
         )
 
         self.device_active_memory_usage = curr_mem["active_bytes.all.current"]
-        self.device_active_memory_gb = format_to_gb(self.device_active_memory_usage)
-        self.device_active_memory_pct = convert_to_gpu_pct(
-            self.device_active_memory_usage, self.device_capacity
+        self.device_active_memory_gib = _format_to_gib(self.device_active_memory_usage)
+        self.device_active_memory_pct = _convert_to_gpu_pct(
+            self.device_active_memory_usage, self.device_capacity, precision=2
         )
 
         display_str = ""
@@ -179,8 +180,8 @@ def reset_peak_stats(self):
 
     def __str__(self):
         _ = self.get_current_stats()
-        display_str = f"{self.device_name} ({self.device_index}): {self.device_capacity_gb} GB capacity, "
-        display_str += f"{self.device_alloc_memory_gb} GB in-use, {self.device_alloc_memory_pct}% in-use"
+        display_str = f"{self.device_name} ({self.device_index}): {self.device_capacity_gib} GiB capacity, "
+        display_str += f"{self.device_alloc_memory_gib} GiB in-use, {self.device_alloc_memory_pct}% in-use"
         return f"{display_str}"
 
 

diff --git a/torchtrain/utils.py b/torchtrain/utils.py
@@ -1,6 +1,7 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
+from dataclasses import dataclass
 from typing import Union
 
 import torch
@@ -17,3 +18,37 @@ def dist_max(x: Union[int, float], mesh: DeviceMesh) -> float:
 def dist_mean(x: Union[int, float], mesh: DeviceMesh) -> float:
     tensor = torch.tensor(x).cuda()
     return funcol.all_reduce(tensor, reduceOp=c10d.ReduceOp.AVG.name, group=mesh)
+
+
+@dataclass  # attributes are annotated so the decorator actually registers them as fields
+class Color:  # ANSI SGR escape sequences that select the terminal foreground color
+    black: str = "\033[30m"
+    red: str = "\033[31m"
+    green: str = "\033[32m"
+    yellow: str = "\033[33m"
+    blue: str = "\033[34m"
+    magenta: str = "\033[35m"
+    cyan: str = "\033[36m"
+    white: str = "\033[37m"
+    reset: str = "\033[39m"  # SGR 39: back to the default foreground color
+
+@dataclass  # attributes are annotated so the decorator actually registers them as fields
+class Background:  # ANSI SGR escape sequences that select the terminal background color
+    black: str = "\033[40m"
+    red: str = "\033[41m"
+    green: str = "\033[42m"
+    yellow: str = "\033[43m"
+    blue: str = "\033[44m"
+    magenta: str = "\033[45m"
+    cyan: str = "\033[46m"
+    white: str = "\033[47m"
+    reset: str = "\033[49m"  # SGR 49: back to the default background color
+
+
+@dataclass  # attributes are annotated so the decorator actually registers them as fields
+class Style:  # ANSI SGR escape sequences that control text intensity
+    bright: str = "\033[1m"
+    dim: str = "\033[2m"
+    normal: str = "\033[22m"  # SGR 22: cancels both bright and dim
+    reset: str = "\033[0m"  # SGR 0: clears every attribute, colors included