deepspeed.patch
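
This patch makes a few small adjustments to the DeepSpeed/Megatron-LM example trees:
backward_step only passes set_grads_to_None=True to zero_grad when running with fp16
(the FP16 optimizer wrapper accepts that keyword, while a plain torch optimizer does not),
the per-step see_memory_usage debug calls in the ZeRO-3 train_step are commented out,
the OpenMPI/Slurm rank derivation in Megatron-LM/arguments.py is disabled so RANK and
WORLD_SIZE come straight from the environment, the wikipedia corpus path points at a
pre-split JSON dump, and the bert-large-uncased vocabulary is read from a local file
instead of the S3 URL.

A minimal sketch of the zero_grad guard introduced by the first two hunks; the helper
name and the bare optimizer/args parameters are placeholders for illustration, not
Megatron API:

    def zero_optimizer_grads(optimizer, args):
        # Hypothetical helper mirroring the patched backward_step branch.
        if args.fp16:
            # FP16 optimizer wrapper: drop the grad tensors entirely between steps.
            optimizer.zero_grad(set_grads_to_None=True)
        else:
            # Plain torch optimizer: the keyword is not supported, so use the default.
            optimizer.zero_grad()
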
diff --git a/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py b/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py
index bd19ed3..0fc8311 100644
--- a/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py
+++ b/Megatron-LM-v1.1.5-3D_parallelism/megatron/training.py
@@ -278,7 +278,11 @@ def backward_step(optimizer, model, loss):
if args.deepspeed:
model.backward(loss)
else:
- optimizer.zero_grad(set_grads_to_None=True)
+# optimizer.zero_grad(set_grads_to_None=True)
+ if args.fp16:
+ optimizer.zero_grad(set_grads_to_None=True)
+ else:
+ optimizer.zero_grad()
if args.fp16:
optimizer.backward(loss, update_master_grads=False)
else:
diff --git a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py
index 0245cae..54b5b1f 100644
--- a/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py
+++ b/Megatron-LM-v1.1.5-ZeRO3/megatron/training.py
@@ -270,7 +270,11 @@ def backward_step(optimizer, model, loss):
if args.deepspeed:
model.backward(loss)
else:
- optimizer.zero_grad(set_grads_to_None=True)
+# optimizer.zero_grad(set_grads_to_None=True)
+ if args.fp16:
+ optimizer.zero_grad(set_grads_to_None=True)
+ else:
+ optimizer.zero_grad()
if args.fp16:
optimizer.backward(loss, update_master_grads=False)
else:
@@ -312,20 +316,20 @@ def train_step(forward_step_func, data_iterator,
args = get_args()
timers = get_timers()
- see_memory_usage(f'before forward {model.global_steps}', force=True)
+# see_memory_usage(f'before forward {model.global_steps}', force=True)
# Forward model for one step.
timers('forward').start()
loss, loss_reduced = forward_step_func(data_iterator, model)
timers('forward').stop()
- see_memory_usage(f'before backward {model.global_steps}', force=True)
+# see_memory_usage(f'before backward {model.global_steps}', force=True)
# Calculate gradients, reduce across processes, and clip.
timers('backward').start()
backward_step(optimizer, model, loss)
timers('backward').stop()
- see_memory_usage(f'before optimizer {model.global_steps}', force=True)
+# see_memory_usage(f'before optimizer {model.global_steps}', force=True)
# Update parameters.
skipped_iter = 0
timers('optimizer').start()
diff --git a/Megatron-LM/arguments.py b/Megatron-LM/arguments.py
index 67c726c..d4b3d1d 100755
--- a/Megatron-LM/arguments.py
+++ b/Megatron-LM/arguments.py
@@ -329,18 +329,18 @@ def get_args():
args.rank = int(os.getenv('RANK', '0'))
args.world_size = int(os.getenv("WORLD_SIZE", '1'))
- if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
- # We are using (OpenMPI) mpirun for launching distributed data parallel processes
- local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
- local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
-
- # Possibly running with Slurm
- num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
- nodeid = int(os.getenv('SLURM_NODEID', '0'))
-
- args.local_rank = local_rank
- args.rank = nodeid*local_size + local_rank
- args.world_size = num_nodes*local_size
+# if os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'):
+# # We are using (OpenMPI) mpirun for launching distributed data parallel processes
+# local_rank = int(os.getenv('OMPI_COMM_WORLD_LOCAL_RANK'))
+# local_size = int(os.getenv('OMPI_COMM_WORLD_LOCAL_SIZE'))
+
+# # Possibly running with Slurm
+# num_nodes = int(os.getenv('SLURM_JOB_NUM_NODES', '1'))
+# nodeid = int(os.getenv('SLURM_NODEID', '0'))
+
+# args.local_rank = local_rank
+# args.rank = nodeid*local_size + local_rank
+# args.world_size = num_nodes*local_size
args.model_parallel_size = min(args.model_parallel_size, args.world_size)
if args.rank == 0:
diff --git a/Megatron-LM/data_utils/corpora.py b/Megatron-LM/data_utils/corpora.py
index 49877ac..6cda080 100755
--- a/Megatron-LM/data_utils/corpora.py
+++ b/Megatron-LM/data_utils/corpora.py
@@ -22,7 +22,7 @@ class wikipedia(json_dataset):
command line usage: `--train-data wikipedia`
"""
- PATH = 'data/wikipedia/wikidump_lines.json'
+ PATH = 'data/wikipedia//wiki_AA_presplited.json'
assert_str = "make sure to set PATH for wikipedia data_utils/corpora.py"
def __init__(self, **kwargs):
assert os.path.exists(wikipedia.PATH), \
diff --git a/Megatron-LM/data_utils/wordpiece.py b/Megatron-LM/data_utils/wordpiece.py
index 81121e4..674a78d 100755
--- a/Megatron-LM/data_utils/wordpiece.py
+++ b/Megatron-LM/data_utils/wordpiece.py
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
PRETRAINED_VOCAB_ARCHIVE_MAP = {
'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
- 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+# 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+ 'bert-large-uncased': "bert-large-uncased-vocab.txt",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",