From 7f1646e6698901a74f263f466b09a0a57bf1bfa0 Mon Sep 17 00:00:00 2001 From: ewrfcas Date: Tue, 15 Oct 2019 23:35:08 +0800 Subject: [PATCH] update the finetune codes for tensorflow --- CJRC_finetune.py => CJRC_finetune_pytorch.py | 0 DRCD_finetune.py => DRCD_finetune_pytorch.py | 16 +- README.md | 23 +- ...inetune.py => cmrc2018_finetune_pytorch.py | 0 cmrc2018_finetune_tf.py | 256 +++++++ convert_tf_checkpoint_to_pytorch.py | 6 +- evaluate/CJRC_output.py | 3 - models/pytorch_modeling.py | 18 +- models/tf_modeling.py | 661 ++++++++++++++++++ optimizations/tf_optimization.py | 201 ++++++ utils.py | 23 +- 11 files changed, 1171 insertions(+), 36 deletions(-) rename CJRC_finetune.py => CJRC_finetune_pytorch.py (100%) rename DRCD_finetune.py => DRCD_finetune_pytorch.py (97%) rename cmrc2018_finetune.py => cmrc2018_finetune_pytorch.py (100%) create mode 100644 cmrc2018_finetune_tf.py create mode 100644 models/tf_modeling.py create mode 100644 optimizations/tf_optimization.py diff --git a/CJRC_finetune.py b/CJRC_finetune_pytorch.py similarity index 100% rename from CJRC_finetune.py rename to CJRC_finetune_pytorch.py diff --git a/DRCD_finetune.py b/DRCD_finetune_pytorch.py similarity index 97% rename from DRCD_finetune.py rename to DRCD_finetune_pytorch.py index b41c791..06c2c91 100644 --- a/DRCD_finetune.py +++ b/DRCD_finetune_pytorch.py @@ -81,15 +81,15 @@ def evaluate(model, args, eval_examples, eval_features, device, global_steps, be if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--gpu_ids', type=str, default='0,1,2,3') + parser.add_argument('--gpu_ids', type=str, default='0,1,2,3,4,5,6,7') # training parameter - parser.add_argument('--train_epochs', type=int, default=2) + parser.add_argument('--train_epochs', type=int, default=3) parser.add_argument('--n_batch', type=int, default=32) - parser.add_argument('--lr', type=float, default=3e-5) + parser.add_argument('--lr', type=float, default=2.5e-5) parser.add_argument('--dropout', type=float, default=0.1) parser.add_argument('--clip_norm', type=float, default=1.0) - parser.add_argument('--warmup_rate', type=float, default=0.05) + parser.add_argument('--warmup_rate', type=float, default=0.06) parser.add_argument("--schedule", default='warmup_linear', type=str, help='schedule') parser.add_argument("--weight_decay_rate", default=0.01, type=float, help='weight_decay_rate') parser.add_argument('--seed', type=list, default=[123, 456, 789, 556, 977]) @@ -112,13 +112,13 @@ def evaluate(model, args, eval_examples, eval_features, device, global_steps, be parser.add_argument('--dev_file', type=str, default='origin_data/DRCD/DRCD_dev.json') parser.add_argument('--bert_config_file', type=str, - default='check_points/pretrain_models/albert_large_zh/albert_config_large.json') + default='check_points/pretrain_models/albert_xlarge_zh/bert_config.json') parser.add_argument('--vocab_file', type=str, - default='check_points/pretrain_models/albert_large_zh/vocab.txt') + default='check_points/pretrain_models/albert_xlarge_zh/vocab.txt') parser.add_argument('--init_restore_dir', type=str, - default='check_points/pretrain_models/albert_large_zh/pytorch_albert_model.pth') + default='check_points/pretrain_models/albert_xlarge_zh/pytorch_model.pth') parser.add_argument('--checkpoint_dir', type=str, - default='check_points/DRCD/albert_large_zh/') + default='check_points/DRCD/albert_xlarge_zh/') parser.add_argument('--setting_file', type=str, default='setting.txt') parser.add_argument('--log_file', type=str, default='log.txt') diff 
--git a/README.md b/README.md index edde3d7..b5fd42e 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ ## BERT downstream task finetune list -The finetuning models adapted from the official code are PyTorch-only, because the TensorFlow FP16 and multi-GPU support would still have to be rewritten; anyone interested is welcome to contribute it. +The finetuning models adapted from the official code are now provided in both PyTorch and TensorFlow versions + +*** 2019-10-15: added TensorFlow (bert/roberta) finetuning code for cmrc2018 *** + +2019-10-14: added DRCD test results + +*** 2019-10-12: PyTorch now supports ALBERT *** ### Sources of the models and related code @@ -14,7 +20,7 @@ finetune基于官方代码改造的模型都是基于pytorch的,因为tensorfl 5. My own experimental siBert (https://github.com/ewrfcas/SiBert_tensorflow) -### About FP16 +### About FP16 in PyTorch FP16 training significantly reduces GPU memory pressure (and also improves speed if you have GPUs such as the V100). However, the apex FP16 built from the latest source does not handle multi-GPU parallelism well (https://github.com/NVIDIA/apex/issues/227). In practice, finetuning BERT-style tasks puts little numerical stress on FP16, so it is reasonable to trade some precision for efficiency, and I still prefer the older FusedAdam + FP16_Optimizer combination. @@ -23,6 +29,19 @@ FP16的训练可以显著降低显存压力(如果有V100等GPU资源还能提 pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" ./ ``` +### About blocksparse for TensorFlow + +blocksparse (https://github.com/openai/blocksparse) can be installed directly with pip under TensorFlow 1.13; otherwise, clone the repo and build it yourself. +Its fast_gelu and the softmax used in self-attention greatly reduce GPU memory pressure. I have also moved some of the dropout layers, which lowers overall memory usage by roughly 30%~40%. + +Memory usage with TensorFlow, FP16, max length 512: + +| model | length | batch | memory | +| ------ | ------ | ------ | ------ | +| roberta_base | 512 | 32 | 16GB | +| roberta_large | 512 | 12 | 16GB | + + ### Tasks 1. CMRC 2018: span-extraction reading comprehension (Simplified Chinese, dev set only) diff --git a/cmrc2018_finetune.py b/cmrc2018_finetune_pytorch.py similarity index 100% rename from cmrc2018_finetune.py rename to cmrc2018_finetune_pytorch.py diff --git a/cmrc2018_finetune_tf.py b/cmrc2018_finetune_tf.py new file mode 100644 index 0000000..1b5f6b2 --- /dev/null +++ b/cmrc2018_finetune_tf.py @@ -0,0 +1,256 @@ +import argparse +import numpy as np +import tensorflow as tf +import os +from models.tf_modeling import BertModelMRC, BertConfig +from optimizations.tf_optimization import Optimizer +import json +import utils +from evaluate.cmrc2018_evaluate import get_eval +from evaluate.cmrc2018_output import write_predictions +import random +from tqdm import tqdm +import collections +from tokenizations.offical_tokenization import BertTokenizer +from preprocess.cmrc2018_preprocess import json2features + + +def data_generator(data, n_batch, shuffle=False, drop_last=False): + steps_per_epoch = len(data) // n_batch + if len(data) % n_batch != 0 and not drop_last: + steps_per_epoch += 1 + data_set = dict() + for k in data[0]: + data_set[k] = np.array([data_[k] for data_ in data]) + index_all = np.arange(len(data)) + + while True: + if shuffle: + random.shuffle(index_all) + for i in range(steps_per_epoch): + yield {k: data_set[k][index_all[i * n_batch:(i + 1) * n_batch]] for k in data_set} + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + tf.logging.set_verbosity(tf.logging.INFO) + + parser.add_argument('--gpu_ids', type=str, default='0') + + # training parameter + parser.add_argument('--train_epochs', type=int, default=2) + parser.add_argument('--n_batch', type=int, default=32) + parser.add_argument('--lr', type=float, default=3e-5) + parser.add_argument('--dropout', type=float, default=0.1) + parser.add_argument('--clip_norm', type=float, default=1.0) + parser.add_argument('--loss_scale', type=float, default=2.0 ** 15) + parser.add_argument('--warmup_iters', type=float, default=0.1) # fraction of total steps, converted to a step count below + parser.add_argument('--loss_count', type=int, default=1000) + parser.add_argument('--seed', type=list, default=[123, 456, 789, 556, 977]) + parser.add_argument('--float16', type=int, default=True)
# only sm >= 7.0 (tensorcores) + parser.add_argument('--max_ans_length', type=int, default=50) + parser.add_argument('--log_interval', type=int, default=30) # show the average loss every 30 steps + parser.add_argument('--n_best', type=int, default=20) + parser.add_argument('--eval_epochs', type=float, default=0.5) + parser.add_argument('--save_best', type=bool, default=True) + parser.add_argument('--vocab_size', type=int, default=21128) + parser.add_argument('--max_seq_length', type=int, default=512) + + # data dir + parser.add_argument('--vocab_file', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/vocab.txt') + + parser.add_argument('--train_dir', type=str, default='dataset/cmrc2018/train_features_roberta512.json') + parser.add_argument('--dev_dir1', type=str, default='dataset/cmrc2018/dev_examples_roberta512.json') + parser.add_argument('--dev_dir2', type=str, default='dataset/cmrc2018/dev_features_roberta512.json') + parser.add_argument('--train_file', type=str, default='origin_data/cmrc2018/cmrc2018_train.json') + parser.add_argument('--dev_file', type=str, default='origin_data/cmrc2018/cmrc2018_dev.json') + parser.add_argument('--bert_config_file', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/bert_config.json') + parser.add_argument('--init_restore_dir', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/bert_model.ckpt') + parser.add_argument('--checkpoint_dir', type=str, + default='check_points/cmrc2018/roberta_wwm_ext_base/') + parser.add_argument('--setting_file', type=str, default='setting.txt') + parser.add_argument('--log_file', type=str, default='log.txt') + + # use some global vars for convenience + args = parser.parse_args() + args.checkpoint_dir += ('/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/' + .format(args.train_epochs, args.n_batch, args.lr, args.warmup_iters, args.max_ans_length)) + args = utils.check_args(args) + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids + + print('######## generating data ########') + + tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True) + assert args.vocab_size == len(tokenizer.vocab) + if not os.path.exists(args.train_dir): + json2features(args.train_file, [args.train_dir.replace('_features_', '_examples_'), + args.train_dir], tokenizer, is_training=True) + + if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2): + json2features(args.dev_file, [args.dev_dir1, args.dev_dir2], tokenizer, is_training=False) + + train_data = json.load(open(args.train_dir, 'r')) + dev_examples = json.load(open(args.dev_dir1, 'r')) + dev_data = json.load(open(args.dev_dir2, 'r')) + if os.path.exists(args.log_file): + os.remove(args.log_file) + + steps_per_epoch = len(train_data) // args.n_batch + eval_steps = int(steps_per_epoch * args.eval_epochs) + dev_steps_per_epoch = len(dev_data) // args.n_batch + if len(train_data) % args.n_batch != 0: + steps_per_epoch += 1 + if len(dev_data) % args.n_batch != 0: + dev_steps_per_epoch += 1 + total_steps = steps_per_epoch * args.train_epochs + args.warmup_iters = int(args.warmup_iters * total_steps) + + print('steps per epoch:', steps_per_epoch) + print('total steps:', total_steps) + print('warmup steps:', args.warmup_iters) + + F1s = [] + EMs = [] + best_f1_em = 0 + with tf.device("/gpu:0"): + input_ids = tf.placeholder(tf.int32, shape=[None, args.max_seq_length], name='input_ids') + input_masks = tf.placeholder(tf.float32, shape=[None, args.max_seq_length], name='input_masks') + segment_ids =
tf.placeholder(tf.int32, shape=[None, args.max_seq_length], name='segment_ids') + start_positions = tf.placeholder(tf.int32, shape=[None, ], name='start_positions') + end_positions = tf.placeholder(tf.int32, shape=[None, ], name='end_positions') + + # build the models for training and testing/validation + print('######## init model ########') + bert_config = BertConfig.from_json_file(args.bert_config_file) + train_model = BertModelMRC(config=bert_config, + is_training=True, + input_ids=input_ids, + input_mask=input_masks, + token_type_ids=segment_ids, + start_positions=start_positions, + end_positions=end_positions, + use_float16=args.float16) + + eval_model = BertModelMRC(config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_masks, + token_type_ids=segment_ids, + use_float16=args.float16) + + optimization = Optimizer(loss=train_model.train_loss, + init_lr=args.lr, + num_train_steps=total_steps, + num_warmup_steps=args.warmup_iters, + hvd=None, + use_fp16=args.float16, + loss_count=args.loss_count, + clip_norm=args.clip_norm, + init_loss_scale=args.loss_scale) + + for seed_ in args.seed: + best_f1, best_em = 0, 0 + with open(args.log_file, 'a') as aw: + aw.write('===================================' + + 'SEED:' + str(seed_) + + '===================================' + '\n') + print('SEED:', seed_) + # random seed + np.random.seed(seed_) + random.seed(seed_) + tf.set_random_seed(seed_) + + train_gen = data_generator(train_data, args.n_batch, shuffle=True, drop_last=False) + dev_gen = data_generator(dev_data, args.n_batch, shuffle=False, drop_last=False) + + config = tf.ConfigProto() + config.allow_soft_placement = True + config.gpu_options.allow_growth = True + + utils.show_all_variables() + utils.init_from_checkpoint(args.init_restore_dir) + RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=1) + global_steps = 0 + with tf.Session(config=config) as sess: + sess.run(tf.global_variables_initializer()) + for i in range(args.train_epochs): + print('Starting epoch %d' % (i + 1)) + total_loss = 0 + iteration = 1 + with tqdm(total=steps_per_epoch, desc='Epoch %d' % (i + 1)) as pbar: + for _ in range(steps_per_epoch): + batch_data = next(train_gen) + feed_data = {input_ids: batch_data['input_ids'], + input_masks: batch_data['input_mask'], + segment_ids: batch_data['segment_ids'], + start_positions: batch_data['start_position'], + end_positions: batch_data['end_position']} + loss, _ = sess.run([train_model.train_loss, optimization.train_op], feed_dict=feed_data) + total_loss += loss + pbar.set_postfix({'loss': '{0:1.5f}'.format(total_loss / (iteration + 1e-5))}) + pbar.update(1) + iteration += 1 + global_steps += 1 + + if global_steps % eval_steps == 0: + print('Evaluating...') + all_results = [] + for i_step in tqdm(range(dev_steps_per_epoch)): + batch_data = next(dev_gen) + feed_data = {input_ids: batch_data['input_ids'], + input_masks: batch_data['input_mask'], + segment_ids: batch_data['segment_ids']} + batch_start_logits, batch_end_logits = sess.run( + [eval_model.start_logits, eval_model.end_logits], + feed_dict=feed_data) + for j in range(len(batch_data['unique_id'])): + start_logits = batch_start_logits[j] + end_logits = batch_end_logits[j] + unique_id = batch_data['unique_id'][j] + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + output_prediction_file = os.path.join(args.checkpoint_dir, 
+ 'prediction_epoch' + str(i) + '.json') + output_nbest_file = os.path.join(args.checkpoint_dir, 'nbest_epoch' + str(i) + '.json') + + write_predictions(dev_examples, dev_data, all_results, + n_best_size=args.n_best, max_answer_length=args.max_ans_length, + do_lower_case=True, output_prediction_file=output_prediction_file, + output_nbest_file=output_nbest_file) + tmp_result = get_eval(args.dev_file, output_prediction_file) + tmp_result['STEP'] = global_steps + with open(args.log_file, 'a') as aw: + aw.write(json.dumps(tmp_result) + '\n') + print(tmp_result) + + if float(tmp_result['F1']) > best_f1: + best_f1 = float(tmp_result['F1']) + if float(tmp_result['EM']) > best_em: + best_em = float(tmp_result['EM']) + + if float(tmp_result['F1']) + float(tmp_result['EM']) > best_f1_em: + best_f1_em = float(tmp_result['F1']) + float(tmp_result['EM']) + scores = {'F1': float(tmp_result['F1']), 'EM': float(tmp_result['EM'])} + save_prex = "checkpoint_score" + for k in scores: + save_prex += ('_' + k + '-' + str(scores[k])[:6]) + save_prex += '.ckpt' + saver.save(sess, save_path=os.path.join(args.checkpoint_dir, save_prex)) + + F1s.append(best_f1) + EMs.append(best_em) + + print('Mean F1:', np.mean(F1s), 'Mean EM:', np.mean(EMs)) + print('Best F1:', np.max(F1s), 'Best EM:', np.max(EMs)) + with open(args.log_file, 'a') as aw: + aw.write('Mean(Best) F1:{}({})\n'.format(np.mean(F1s), np.max(F1s))) + aw.write('Mean(Best) EM:{}({})\n'.format(np.mean(EMs), np.max(EMs))) diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py index 393eb8a..c393644 100755 --- a/convert_tf_checkpoint_to_pytorch.py +++ b/convert_tf_checkpoint_to_pytorch.py @@ -100,16 +100,16 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--tf_checkpoint_path", - default='check_points/pretrain_models/albert_large_zh/albert_model.ckpt', + default='check_points/pretrain_models/albert_xlarge_zh/albert_model.ckpt', type=str, help="Path the TensorFlow checkpoint path.") parser.add_argument("--bert_config_file", - default='check_points/pretrain_models/albert_large_zh/albert_config_large.json', + default='check_points/pretrain_models/albert_xlarge_zh/bert_config.json', type=str, help="The config json file corresponding to the pre-trained BERT model. 
\n" "This specifies the model architecture.") parser.add_argument("--pytorch_dump_path", - default='check_points/pretrain_models/albert_large_zh/pytorch_albert_model.pth', + default='check_points/pretrain_models/albert_xlarge_zh/pytorch_model.pth', type=str, help="Path to the output PyTorch model.") parser.add_argument("--is_albert", diff --git a/evaluate/CJRC_output.py b/evaluate/CJRC_output.py index 302a7c5..90f5bef 100644 --- a/evaluate/CJRC_output.py +++ b/evaluate/CJRC_output.py @@ -6,9 +6,6 @@ from tqdm import tqdm -# import ipdb - - def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, version_2_with_negative=False, null_score_diff_threshold=0.): diff --git a/models/pytorch_modeling.py b/models/pytorch_modeling.py index 7db2843..c7f5fc8 100755 --- a/models/pytorch_modeling.py +++ b/models/pytorch_modeling.py @@ -249,7 +249,7 @@ def to_json_string(self): print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): + def __init__(self, hidden_size, eps=1e-5): """Construct a layernorm module in the TF style (epsilon inside the square root). """ super(BertLayerNorm, self).__init__() @@ -277,7 +277,7 @@ def __init__(self, config): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -315,7 +315,7 @@ def __init__(self, config): self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None): @@ -392,7 +392,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.ln_type = 'postln' if 'ln_type' in config.__dict__: @@ -444,7 +444,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.ln_type = 'postln' if 'ln_type' in config.__dict__: @@ -536,7 +536,7 @@ def __init__(self, config): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.transform_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -1111,7 +1111,7 @@ def 
__init__(self, config, dropout_rate): self.bert = ALBertModel(config) self.ln_type = config.ln_type if self.ln_type == 'ln_pre': - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) else: self.LayerNorm = None self.dropout = nn.Dropout(dropout_rate) @@ -1154,8 +1154,8 @@ def __init__(self, config, dropout_rate): self.bert = ALBertModel(config) self.ln_type = config.ln_type if self.ln_type == 'ln_pre': - self.LayerNorm_qa = BertLayerNorm(config.hidden_size, eps=1e-12) - self.LayerNorm_cls = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm_qa = BertLayerNorm(config.hidden_size, eps=1e-5) + self.LayerNorm_cls = BertLayerNorm(config.hidden_size, eps=1e-5) else: self.LayerNorm_qa = None self.LayerNorm_cls = None diff --git a/models/tf_modeling.py b/models/tf_modeling.py new file mode 100644 index 0000000..d1e59ee --- /dev/null +++ b/models/tf_modeling.py @@ -0,0 +1,661 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import json +import numpy as np +import six +import tensorflow as tf + + +def float32_variable_storage_getter(getter, name, shape=None, dtype=None, + initializer=None, regularizer=None, + trainable=True, *args, **kwargs): + """Custom variable getter that forces trainable variables to be stored in + float32 precision and then casts them to the training precision. 
+ """ + storage_dtype = tf.float32 if trainable else dtype + variable = getter(name, shape, dtype=storage_dtype, + initializer=initializer, regularizer=regularizer, + trainable=trainable, + *args, **kwargs) + if trainable and dtype != tf.float32: + variable = tf.cast(variable, dtype) + return variable + + +def get_custom_getter(compute_type): + return float32_variable_storage_getter if compute_type == tf.float16 else None + + +# define the dense layer + +try: + import blocksparse as bs + + + def layer_norm(x, name='LayerNorm', epsilon=1e-5, relu=False): + """ + normalize state vector to be zero mean / unit variance + learned scale/shift + """ + n_state = x.shape[-1].value + with tf.variable_scope(name): + gain = tf.get_variable('gamma', [n_state], initializer=tf.constant_initializer(1.0)) + bias = tf.get_variable('beta', [n_state], initializer=tf.constant_initializer(0.0)) + + return bs.layer_norm(x, gain, bias, axis=-1, epsilon=epsilon, relu=relu) + + + def dense(x, hidden_size, activation=None, name='dense', kernel_initializer=None, bias=True): + if kernel_initializer is None: + kernel_initializer = create_initializer(0.02) + with tf.variable_scope(name): + nx = x.shape[-1].value + ndims = x.shape.ndims + dtype = x.dtype + + # Note: param initializers are not particularly well tuned in this code + w = tf.get_variable("kernel", [nx, hidden_size], initializer=kernel_initializer, + dtype=dtype) + + assert x.op.device != '' + + if bias: + b = tf.get_variable("bias", [hidden_size], initializer=tf.zeros_initializer) + else: + b = 0 + + # merge context and batch dims for more efficient matmul + if ndims > 2: + y_shape = tf.concat([tf.shape(x)[: ndims - 1], [hidden_size]], axis=0) + x = tf.reshape(x, [-1, nx]) + + y = tf.matmul(x, w) + + if activation == 'fast_gelu' or activation == 'gelu': + fast_gelu = True + else: + fast_gelu = False + if activation == 'relu': + relu = True + else: + relu = False + y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False) + + if activation == 'tanh': + y = tf.tanh(y) + elif activation == 'sigmoid': + y = tf.sigmoid(y) + + if ndims > 2: + y = tf.reshape(y, y_shape) + + return y + + + def attention_softmax(qk_scores, scale): + return bs.softmax(qk_scores, scale) + +except: + print('WARNING!!!!Please install blocksparse for faster training and lower gpu memory cost!!!!!!') + + + def layer_norm_ops(x, g, b, axis=1, segments=1, epsilon=1e-6): + if axis < 0: + axis += len(x.shape) + + K = x.shape[axis].value + assert g.shape.num_elements() == K + assert b.shape.num_elements() == K + assert K % segments == 0 + assert axis != 0 or segments == 1, "Segments only implemented on axis=1 for now" + K //= segments + + ys = list() + for s in range(segments): + segK = slice(s * K, s * K + K) + segX = [segK if d == axis else slice(None) for d in range(x.shape.ndims)] + + mean, var = tf.nn.moments(x[segX], [axis], keep_dims=True) + norm = (x[segX] - mean) * tf.rsqrt(var + epsilon) + ys.append(norm * g[segK] + b[segK]) + + y = tf.concat(ys, axis) if segments > 1 else ys[0] + + return y + + + def layer_norm(input_tensor, name='LayerNorm', epsilon=1e-5): + """ + normalize state vector to be zero mean / unit variance + learned scale/shift + """ + n_state = input_tensor.shape[-1].value + with tf.variable_scope(name): + gain = tf.get_variable('gamma', [n_state], initializer=tf.constant_initializer(1.0), + dtype=input_tensor.dtype) + bias = tf.get_variable('beta', [n_state], initializer=tf.constant_initializer(0.0), + dtype=input_tensor.dtype) + x = 
layer_norm_ops(input_tensor, gain, bias, axis=-1, epsilon=epsilon) + return x + + + def dense(x, hidden_size, activation=None, name='dense', kernel_initializer=None, bias=True): + def gelu(x): + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + def fast_gelu(x): + return x * tf.nn.sigmoid(1.702 * x) + + if kernel_initializer is None: + kernel_initializer = create_initializer(0.02) + with tf.variable_scope(name): + nx = x.shape[-1].value + ndims = x.shape.ndims + dtype = x.dtype + + # Note: param initializers are not particularly well tuned in this code + w = tf.get_variable("kernel", [nx, hidden_size], initializer=kernel_initializer, + dtype=dtype) + if bias: + b = tf.get_variable("bias", [hidden_size], initializer=tf.zeros_initializer, dtype=dtype) + else: + b = 0 + + # merge context and batch dims for more efficient matmul + if ndims > 2: + y_shape = tf.concat([tf.shape(x)[: ndims - 1], [hidden_size]], axis=0) + x = tf.reshape(x, [-1, nx]) + + y = tf.matmul(x, w) + + if bias: + y += b + + if activation == 'tanh': + y = tf.tanh(y) + elif activation == 'sigmoid': + y = tf.sigmoid(y) + elif activation == 'relu': + y = tf.nn.relu(y) + elif activation == 'gelu': + y = gelu(y) + elif activation == 'fast_gelu': + y = fast_gelu(y) + + if ndims > 2: + y = tf.reshape(y, y_shape) + + return y + + + def attention_softmax(qk_scores, scale): + return tf.nn.softmax(qk_scores * scale, axis=-1) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. + """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, rate=dropout_prob) + return output + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name='LayerNorm'): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def get_shape_list(tensor, expected_rank=None, name=None): + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
+ """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def split_states(x, n): + """ + reshape (batch, pixel, state) -> (batch, pixel, head, head_state) + """ + x_shape = get_shape_list(x) + m = x_shape[-1] + new_x_shape = x_shape[:-1] + [n, m // n] + return tf.reshape(x, new_x_shape) + + +def split_heads(x, n): + return tf.transpose(split_states(x, n), [0, 2, 1, 3]) + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def merge_states(x): + """ + reshape (batch, pixel, head, head_state) -> (batch, pixel, state) + """ + x_shape = get_shape_list(x) + new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])] + return tf.reshape(x, new_x_shape) + + +def merge_heads(x): + return merge_states(tf.transpose(x, [0, 2, 1, 3])) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_float16=False): + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + + output = tf.nn.embedding_lookup(embedding_table, input_ids) + return output, embedding_table + + +def embedding_postprocessor(input_tensor, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1, + use_float16=False): + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids) + output += token_type_embeddings + + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + pos_ids = tf.expand_dims(tf.range(seq_length), 0) + pos_ids = tf.tile(pos_ids, (batch_size, 1)) + position_embeddings = tf.nn.embedding_lookup(full_position_embeddings, pos_ids) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def attention_layer(x, attention_mask=None, + num_attention_heads=1, + size_per_head=512, + attention_probs_dropout_prob=0.0, + initializer_range=0.02): + q = dense(x, num_attention_heads * size_per_head, 
name='query', + kernel_initializer=create_initializer(initializer_range)) + k = dense(x, num_attention_heads * size_per_head, name='key', + kernel_initializer=create_initializer(initializer_range)) + v = dense(x, num_attention_heads * size_per_head, name='value', + kernel_initializer=create_initializer(initializer_range)) + q = split_heads(q, num_attention_heads) + k = split_heads(k, num_attention_heads) + v = split_heads(v, num_attention_heads) + + qk = tf.matmul(q, k, transpose_b=True) # [bs, head, len, len] + qk += (-10000. * (1 - attention_mask)) + qk = attention_softmax(qk, scale=1.0 / np.sqrt(size_per_head)) + # dropout originally went here, which cost a lot of GPU memory + qkv = tf.matmul(qk, v) # [bs, head, len, dim] + att = merge_heads(qkv) # [bs, len, dim*head] + # dropout is moved to here instead + att = dropout(att, attention_probs_dropout_prob) + + return att + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + with tf.variable_scope("attention"): + with tf.variable_scope("self"): + attention_head = attention_layer(x=input_tensor, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = dense( + attention_head, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + input_tensor) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual.
+ with tf.variable_scope("output"): + layer_output = dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + input_tensor = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + return all_layer_outputs + else: + return layer_output + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_float16=False, + scope="bert"): + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert", reuse=tf.AUTO_REUSE, + custom_getter=get_custom_getter(tf.float16 if use_float16 else tf.float32)): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + self.embedding_output, self.embedding_table = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_float16=use_float16) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. 
+ self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob, + use_float16=use_float16) + + with tf.variable_scope("encoder"): + attention_mask = tf.reshape(input_mask, (-1, 1, 1, input_mask.shape[1])) # [bs, len]->[bs, 1, 1, len] + attention_mask = tf.cast(attention_mask, self.embedding_output.dtype) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=config.hidden_act, + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = dense( + first_token_tensor, + config.hidden_size, + activation='tanh', + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +class BertModelMRC(object): + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + start_positions=None, + end_positions=None, + use_float16=False, + scope="bert"): + with tf.device("/gpu:0"): + self.bert = BertModel(config, is_training, input_ids, input_mask, token_type_ids, use_float16, scope) + + # finetune mrc + with tf.variable_scope('finetune_mrc', reuse=tf.AUTO_REUSE, + custom_getter=get_custom_getter(tf.float16 if use_float16 else tf.float32)): + self.sequence_output = self.bert.get_sequence_output() + # [bs, len] + self.start_logits = tf.squeeze(dense(self.sequence_output, 1, name='start_dense'), -1) + self.end_logits = tf.squeeze(dense(self.sequence_output, 1, name='end_dense'), -1) + self.start_logits += tf.cast(-10000. * (1 - input_mask), self.start_logits.dtype) + self.end_logits += tf.cast(-10000. 
* (1 - input_mask), self.end_logits.dtype) + + if is_training and start_positions is not None and end_positions is not None: + start_loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.cast(self.start_logits, tf.float32), + labels=start_positions) + end_loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.cast(self.end_logits, tf.float32), + labels=end_positions) + start_loss = tf.reduce_mean(start_loss_) + end_loss = tf.reduce_mean(end_loss_) + self.train_loss = (start_loss + end_loss) / 2.0 diff --git a/optimizations/tf_optimization.py b/optimizations/tf_optimization.py new file mode 100644 index 0000000..fe9f6ad --- /dev/null +++ b/optimizations/tf_optimization.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +class Optimizer(object): + def __init__(self, loss, init_lr, num_train_steps, num_warmup_steps, + hvd=None, use_fp16=False, loss_count=1000, clip_norm=1.0, + init_loss_scale=2 ** 16, beta1=0.9, beta2=0.999): + """Creates an optimizer training op.""" + self.global_step = tf.train.get_or_create_global_step() + + # avoid step change in learning rate at end of warmup phase + decayed_learning_rate_at_crossover_point = init_lr * (1.0 - float(num_warmup_steps) / float(num_train_steps)) + adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point) + learning_rate = tf.constant(value=adjusted_init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + self.global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(self.global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + self.learning_rate = learning_rate + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) 
+ optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=beta1, + beta_2=beta2, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if hvd is not None: + from horovod.tensorflow.compression import Compression + optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) + if use_fp16: + loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager( + init_loss_scale=init_loss_scale, + incr_every_n_steps=loss_count, + decr_every_n_nan_or_inf=2, + decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + self.loss_scale = loss_scale_manager.get_loss_scale() + + tvars = tf.trainable_variables() + grads_and_vars = optimizer.compute_gradients(loss, tvars) + grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] + grads, tvars = list(zip(*grads_and_vars)) + all_are_finite = tf.reduce_all( + [tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 else tf.constant(True, dtype=tf.bool) + + # This is how the model was pre-trained. + # ensure global norm is a finite number + # to prevent clip_by_global_norm from having a hizzy fit. + (clipped_grads, _) = tf.clip_by_global_norm( + grads, clip_norm=clip_norm, + use_norm=tf.cond( + all_are_finite, + lambda: tf.global_norm(grads), + lambda: tf.constant(clip_norm))) + + train_op = optimizer.apply_gradients( + list(zip(clipped_grads, tvars)), global_step=self.global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = tf.cond(all_are_finite, lambda: self.global_step + 1, lambda: self.global_step) + new_global_step = tf.identity(new_global_step, name='step_update') + self.train_op = tf.group(train_op, [self.global_step.assign(new_global_step)]) + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = tf.identity(learning_rate, name='learning_rate') + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. 
+ next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/utils.py b/utils.py index 2fa746f..c6f7e81 100644 --- a/utils.py +++ b/utils.py @@ -7,19 +7,20 @@ from glob import glob -def check_args(args): +def check_args(args, rank=0): args.setting_file = os.path.join(args.checkpoint_dir, args.setting_file) args.log_file = os.path.join(args.checkpoint_dir, args.log_file) - os.makedirs(args.checkpoint_dir, exist_ok=True) - with open(args.setting_file, 'wt') as opt_file: - opt_file.write('------------ Options -------------\n') - print('------------ Options -------------') - for k in args.__dict__: - v = args.__dict__[k] - opt_file.write('%s: %s\n' % (str(k), str(v))) - print('%s: %s' % (str(k), str(v))) - opt_file.write('-------------- End ----------------\n') - print('------------ End -------------') + if rank == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + with open(args.setting_file, 'wt') as opt_file: + opt_file.write('------------ Options -------------\n') + print('------------ Options -------------') + for k in args.__dict__: + v = args.__dict__[k] + opt_file.write('%s: %s\n' % (str(k), str(v))) + print('%s: %s' % (str(k), str(v))) + opt_file.write('-------------- End ----------------\n') + print('------------ End -------------') return args
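The `data_generator` in cmrc2018_finetune_tf.py yields batches forever from a dict of arrays, so the training loop must pull exactly `steps_per_epoch` batches per epoch. A small usage sketch, assuming the patch is on the import path; the feature keys here are made up for illustration:

```python
import numpy as np
from cmrc2018_finetune_tf import data_generator  # from this patch

# fake features with hypothetical keys, just to show the batching contract
fake_features = [{'input_ids': np.zeros(8, dtype=np.int64), 'unique_id': i} for i in range(10)]
gen = data_generator(fake_features, n_batch=4, shuffle=True)

steps_per_epoch = (len(fake_features) + 4 - 1) // 4  # last batch is smaller (drop_last=False)
for _ in range(steps_per_epoch):                      # the generator never raises StopIteration
    batch = next(gen)
    print(batch['input_ids'].shape, batch['unique_id'])
```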
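On the FP16 handling in models/tf_modeling.py: `float32_variable_storage_getter` stores trainable variables as float32 master copies and returns a float16 cast to the graph, so the optimizer state stays in full precision while the forward/backward pass runs in half precision. A minimal TF 1.x sketch of using `get_custom_getter` outside the BERT model (illustration only, not part of the patch):

```python
import tensorflow as tf
from models.tf_modeling import get_custom_getter  # from this patch

with tf.variable_scope("fp16_demo", custom_getter=get_custom_getter(tf.float16)):
    x = tf.placeholder(tf.float16, [None, 8], name="x")
    w = tf.get_variable("w", [8, 4], dtype=tf.float16)  # stored as fp32, used as fp16
    y = tf.matmul(x, w)

# the trainable variables (the master copies) remain float32
print([(v.name, v.dtype.base_dtype) for v in tf.trainable_variables()])
```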
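The additive attention mask in `attention_layer` (and the `-10000. * (1 - input_mask)` terms added to the start/end logits in `BertModelMRC`) pushes padded positions toward a very negative score so that the softmax assigns them essentially zero weight. A numpy sketch of the arithmetic, with made-up shapes:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

size_per_head = 64
input_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)   # last token is padding
attention_mask = input_mask.reshape(1, 1, 1, -1)          # [bs, 1, 1, len], broadcasts over heads
qk = np.random.randn(1, 2, 4, 4).astype(np.float32)       # [bs, head, len, len] raw scores
qk += -10000.0 * (1.0 - attention_mask)                   # padded keys get a huge negative score
probs = softmax(qk / np.sqrt(size_per_head), axis=-1)
print(probs[0, 0, 0])                                     # weight on the padded position is ~0
```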
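The learning-rate schedule in optimizations/tf_optimization.py pre-scales the initial rate (`adjusted_init_lr`) so that the linear warmup and the polynomial (power=1) decay meet exactly at the end of warmup, avoiding a step change. A plain-Python sketch of the same arithmetic (my own illustration, not the TF graph code):

```python
def lr_at(step, init_lr, num_train_steps, num_warmup_steps):
    # value the decay curve should have at the point where warmup ends
    crossover = init_lr * (1.0 - float(num_warmup_steps) / num_train_steps)
    adjusted_init_lr = init_lr * (init_lr / crossover)
    decayed = adjusted_init_lr * (1.0 - min(step, num_train_steps) / float(num_train_steps))
    if step < num_warmup_steps:
        return init_lr * step / float(num_warmup_steps)  # linear warmup
    return decayed                                        # linear decay to 0

# warmup and decay meet at step == num_warmup_steps, with no jump:
print(lr_at(99, 3e-5, 1000, 100), lr_at(100, 3e-5, 1000, 100), lr_at(1000, 3e-5, 1000, 100))
```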
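The dynamic loss scaling configured in `Optimizer` (incr_every_n_steps=args.loss_count, decr_every_n_nan_or_inf=2, decr_ratio=0.5) grows the scale after a long run of finite gradients, halves it after repeated overflows, and skips the offending updates. The class below is only a rough behavioural sketch of that policy, not the tf.contrib.mixed_precision implementation:

```python
class LossScalePolicy:
    """Rough sketch of the dynamic loss-scale rule used in Optimizer (illustration only)."""

    def __init__(self, init_scale=2.0 ** 15, incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.5):
        self.scale = init_scale
        self.incr_every_n_steps = incr_every_n_steps
        self.decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
        self.incr_ratio = incr_ratio
        self.decr_ratio = decr_ratio
        self.good_steps = 0
        self.bad_steps = 0

    def update(self, grads_all_finite):
        """Return True if this step's gradients should be applied."""
        if grads_all_finite:
            self.good_steps += 1
            self.bad_steps = 0
            if self.good_steps >= self.incr_every_n_steps:
                self.scale *= self.incr_ratio   # scale up after a long run of finite grads
                self.good_steps = 0
            return True
        self.good_steps = 0
        self.bad_steps += 1
        if self.bad_steps >= self.decr_every_n_nan_or_inf:
            self.scale *= self.decr_ratio       # back off after repeated overflow
            self.bad_steps = 0
        return False                            # skip the update, as the graph op does
```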