From 7f1646e6698901a74f263f466b09a0a57bf1bfa0 Mon Sep 17 00:00:00 2001 From: ewrfcas Date: Tue, 15 Oct 2019 23:35:08 +0800 Subject: [PATCH] update the finetune codes for tensorflow --- CJRC_finetune.py => CJRC_finetune_pytorch.py | 0 DRCD_finetune.py => DRCD_finetune_pytorch.py | 16 +- README.md | 23 +- ...inetune.py => cmrc2018_finetune_pytorch.py | 0 cmrc2018_finetune_tf.py | 256 +++++++ convert_tf_checkpoint_to_pytorch.py | 6 +- evaluate/CJRC_output.py | 3 - models/pytorch_modeling.py | 18 +- models/tf_modeling.py | 661 ++++++++++++++++++ optimizations/tf_optimization.py | 201 ++++++ utils.py | 23 +- 11 files changed, 1171 insertions(+), 36 deletions(-) rename CJRC_finetune.py => CJRC_finetune_pytorch.py (100%) rename DRCD_finetune.py => DRCD_finetune_pytorch.py (97%) rename cmrc2018_finetune.py => cmrc2018_finetune_pytorch.py (100%) create mode 100644 cmrc2018_finetune_tf.py create mode 100644 models/tf_modeling.py create mode 100644 optimizations/tf_optimization.py diff --git a/CJRC_finetune.py b/CJRC_finetune_pytorch.py similarity index 100% rename from CJRC_finetune.py rename to CJRC_finetune_pytorch.py diff --git a/DRCD_finetune.py b/DRCD_finetune_pytorch.py similarity index 97% rename from DRCD_finetune.py rename to DRCD_finetune_pytorch.py index b41c791..06c2c91 100644 --- a/DRCD_finetune.py +++ b/DRCD_finetune_pytorch.py @@ -81,15 +81,15 @@ def evaluate(model, args, eval_examples, eval_features, device, global_steps, be if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--gpu_ids', type=str, default='0,1,2,3') + parser.add_argument('--gpu_ids', type=str, default='0,1,2,3,4,5,6,7') # training parameter - parser.add_argument('--train_epochs', type=int, default=2) + parser.add_argument('--train_epochs', type=int, default=3) parser.add_argument('--n_batch', type=int, default=32) - parser.add_argument('--lr', type=float, default=3e-5) + parser.add_argument('--lr', type=float, default=2.5e-5) parser.add_argument('--dropout', type=float, default=0.1) parser.add_argument('--clip_norm', type=float, default=1.0) - parser.add_argument('--warmup_rate', type=float, default=0.05) + parser.add_argument('--warmup_rate', type=float, default=0.06) parser.add_argument("--schedule", default='warmup_linear', type=str, help='schedule') parser.add_argument("--weight_decay_rate", default=0.01, type=float, help='weight_decay_rate') parser.add_argument('--seed', type=list, default=[123, 456, 789, 556, 977]) @@ -112,13 +112,13 @@ def evaluate(model, args, eval_examples, eval_features, device, global_steps, be parser.add_argument('--dev_file', type=str, default='origin_data/DRCD/DRCD_dev.json') parser.add_argument('--bert_config_file', type=str, - default='check_points/pretrain_models/albert_large_zh/albert_config_large.json') + default='check_points/pretrain_models/albert_xlarge_zh/bert_config.json') parser.add_argument('--vocab_file', type=str, - default='check_points/pretrain_models/albert_large_zh/vocab.txt') + default='check_points/pretrain_models/albert_xlarge_zh/vocab.txt') parser.add_argument('--init_restore_dir', type=str, - default='check_points/pretrain_models/albert_large_zh/pytorch_albert_model.pth') + default='check_points/pretrain_models/albert_xlarge_zh/pytorch_model.pth') parser.add_argument('--checkpoint_dir', type=str, - default='check_points/DRCD/albert_large_zh/') + default='check_points/DRCD/albert_xlarge_zh/') parser.add_argument('--setting_file', type=str, default='setting.txt') parser.add_argument('--log_file', type=str, default='log.txt') diff 
--git a/README.md b/README.md index edde3d7..b5fd42e 100755 --- a/README.md +++ b/README.md @@ -1,6 +1,12 @@ ## BERT downstream task finetune list -The finetuning models adapted from the official code are PyTorch-only, because the TensorFlow FP16 and multi-GPU support would still have to be rewritten; anyone interested is welcome to contribute it. +The finetuning models adapted from the official code are now provided in both PyTorch and TensorFlow versions + +*** 2019-10-15: added TensorFlow (bert/roberta) finetuning code for cmrc2018 *** + +2019-10-14: added DRCD test results + +*** 2019-10-12: PyTorch now supports ALBERT *** ### Sources of the models and related code @@ -14,7 +20,7 @@ finetune基于官方代码改造的模型都是基于pytorch的,因为tensorfl 5. My own experimental siBert (https://github.com/ewrfcas/SiBert_tensorflow) -### About FP16 +### About FP16 in PyTorch FP16 training significantly reduces GPU memory pressure (and also improves speed if you have GPUs such as the V100). However, the apex FP16 built from the latest source does not handle multi-GPU parallelism well (https://github.com/NVIDIA/apex/issues/227). In practice, finetuning BERT-style tasks puts little numerical stress on FP16, so it is reasonable to trade some precision for efficiency, and I still prefer the older FusedAdam + FP16_Optimizer combination. @@ -23,6 +29,19 @@ FP16的训练可以显著降低显存压力(如果有V100等GPU资源还能提 pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" --global-option="--deprecated_fused_adam" ./ ``` +### About blocksparse for TensorFlow + +blocksparse (https://github.com/openai/blocksparse) can be installed directly with pip under TensorFlow 1.13; otherwise, clone the repo and build it yourself. +Its fast_gelu and the softmax used in self-attention greatly reduce GPU memory pressure. I have also moved some of the dropout layers, which lowers overall memory usage by roughly 30%~40%. + +Memory usage with TensorFlow, FP16, max length 512: + +| model | length | batch | memory | +| ------ | ------ | ------ | ------ | +| roberta_base | 512 | 32 | 16GB | +| roberta_large | 512 | 12 | 16GB | + + ### Tasks 1. CMRC 2018: span-extraction reading comprehension (Simplified Chinese, dev set only) diff --git a/cmrc2018_finetune.py b/cmrc2018_finetune_pytorch.py similarity index 100% rename from cmrc2018_finetune.py rename to cmrc2018_finetune_pytorch.py diff --git a/cmrc2018_finetune_tf.py b/cmrc2018_finetune_tf.py new file mode 100644 index 0000000..1b5f6b2 --- /dev/null +++ b/cmrc2018_finetune_tf.py @@ -0,0 +1,256 @@ +import argparse +import numpy as np +import tensorflow as tf +import os +from models.tf_modeling import BertModelMRC, BertConfig +from optimizations.tf_optimization import Optimizer +import json +import utils +from evaluate.cmrc2018_evaluate import get_eval +from evaluate.cmrc2018_output import write_predictions +import random +from tqdm import tqdm +import collections +from tokenizations.offical_tokenization import BertTokenizer +from preprocess.cmrc2018_preprocess import json2features + + +def data_generator(data, n_batch, shuffle=False, drop_last=False): + steps_per_epoch = len(data) // n_batch + if len(data) % n_batch != 0 and not drop_last: + steps_per_epoch += 1 + data_set = dict() + for k in data[0]: + data_set[k] = np.array([data_[k] for data_ in data]) + index_all = np.arange(len(data)) + + while True: + if shuffle: + random.shuffle(index_all) + for i in range(steps_per_epoch): + yield {k: data_set[k][index_all[i * n_batch:(i + 1) * n_batch]] for k in data_set} + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + tf.logging.set_verbosity(tf.logging.INFO) + + parser.add_argument('--gpu_ids', type=str, default='0') + + # training parameter + parser.add_argument('--train_epochs', type=int, default=2) + parser.add_argument('--n_batch', type=int, default=32) + parser.add_argument('--lr', type=float, default=3e-5) + parser.add_argument('--dropout', type=float, default=0.1) + parser.add_argument('--clip_norm', type=float, default=1.0) + parser.add_argument('--loss_scale', type=float, default=2.0 ** 15) + parser.add_argument('--warmup_iters', type=float, default=0.1) # fraction of total steps, converted to a step count below + parser.add_argument('--loss_count', type=int, default=1000) + parser.add_argument('--seed', type=list, default=[123, 456, 789, 556, 977]) + parser.add_argument('--float16', type=int, default=True)
# only sm >= 7.0 (tensorcores) + parser.add_argument('--max_ans_length', type=int, default=50) + parser.add_argument('--log_interval', type=int, default=30) # show the average loss every 30 steps + parser.add_argument('--n_best', type=int, default=20) + parser.add_argument('--eval_epochs', type=float, default=0.5) + parser.add_argument('--save_best', type=bool, default=True) + parser.add_argument('--vocab_size', type=int, default=21128) + parser.add_argument('--max_seq_length', type=int, default=512) + + # data dir + parser.add_argument('--vocab_file', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/vocab.txt') + + parser.add_argument('--train_dir', type=str, default='dataset/cmrc2018/train_features_roberta512.json') + parser.add_argument('--dev_dir1', type=str, default='dataset/cmrc2018/dev_examples_roberta512.json') + parser.add_argument('--dev_dir2', type=str, default='dataset/cmrc2018/dev_features_roberta512.json') + parser.add_argument('--train_file', type=str, default='origin_data/cmrc2018/cmrc2018_train.json') + parser.add_argument('--dev_file', type=str, default='origin_data/cmrc2018/cmrc2018_dev.json') + parser.add_argument('--bert_config_file', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/bert_config.json') + parser.add_argument('--init_restore_dir', type=str, + default='check_points/pretrain_models/roberta_wwm_ext_base/bert_model.ckpt') + parser.add_argument('--checkpoint_dir', type=str, + default='check_points/cmrc2018/roberta_wwm_ext_base/') + parser.add_argument('--setting_file', type=str, default='setting.txt') + parser.add_argument('--log_file', type=str, default='log.txt') + + # use some global vars for convenience + args = parser.parse_args() + args.checkpoint_dir += ('/epoch{}_batch{}_lr{}_warmup{}_anslen{}_tf/' + .format(args.train_epochs, args.n_batch, args.lr, args.warmup_iters, args.max_ans_length)) + args = utils.check_args(args) + os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids + + print('######## generating data ########') + + tokenizer = BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True) + assert args.vocab_size == len(tokenizer.vocab) + if not os.path.exists(args.train_dir): + json2features(args.train_file, [args.train_dir.replace('_features_', '_examples_'), + args.train_dir], tokenizer, is_training=True) + + if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2): + json2features(args.dev_file, [args.dev_dir1, args.dev_dir2], tokenizer, is_training=False) + + train_data = json.load(open(args.train_dir, 'r')) + dev_examples = json.load(open(args.dev_dir1, 'r')) + dev_data = json.load(open(args.dev_dir2, 'r')) + if os.path.exists(args.log_file): + os.remove(args.log_file) + + steps_per_epoch = len(train_data) // args.n_batch + eval_steps = int(steps_per_epoch * args.eval_epochs) + dev_steps_per_epoch = len(dev_data) // args.n_batch + if len(train_data) % args.n_batch != 0: + steps_per_epoch += 1 + if len(dev_data) % args.n_batch != 0: + dev_steps_per_epoch += 1 + total_steps = steps_per_epoch * args.train_epochs + args.warmup_iters = int(args.warmup_iters * total_steps) + + print('steps per epoch:', steps_per_epoch) + print('total steps:', total_steps) + print('warmup steps:', args.warmup_iters) + + F1s = [] + EMs = [] + best_f1_em = 0 + with tf.device("/gpu:0"): + input_ids = tf.placeholder(tf.int32, shape=[None, args.max_seq_length], name='input_ids') + input_masks = tf.placeholder(tf.float32, shape=[None, args.max_seq_length], name='input_masks') + segment_ids =
tf.placeholder(tf.int32, shape=[None, args.max_seq_length], name='segment_ids') + start_positions = tf.placeholder(tf.int32, shape=[None, ], name='start_positions') + end_positions = tf.placeholder(tf.int32, shape=[None, ], name='end_positions') + + # build the models for training and testing/validation + print('######## init model ########') + bert_config = BertConfig.from_json_file(args.bert_config_file) + train_model = BertModelMRC(config=bert_config, + is_training=True, + input_ids=input_ids, + input_mask=input_masks, + token_type_ids=segment_ids, + start_positions=start_positions, + end_positions=end_positions, + use_float16=args.float16) + + eval_model = BertModelMRC(config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_masks, + token_type_ids=segment_ids, + use_float16=args.float16) + + optimization = Optimizer(loss=train_model.train_loss, + init_lr=args.lr, + num_train_steps=total_steps, + num_warmup_steps=args.warmup_iters, + hvd=None, + use_fp16=args.float16, + loss_count=args.loss_count, + clip_norm=args.clip_norm, + init_loss_scale=args.loss_scale) + + for seed_ in args.seed: + best_f1, best_em = 0, 0 + with open(args.log_file, 'a') as aw: + aw.write('===================================' + + 'SEED:' + str(seed_) + + '===================================' + '\n') + print('SEED:', seed_) + # random seed + np.random.seed(seed_) + random.seed(seed_) + tf.set_random_seed(seed_) + + train_gen = data_generator(train_data, args.n_batch, shuffle=True, drop_last=False) + dev_gen = data_generator(dev_data, args.n_batch, shuffle=False, drop_last=False) + + config = tf.ConfigProto() + config.allow_soft_placement = True + config.gpu_options.allow_growth = True + + utils.show_all_variables() + utils.init_from_checkpoint(args.init_restore_dir) + RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + saver = tf.train.Saver(var_list=tf.trainable_variables(), max_to_keep=1) + global_steps = 0 + with tf.Session(config=config) as sess: + sess.run(tf.global_variables_initializer()) + for i in range(args.train_epochs): + print('Starting epoch %d' % (i + 1)) + total_loss = 0 + iteration = 1 + with tqdm(total=steps_per_epoch, desc='Epoch %d' % (i + 1)) as pbar: + for _ in range(steps_per_epoch): + batch_data = next(train_gen) + feed_data = {input_ids: batch_data['input_ids'], + input_masks: batch_data['input_mask'], + segment_ids: batch_data['segment_ids'], + start_positions: batch_data['start_position'], + end_positions: batch_data['end_position']} + loss, _ = sess.run([train_model.train_loss, optimization.train_op], feed_dict=feed_data) + total_loss += loss + pbar.set_postfix({'loss': '{0:1.5f}'.format(total_loss / (iteration + 1e-5))}) + pbar.update(1) + iteration += 1 + global_steps += 1 + + if global_steps % eval_steps == 0: + print('Evaluating...') + all_results = [] + for i_step in tqdm(range(dev_steps_per_epoch)): + batch_data = next(dev_gen) + feed_data = {input_ids: batch_data['input_ids'], + input_masks: batch_data['input_mask'], + segment_ids: batch_data['segment_ids']} + batch_start_logits, batch_end_logits = sess.run( + [eval_model.start_logits, eval_model.end_logits], + feed_dict=feed_data) + for j in range(len(batch_data['unique_id'])): + start_logits = batch_start_logits[j] + end_logits = batch_end_logits[j] + unique_id = batch_data['unique_id'][j] + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + output_prediction_file = os.path.join(args.checkpoint_dir, 
+ 'prediction_epoch' + str(i) + '.json') + output_nbest_file = os.path.join(args.checkpoint_dir, 'nbest_epoch' + str(i) + '.json') + + write_predictions(dev_examples, dev_data, all_results, + n_best_size=args.n_best, max_answer_length=args.max_ans_length, + do_lower_case=True, output_prediction_file=output_prediction_file, + output_nbest_file=output_nbest_file) + tmp_result = get_eval(args.dev_file, output_prediction_file) + tmp_result['STEP'] = global_steps + with open(args.log_file, 'a') as aw: + aw.write(json.dumps(tmp_result) + '\n') + print(tmp_result) + + if float(tmp_result['F1']) > best_f1: + best_f1 = float(tmp_result['F1']) + if float(tmp_result['EM']) > best_em: + best_em = float(tmp_result['EM']) + + if float(tmp_result['F1']) + float(tmp_result['EM']) > best_f1_em: + best_f1_em = float(tmp_result['F1']) + float(tmp_result['EM']) + scores = {'F1': float(tmp_result['F1']), 'EM': float(tmp_result['EM'])} + save_prex = "checkpoint_score" + for k in scores: + save_prex += ('_' + k + '-' + str(scores[k])[:6]) + save_prex += '.ckpt' + saver.save(sess, save_path=os.path.join(args.checkpoint_dir, save_prex)) + + F1s.append(best_f1) + EMs.append(best_em) + + print('Mean F1:', np.mean(F1s), 'Mean EM:', np.mean(EMs)) + print('Best F1:', np.max(F1s), 'Best EM:', np.max(EMs)) + with open(args.log_file, 'a') as aw: + aw.write('Mean(Best) F1:{}({})\n'.format(np.mean(F1s), np.max(F1s))) + aw.write('Mean(Best) EM:{}({})\n'.format(np.mean(EMs), np.max(EMs))) diff --git a/convert_tf_checkpoint_to_pytorch.py b/convert_tf_checkpoint_to_pytorch.py index 393eb8a..c393644 100755 --- a/convert_tf_checkpoint_to_pytorch.py +++ b/convert_tf_checkpoint_to_pytorch.py @@ -100,16 +100,16 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--tf_checkpoint_path", - default='check_points/pretrain_models/albert_large_zh/albert_model.ckpt', + default='check_points/pretrain_models/albert_xlarge_zh/albert_model.ckpt', type=str, help="Path the TensorFlow checkpoint path.") parser.add_argument("--bert_config_file", - default='check_points/pretrain_models/albert_large_zh/albert_config_large.json', + default='check_points/pretrain_models/albert_xlarge_zh/bert_config.json', type=str, help="The config json file corresponding to the pre-trained BERT model. 
\n" "This specifies the model architecture.") parser.add_argument("--pytorch_dump_path", - default='check_points/pretrain_models/albert_large_zh/pytorch_albert_model.pth', + default='check_points/pretrain_models/albert_xlarge_zh/pytorch_model.pth', type=str, help="Path to the output PyTorch model.") parser.add_argument("--is_albert", diff --git a/evaluate/CJRC_output.py b/evaluate/CJRC_output.py index 302a7c5..90f5bef 100644 --- a/evaluate/CJRC_output.py +++ b/evaluate/CJRC_output.py @@ -6,9 +6,6 @@ from tqdm import tqdm -# import ipdb - - def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, version_2_with_negative=False, null_score_diff_threshold=0.): diff --git a/models/pytorch_modeling.py b/models/pytorch_modeling.py index 7db2843..c7f5fc8 100755 --- a/models/pytorch_modeling.py +++ b/models/pytorch_modeling.py @@ -249,7 +249,7 @@ def to_json_string(self): print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") class BertLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-12): + def __init__(self, hidden_size, eps=1e-5): """Construct a layernorm module in the TF style (epsilon inside the square root). """ super(BertLayerNorm, self).__init__() @@ -277,7 +277,7 @@ def __init__(self, config): # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None): @@ -315,7 +315,7 @@ def __init__(self, config): self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load # any TensorFlow checkpoint file - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, input_ids, token_type_ids=None, position_ids=None): @@ -392,7 +392,7 @@ class BertSelfOutput(nn.Module): def __init__(self, config): super(BertSelfOutput, self).__init__() self.dense = nn.Linear(config.hidden_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.ln_type = 'postln' if 'ln_type' in config.__dict__: @@ -444,7 +444,7 @@ class BertOutput(nn.Module): def __init__(self, config): super(BertOutput, self).__init__() self.dense = nn.Linear(config.intermediate_size, config.hidden_size) - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) self.dropout = nn.Dropout(config.hidden_dropout_prob) self.ln_type = 'postln' if 'ln_type' in config.__dict__: @@ -536,7 +536,7 @@ def __init__(self, config): self.dense = nn.Linear(config.hidden_size, config.hidden_size) self.transform_act_fn = ACT2FN[config.hidden_act] \ if isinstance(config.hidden_act, str) else config.hidden_act - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -1111,7 +1111,7 @@ def 
__init__(self, config, dropout_rate): self.bert = ALBertModel(config) self.ln_type = config.ln_type if self.ln_type == 'ln_pre': - self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) else: self.LayerNorm = None self.dropout = nn.Dropout(dropout_rate) @@ -1154,8 +1154,8 @@ def __init__(self, config, dropout_rate): self.bert = ALBertModel(config) self.ln_type = config.ln_type if self.ln_type == 'ln_pre': - self.LayerNorm_qa = BertLayerNorm(config.hidden_size, eps=1e-12) - self.LayerNorm_cls = BertLayerNorm(config.hidden_size, eps=1e-12) + self.LayerNorm_qa = BertLayerNorm(config.hidden_size, eps=1e-5) + self.LayerNorm_cls = BertLayerNorm(config.hidden_size, eps=1e-5) else: self.LayerNorm_qa = None self.LayerNorm_cls = None diff --git a/models/tf_modeling.py b/models/tf_modeling.py new file mode 100644 index 0000000..d1e59ee --- /dev/null +++ b/models/tf_modeling.py @@ -0,0 +1,661 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy +import json +import numpy as np +import six +import tensorflow as tf + + +def float32_variable_storage_getter(getter, name, shape=None, dtype=None, + initializer=None, regularizer=None, + trainable=True, *args, **kwargs): + """Custom variable getter that forces trainable variables to be stored in + float32 precision and then casts them to the training precision. 
+ """ + storage_dtype = tf.float32 if trainable else dtype + variable = getter(name, shape, dtype=storage_dtype, + initializer=initializer, regularizer=regularizer, + trainable=trainable, + *args, **kwargs) + if trainable and dtype != tf.float32: + variable = tf.cast(variable, dtype) + return variable + + +def get_custom_getter(compute_type): + return float32_variable_storage_getter if compute_type == tf.float16 else None + + +# define the dense layer + +try: + import blocksparse as bs + + + def layer_norm(x, name='LayerNorm', epsilon=1e-5, relu=False): + """ + normalize state vector to be zero mean / unit variance + learned scale/shift + """ + n_state = x.shape[-1].value + with tf.variable_scope(name): + gain = tf.get_variable('gamma', [n_state], initializer=tf.constant_initializer(1.0)) + bias = tf.get_variable('beta', [n_state], initializer=tf.constant_initializer(0.0)) + + return bs.layer_norm(x, gain, bias, axis=-1, epsilon=epsilon, relu=relu) + + + def dense(x, hidden_size, activation=None, name='dense', kernel_initializer=None, bias=True): + if kernel_initializer is None: + kernel_initializer = create_initializer(0.02) + with tf.variable_scope(name): + nx = x.shape[-1].value + ndims = x.shape.ndims + dtype = x.dtype + + # Note: param initializers are not particularly well tuned in this code + w = tf.get_variable("kernel", [nx, hidden_size], initializer=kernel_initializer, + dtype=dtype) + + assert x.op.device != '' + + if bias: + b = tf.get_variable("bias", [hidden_size], initializer=tf.zeros_initializer) + else: + b = 0 + + # merge context and batch dims for more efficient matmul + if ndims > 2: + y_shape = tf.concat([tf.shape(x)[: ndims - 1], [hidden_size]], axis=0) + x = tf.reshape(x, [-1, nx]) + + y = tf.matmul(x, w) + + if activation == 'fast_gelu' or activation == 'gelu': + fast_gelu = True + else: + fast_gelu = False + if activation == 'relu': + relu = True + else: + relu = False + y = bs.bias_relu(y, b, relu=relu, fast_gelu=fast_gelu, atomics=False) + + if activation == 'tanh': + y = tf.tanh(y) + elif activation == 'sigmoid': + y = tf.sigmoid(y) + + if ndims > 2: + y = tf.reshape(y, y_shape) + + return y + + + def attention_softmax(qk_scores, scale): + return bs.softmax(qk_scores, scale) + +except: + print('WARNING!!!!Please install blocksparse for faster training and lower gpu memory cost!!!!!!') + + + def layer_norm_ops(x, g, b, axis=1, segments=1, epsilon=1e-6): + if axis < 0: + axis += len(x.shape) + + K = x.shape[axis].value + assert g.shape.num_elements() == K + assert b.shape.num_elements() == K + assert K % segments == 0 + assert axis != 0 or segments == 1, "Segments only implemented on axis=1 for now" + K //= segments + + ys = list() + for s in range(segments): + segK = slice(s * K, s * K + K) + segX = [segK if d == axis else slice(None) for d in range(x.shape.ndims)] + + mean, var = tf.nn.moments(x[segX], [axis], keep_dims=True) + norm = (x[segX] - mean) * tf.rsqrt(var + epsilon) + ys.append(norm * g[segK] + b[segK]) + + y = tf.concat(ys, axis) if segments > 1 else ys[0] + + return y + + + def layer_norm(input_tensor, name='LayerNorm', epsilon=1e-5): + """ + normalize state vector to be zero mean / unit variance + learned scale/shift + """ + n_state = input_tensor.shape[-1].value + with tf.variable_scope(name): + gain = tf.get_variable('gamma', [n_state], initializer=tf.constant_initializer(1.0), + dtype=input_tensor.dtype) + bias = tf.get_variable('beta', [n_state], initializer=tf.constant_initializer(0.0), + dtype=input_tensor.dtype) + x = 
layer_norm_ops(input_tensor, gain, bias, axis=-1, epsilon=epsilon) + return x + + + def dense(x, hidden_size, activation=None, name='dense', kernel_initializer=None, bias=True): + def gelu(x): + cdf = 0.5 * (1.0 + tf.tanh((np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + def fast_gelu(x): + return x * tf.nn.sigmoid(1.702 * x) + + if kernel_initializer is None: + kernel_initializer = create_initializer(0.02) + with tf.variable_scope(name): + nx = x.shape[-1].value + ndims = x.shape.ndims + dtype = x.dtype + + # Note: param initializers are not particularly well tuned in this code + w = tf.get_variable("kernel", [nx, hidden_size], initializer=kernel_initializer, + dtype=dtype) + if bias: + b = tf.get_variable("bias", [hidden_size], initializer=tf.zeros_initializer, dtype=dtype) + else: + b = 0 + + # merge context and batch dims for more efficient matmul + if ndims > 2: + y_shape = tf.concat([tf.shape(x)[: ndims - 1], [hidden_size]], axis=0) + x = tf.reshape(x, [-1, nx]) + + y = tf.matmul(x, w) + + if bias: + y += b + + if activation == 'tanh': + y = tf.tanh(y) + elif activation == 'sigmoid': + y = tf.sigmoid(y) + elif activation == 'relu': + y = tf.nn.relu(y) + elif activation == 'gelu': + y = gelu(y) + elif activation == 'fast_gelu': + y = fast_gelu(y) + + if ndims > 2: + y = tf.reshape(y, y_shape) + + return y + + + def attention_softmax(qk_scores, scale): + return tf.nn.softmax(qk_scores * scale, axis=-1) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. + """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, rate=dropout_prob) + return output + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name='LayerNorm'): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def get_shape_list(tensor, expected_rank=None, name=None): + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
+ """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + + +def split_states(x, n): + """ + reshape (batch, pixel, state) -> (batch, pixel, head, head_state) + """ + x_shape = get_shape_list(x) + m = x_shape[-1] + new_x_shape = x_shape[:-1] + [n, m // n] + return tf.reshape(x, new_x_shape) + + +def split_heads(x, n): + return tf.transpose(split_states(x, n), [0, 2, 1, 3]) + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def merge_states(x): + """ + reshape (batch, pixel, head, head_state) -> (batch, pixel, state) + """ + x_shape = get_shape_list(x) + new_x_shape = x_shape[:-2] + [np.prod(x_shape[-2:])] + return tf.reshape(x, new_x_shape) + + +def merge_heads(x): + return merge_states(tf.transpose(x, [0, 2, 1, 3])) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_float16=False): + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + + output = tf.nn.embedding_lookup(embedding_table, input_ids) + return output, embedding_table + + +def embedding_postprocessor(input_tensor, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1, + use_float16=False): + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + token_type_embeddings = tf.nn.embedding_lookup(token_type_table, token_type_ids) + output += token_type_embeddings + + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range), + dtype=tf.float16 if use_float16 else tf.float32) + pos_ids = tf.expand_dims(tf.range(seq_length), 0) + pos_ids = tf.tile(pos_ids, (batch_size, 1)) + position_embeddings = tf.nn.embedding_lookup(full_position_embeddings, pos_ids) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def attention_layer(x, attention_mask=None, + num_attention_heads=1, + size_per_head=512, + attention_probs_dropout_prob=0.0, + initializer_range=0.02): + q = dense(x, num_attention_heads * size_per_head, 
name='query', + kernel_initializer=create_initializer(initializer_range)) + k = dense(x, num_attention_heads * size_per_head, name='key', + kernel_initializer=create_initializer(initializer_range)) + v = dense(x, num_attention_heads * size_per_head, name='value', + kernel_initializer=create_initializer(initializer_range)) + q = split_heads(q, num_attention_heads) + k = split_heads(k, num_attention_heads) + v = split_heads(v, num_attention_heads) + + qk = tf.matmul(q, k, transpose_b=True) # [bs, head, len, len] + qk += (-10000. * (1 - attention_mask)) + qk = attention_softmax(qk, scale=1.0 / np.sqrt(size_per_head)) + # dropout originally went here, which cost a lot of GPU memory + qkv = tf.matmul(qk, v) # [bs, head, len, dim] + att = merge_heads(qkv) # [bs, len, dim*head] + # dropout is moved to here instead + att = dropout(att, attention_probs_dropout_prob) + + return att + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn='gelu', + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + with tf.variable_scope("attention"): + with tf.variable_scope("self"): + attention_head = attention_layer(x=input_tensor, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = dense( + attention_head, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + input_tensor) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual.
+ with tf.variable_scope("output"): + layer_output = dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + input_tensor = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + return all_layer_outputs + else: + return layer_output + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_float16=False, + scope="bert"): + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert", reuse=tf.AUTO_REUSE, + custom_getter=get_custom_getter(tf.float16 if use_float16 else tf.float32)): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + self.embedding_output, self.embedding_table = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_float16=use_float16) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. 
+ self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob, + use_float16=use_float16) + + with tf.variable_scope("encoder"): + attention_mask = tf.reshape(input_mask, (-1, 1, 1, input_mask.shape[1])) # [bs, len]->[bs, 1, 1, len] + attention_mask = tf.cast(attention_mask, self.embedding_output.dtype) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=config.hidden_act, + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = dense( + first_token_tensor, + config.hidden_size, + activation='tanh', + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +class BertModelMRC(object): + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + start_positions=None, + end_positions=None, + use_float16=False, + scope="bert"): + with tf.device("/gpu:0"): + self.bert = BertModel(config, is_training, input_ids, input_mask, token_type_ids, use_float16, scope) + + # finetune mrc + with tf.variable_scope('finetune_mrc', reuse=tf.AUTO_REUSE, + custom_getter=get_custom_getter(tf.float16 if use_float16 else tf.float32)): + self.sequence_output = self.bert.get_sequence_output() + # [bs, len] + self.start_logits = tf.squeeze(dense(self.sequence_output, 1, name='start_dense'), -1) + self.end_logits = tf.squeeze(dense(self.sequence_output, 1, name='end_dense'), -1) + self.start_logits += tf.cast(-10000. * (1 - input_mask), self.start_logits.dtype) + self.end_logits += tf.cast(-10000. 
* (1 - input_mask), self.end_logits.dtype) + + if is_training and start_positions is not None and end_positions is not None: + start_loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.cast(self.start_logits, tf.float32), + labels=start_positions) + end_loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=tf.cast(self.end_logits, tf.float32), + labels=end_positions) + start_loss = tf.reduce_mean(start_loss_) + end_loss = tf.reduce_mean(end_loss_) + self.train_loss = (start_loss + end_loss) / 2.0 diff --git a/optimizations/tf_optimization.py b/optimizations/tf_optimization.py new file mode 100644 index 0000000..fe9f6ad --- /dev/null +++ b/optimizations/tf_optimization.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +class Optimizer(object): + def __init__(self, loss, init_lr, num_train_steps, num_warmup_steps, + hvd=None, use_fp16=False, loss_count=1000, clip_norm=1.0, + init_loss_scale=2 ** 16, beta1=0.9, beta2=0.999): + """Creates an optimizer training op.""" + self.global_step = tf.train.get_or_create_global_step() + + # avoid step change in learning rate at end of warmup phase + decayed_learning_rate_at_crossover_point = init_lr * (1.0 - float(num_warmup_steps) / float(num_train_steps)) + adjusted_init_lr = init_lr * (init_lr / decayed_learning_rate_at_crossover_point) + learning_rate = tf.constant(value=adjusted_init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + self.global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(self.global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ((1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + self.learning_rate = learning_rate + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) 
+ optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=beta1, + beta_2=beta2, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if hvd is not None: + from horovod.tensorflow.compression import Compression + optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) + if use_fp16: + loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager( + init_loss_scale=init_loss_scale, + incr_every_n_steps=loss_count, + decr_every_n_nan_or_inf=2, + decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + self.loss_scale = loss_scale_manager.get_loss_scale() + + tvars = tf.trainable_variables() + grads_and_vars = optimizer.compute_gradients(loss, tvars) + grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] + grads, tvars = list(zip(*grads_and_vars)) + all_are_finite = tf.reduce_all( + [tf.reduce_all(tf.is_finite(g)) for g in grads]) if use_fp16 else tf.constant(True, dtype=tf.bool) + + # This is how the model was pre-trained. + # ensure global norm is a finite number + # to prevent clip_by_global_norm from having a hizzy fit. + (clipped_grads, _) = tf.clip_by_global_norm( + grads, clip_norm=clip_norm, + use_norm=tf.cond( + all_are_finite, + lambda: tf.global_norm(grads), + lambda: tf.constant(clip_norm))) + + train_op = optimizer.apply_gradients( + list(zip(clipped_grads, tvars)), global_step=self.global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = tf.cond(all_are_finite, lambda: self.global_step + 1, lambda: self.global_step) + new_global_step = tf.identity(new_global_step, name='step_update') + self.train_op = tf.group(train_op, [self.global_step.assign(new_global_step)]) + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs a AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = tf.identity(learning_rate, name='learning_rate') + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. 
+ next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/utils.py b/utils.py index 2fa746f..c6f7e81 100644 --- a/utils.py +++ b/utils.py @@ -7,19 +7,20 @@ from glob import glob -def check_args(args): +def check_args(args, rank=0): args.setting_file = os.path.join(args.checkpoint_dir, args.setting_file) args.log_file = os.path.join(args.checkpoint_dir, args.log_file) - os.makedirs(args.checkpoint_dir, exist_ok=True) - with open(args.setting_file, 'wt') as opt_file: - opt_file.write('------------ Options -------------\n') - print('------------ Options -------------') - for k in args.__dict__: - v = args.__dict__[k] - opt_file.write('%s: %s\n' % (str(k), str(v))) - print('%s: %s' % (str(k), str(v))) - opt_file.write('-------------- End ----------------\n') - print('------------ End -------------') + if rank == 0: + os.makedirs(args.checkpoint_dir, exist_ok=True) + with open(args.setting_file, 'wt') as opt_file: + opt_file.write('------------ Options -------------\n') + print('------------ Options -------------') + for k in args.__dict__: + v = args.__dict__[k] + opt_file.write('%s: %s\n' % (str(k), str(v))) + print('%s: %s' % (str(k), str(v))) + opt_file.write('-------------- End ----------------\n') + print('------------ End -------------') return args
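The `data_generator` in cmrc2018_finetune_tf.py yields batches forever from a dict of arrays, so the training loop must pull exactly `steps_per_epoch` batches per epoch. A small usage sketch, assuming the patch is on the import path; the feature keys here are made up for illustration:

```python
import numpy as np
from cmrc2018_finetune_tf import data_generator  # from this patch

# fake features with hypothetical keys, just to show the batching contract
fake_features = [{'input_ids': np.zeros(8, dtype=np.int64), 'unique_id': i} for i in range(10)]
gen = data_generator(fake_features, n_batch=4, shuffle=True)

steps_per_epoch = (len(fake_features) + 4 - 1) // 4  # last batch is smaller (drop_last=False)
for _ in range(steps_per_epoch):                      # the generator never raises StopIteration
    batch = next(gen)
    print(batch['input_ids'].shape, batch['unique_id'])
```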
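On the FP16 handling in models/tf_modeling.py: `float32_variable_storage_getter` stores trainable variables as float32 master copies and returns a float16 cast to the graph, so the optimizer state stays in full precision while the forward/backward pass runs in half precision. A minimal TF 1.x sketch of using `get_custom_getter` outside the BERT model (illustration only, not part of the patch):

```python
import tensorflow as tf
from models.tf_modeling import get_custom_getter  # from this patch

with tf.variable_scope("fp16_demo", custom_getter=get_custom_getter(tf.float16)):
    x = tf.placeholder(tf.float16, [None, 8], name="x")
    w = tf.get_variable("w", [8, 4], dtype=tf.float16)  # stored as fp32, used as fp16
    y = tf.matmul(x, w)

# the trainable variables (the master copies) remain float32
print([(v.name, v.dtype.base_dtype) for v in tf.trainable_variables()])
```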
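The additive attention mask in `attention_layer` (and the `-10000. * (1 - input_mask)` terms added to the start/end logits in `BertModelMRC`) pushes padded positions toward a very negative score so that the softmax assigns them essentially zero weight. A numpy sketch of the arithmetic, with made-up shapes:

```python
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

size_per_head = 64
input_mask = np.array([[1, 1, 1, 0]], dtype=np.float32)   # last token is padding
attention_mask = input_mask.reshape(1, 1, 1, -1)          # [bs, 1, 1, len], broadcasts over heads
qk = np.random.randn(1, 2, 4, 4).astype(np.float32)       # [bs, head, len, len] raw scores
qk += -10000.0 * (1.0 - attention_mask)                   # padded keys get a huge negative score
probs = softmax(qk / np.sqrt(size_per_head), axis=-1)
print(probs[0, 0, 0])                                     # weight on the padded position is ~0
```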
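The learning-rate schedule in optimizations/tf_optimization.py pre-scales the initial rate (`adjusted_init_lr`) so that the linear warmup and the polynomial (power=1) decay meet exactly at the end of warmup, avoiding a step change. A plain-Python sketch of the same arithmetic (my own illustration, not the TF graph code):

```python
def lr_at(step, init_lr, num_train_steps, num_warmup_steps):
    # value the decay curve should have at the point where warmup ends
    crossover = init_lr * (1.0 - float(num_warmup_steps) / num_train_steps)
    adjusted_init_lr = init_lr * (init_lr / crossover)
    decayed = adjusted_init_lr * (1.0 - min(step, num_train_steps) / float(num_train_steps))
    if step < num_warmup_steps:
        return init_lr * step / float(num_warmup_steps)  # linear warmup
    return decayed                                        # linear decay to 0

# warmup and decay meet at step == num_warmup_steps, with no jump:
print(lr_at(99, 3e-5, 1000, 100), lr_at(100, 3e-5, 1000, 100), lr_at(1000, 3e-5, 1000, 100))
```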
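The dynamic loss scaling configured in `Optimizer` (incr_every_n_steps=args.loss_count, decr_every_n_nan_or_inf=2, decr_ratio=0.5) grows the scale after a long run of finite gradients, halves it after repeated overflows, and skips the offending updates. The class below is only a rough behavioural sketch of that policy, not the tf.contrib.mixed_precision implementation:

```python
class LossScalePolicy:
    """Rough sketch of the dynamic loss-scale rule used in Optimizer (illustration only)."""

    def __init__(self, init_scale=2.0 ** 15, incr_every_n_steps=1000,
                 decr_every_n_nan_or_inf=2, incr_ratio=2.0, decr_ratio=0.5):
        self.scale = init_scale
        self.incr_every_n_steps = incr_every_n_steps
        self.decr_every_n_nan_or_inf = decr_every_n_nan_or_inf
        self.incr_ratio = incr_ratio
        self.decr_ratio = decr_ratio
        self.good_steps = 0
        self.bad_steps = 0

    def update(self, grads_all_finite):
        """Return True if this step's gradients should be applied."""
        if grads_all_finite:
            self.good_steps += 1
            self.bad_steps = 0
            if self.good_steps >= self.incr_every_n_steps:
                self.scale *= self.incr_ratio   # scale up after a long run of finite grads
                self.good_steps = 0
            return True
        self.good_steps = 0
        self.bad_steps += 1
        if self.bad_steps >= self.decr_every_n_nan_or_inf:
            self.scale *= self.decr_ratio       # back off after repeated overflow
            self.bad_steps = 0
        return False                            # skip the update, as the graph op does
```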