From 2ea90461e0a0321945f880330b629ce09e0e3fd2 Mon Sep 17 00:00:00 2001 From: b Date: Fri, 22 Nov 2019 16:58:05 +0800 Subject: [PATCH] init --- baselines/README.md | 167 + .../albert_config/albert_config_base.json | 23 + .../albert_config/albert_config_large.json | 23 + .../albert_config/albert_config_tiny.json | 23 + .../albert_config/albert_config_xlarge.json | 23 + .../albert_config/albert_config_xxlarge.json | 23 + .../albert/albert_config/bert_config.json | 19 + .../models/albert/albert_config/vocab.txt | 21128 ++++++++++++++++ baselines/models/albert/bert_utils.py | 143 + .../models/albert/create_pretrain_data.sh | 6 + .../models/albert/create_pretraining_data.py | 708 + baselines/models/albert/modeling.py | 1264 + baselines/models/albert/optimization.py | 300 + .../models/albert/optimization_finetuning.py | 174 + .../resources/add_data_removing_dropout.jpg | Bin 0 -> 98747 bytes .../albert/resources/albert_configuration.jpg | Bin 0 -> 92070 bytes .../albert/resources/albert_performance.jpg | Bin 0 -> 120584 bytes .../create_pretraining_data_roberta.py | 630 + .../create_pretrain_data_batch_webtext.sh | 10 + .../albert/resources/state_of_the_art.jpg | Bin 0 -> 121174 bytes baselines/models/albert/run_classifier.py | 1493 ++ baselines/models/albert/run_classifier_bq.sh | 68 + .../models/albert/run_classifier_iflytek.sh | 68 + .../models/albert/run_classifier_inews.sh | 68 + .../models/albert/run_classifier_lcqmc.sh | 64 + .../models/albert/run_classifier_thucnews.sh | 68 + .../models/albert/run_classifier_tnews.sh | 68 + .../models/albert/run_classifier_xnli.sh | 68 + baselines/models/albert/run_pretraining.py | 501 + baselines/models/albert/test_changes.py | 87 + baselines/models/albert/tokenization.py | 401 + .../models/albert/tpu/run_classifier_inews.sh | 21 + .../albert/tpu/run_classifier_inews_tiny.sh | 22 + .../models/albert/tpu/run_classifier_lcqmc.sh | 21 + .../albert/tpu/run_classifier_lcqmc_tiny.sh | 22 + .../albert/tpu/run_classifier_thucnews.sh | 21 + .../tpu/run_classifier_thucnews_tiny.sh | 21 + .../models/albert/tpu/run_classifier_tnews.sh | 21 + .../albert/tpu/run_classifier_tnews_tiny.sh | 22 + .../models/albert/tpu/run_classifier_xnli.sh | 21 + .../albert/tpu/run_classifier_xnli_tiny.sh | 21 + baselines/models/bert/.gitignore | 116 + baselines/models/bert/CONTRIBUTING.md | 31 + baselines/models/bert/LICENSE | 202 + baselines/models/bert/__init__.py | 15 + baselines/models/bert/conlleval.py | 300 + .../models/bert/create_pretraining_data.py | 469 + baselines/models/bert/extract_features.py | 419 + baselines/models/bert/modeling.py | 986 + baselines/models/bert/modeling_test.py | 277 + baselines/models/bert/multilingual.md | 303 + baselines/models/bert/optimization.py | 174 + baselines/models/bert/optimization_test.py | 48 + ...ng_movie_reviews_with_bert_on_tf_hub.ipynb | 1231 + baselines/models/bert/requirements.txt | 2 + baselines/models/bert/run_classifier.py | 1592 ++ baselines/models/bert/run_classifier_bq.sh | 75 + .../models/bert/run_classifier_iflydata.sh | 75 + baselines/models/bert/run_classifier_inews.sh | 75 + baselines/models/bert/run_classifier_lcqmc.sh | 71 + .../models/bert/run_classifier_thucnews.sh | 75 + baselines/models/bert/run_classifier_tnews.sh | 75 + .../models/bert/run_classifier_with_tfhub.py | 314 + baselines/models/bert/run_classifier_xnli.sh | 75 + baselines/models/bert/run_ner.py | 852 + baselines/models/bert/run_ner_msra.sh | 20 + baselines/models/bert/run_pretraining.py | 493 + baselines/models/bert/run_squad.py | 1283 + 
baselines/models/bert/sample_text.txt | 33 + baselines/models/bert/tf_metrics.py | 215 + baselines/models/bert/tokenization.py | 399 + baselines/models/bert/tokenization_test.py | 137 + .../models/bert/tpu/run_classifier_inews.sh | 21 + .../bert/tpu/run_classifier_jdcomment.sh | 21 + .../models/bert/tpu/run_classifier_lcqmc.sh | 21 + .../bert/tpu/run_classifier_thucnews.sh | 21 + .../models/bert/tpu/run_classifier_tnews.sh | 21 + .../models/bert/tpu/run_classifier_xnli.sh | 21 + baselines/models/bert_wwm_ext/.gitignore | 116 + baselines/models/bert_wwm_ext/CONTRIBUTING.md | 31 + baselines/models/bert_wwm_ext/LICENSE | 202 + baselines/models/bert_wwm_ext/__init__.py | 15 + baselines/models/bert_wwm_ext/conlleval.py | 300 + .../bert_wwm_ext/create_pretraining_data.py | 469 + .../models/bert_wwm_ext/extract_features.py | 419 + baselines/models/bert_wwm_ext/modeling.py | 986 + .../models/bert_wwm_ext/modeling_test.py | 277 + baselines/models/bert_wwm_ext/multilingual.md | 303 + baselines/models/bert_wwm_ext/optimization.py | 174 + .../models/bert_wwm_ext/optimization_test.py | 48 + ...ng_movie_reviews_with_bert_on_tf_hub.ipynb | 1231 + .../models/bert_wwm_ext/requirements.txt | 2 + .../models/bert_wwm_ext/run_classifier.py | 1591 ++ .../models/bert_wwm_ext/run_classifier_bq.sh | 67 + .../bert_wwm_ext/run_classifier_iflydata.sh | 67 + .../bert_wwm_ext/run_classifier_inews.sh | 67 + .../bert_wwm_ext/run_classifier_lcqmc.sh | 63 + .../bert_wwm_ext/run_classifier_thucnews.sh | 67 + .../bert_wwm_ext/run_classifier_tnews.sh | 67 + .../bert_wwm_ext/run_classifier_with_tfhub.py | 314 + .../bert_wwm_ext/run_classifier_xnli.sh | 67 + baselines/models/bert_wwm_ext/run_ner.py | 844 + baselines/models/bert_wwm_ext/run_ner_msra.sh | 20 + .../models/bert_wwm_ext/run_pretraining.py | 493 + baselines/models/bert_wwm_ext/run_squad.py | 1283 + baselines/models/bert_wwm_ext/sample_text.txt | 33 + baselines/models/bert_wwm_ext/tf_metrics.py | 215 + baselines/models/bert_wwm_ext/tokenization.py | 399 + .../models/bert_wwm_ext/tokenization_test.py | 137 + .../bert_wwm_ext/tpu/run_classifier_inews.sh | 21 + .../bert_wwm_ext/tpu/run_classifier_lcqmc.sh | 21 + .../tpu/run_classifier_thucnews.sh | 21 + .../bert_wwm_ext/tpu/run_classifier_tnews.sh | 21 + .../bert_wwm_ext/tpu/run_classifier_xnli.sh | 21 + baselines/models/ernie/.gitignore | 116 + baselines/models/ernie/CONTRIBUTING.md | 31 + baselines/models/ernie/LICENSE | 202 + baselines/models/ernie/__init__.py | 15 + baselines/models/ernie/conlleval.py | 300 + .../models/ernie/create_pretraining_data.py | 469 + baselines/models/ernie/extract_features.py | 419 + baselines/models/ernie/modeling.py | 986 + baselines/models/ernie/modeling_test.py | 277 + baselines/models/ernie/multilingual.md | 303 + baselines/models/ernie/optimization.py | 174 + baselines/models/ernie/optimization_test.py | 48 + ...ng_movie_reviews_with_bert_on_tf_hub.ipynb | 1231 + baselines/models/ernie/requirements.txt | 2 + baselines/models/ernie/run_classifier.py | 1578 ++ baselines/models/ernie/run_classifier_bq.sh | 67 + .../models/ernie/run_classifier_iflydata.sh | 67 + .../models/ernie/run_classifier_inews.sh | 67 + .../models/ernie/run_classifier_lcqmc.sh | 63 + .../models/ernie/run_classifier_thucnews.sh | 67 + .../models/ernie/run_classifier_tnews.sh | 67 + .../models/ernie/run_classifier_with_tfhub.py | 314 + baselines/models/ernie/run_classifier_xnli.sh | 67 + baselines/models/ernie/run_ner.py | 844 + baselines/models/ernie/run_ner_msra.sh | 20 + baselines/models/ernie/run_pretraining.py | 
493 + baselines/models/ernie/run_squad.py | 1283 + baselines/models/ernie/sample_text.txt | 33 + baselines/models/ernie/tf_metrics.py | 215 + baselines/models/ernie/tokenization.py | 399 + baselines/models/ernie/tokenization_test.py | 137 + .../models/ernie/tpu/run_classifier_inews.sh | 21 + .../models/ernie/tpu/run_classifier_lcqmc.sh | 21 + .../ernie/tpu/run_classifier_thucnews.sh | 21 + .../models/ernie/tpu/run_classifier_tnews.sh | 21 + .../models/ernie/tpu/run_classifier_xnli.sh | 21 + baselines/models/roberta/conlleval.py | 300 + .../models/roberta/create_pretrain_data.sh | 9 + .../models/roberta/create_pretraining_data.py | 630 + baselines/models/roberta/modeling.py | 986 + baselines/models/roberta/optimization.py | 203 + .../models/roberta/optimization_finetuning.py | 174 + .../RoBERTa_zh_Large_Learning_Curve.png | Bin 0 -> 195127 bytes baselines/models/roberta/resources/vocab.txt | 21128 ++++++++++++++++ baselines/models/roberta/run_classifier.py | 1486 ++ baselines/models/roberta/run_classifier_bq.sh | 67 + .../models/roberta/run_classifier_iflydata.sh | 67 + .../models/roberta/run_classifier_inews.sh | 67 + .../models/roberta/run_classifier_lcqmc.sh | 63 + .../models/roberta/run_classifier_thucnews.sh | 67 + .../models/roberta/run_classifier_tnews.sh | 67 + .../models/roberta/run_classifier_xnli.sh | 67 + baselines/models/roberta/run_ner.py | 844 + baselines/models/roberta/run_ner_msra.sh | 20 + baselines/models/roberta/run_pretraining.py | 498 + baselines/models/roberta/tf_metrics.py | 215 + baselines/models/roberta/tokenization.py | 401 + .../roberta/tpu/run_classifier_inews.sh | 21 + .../roberta/tpu/run_classifier_jdcomment.sh | 21 + .../roberta/tpu/run_classifier_lcqmc.sh | 21 + .../roberta/tpu/run_classifier_thucnews.sh | 21 + .../roberta/tpu/run_classifier_tnews.sh | 21 + .../models/roberta/tpu/run_classifier_xnli.sh | 21 + .../models/roberta_wwm_ext/CONTRIBUTING.md | 31 + baselines/models/roberta_wwm_ext/LICENSE | 202 + baselines/models/roberta_wwm_ext/__init__.py | 15 + baselines/models/roberta_wwm_ext/conlleval.py | 300 + .../create_pretraining_data.py | 469 + .../roberta_wwm_ext/extract_features.py | 419 + baselines/models/roberta_wwm_ext/modeling.py | 986 + .../models/roberta_wwm_ext/modeling_test.py | 277 + .../models/roberta_wwm_ext/multilingual.md | 303 + .../models/roberta_wwm_ext/optimization.py | 174 + .../roberta_wwm_ext/optimization_test.py | 48 + .../models/roberta_wwm_ext/requirements.txt | 2 + .../models/roberta_wwm_ext/run_classifier.py | 1540 ++ .../roberta_wwm_ext/run_classifier_bq.sh | 67 + .../run_classifier_iflydata.sh | 67 + .../roberta_wwm_ext/run_classifier_inews.sh | 67 + .../roberta_wwm_ext/run_classifier_lcqmc.sh | 63 + .../run_classifier_thucnews.sh | 67 + .../roberta_wwm_ext/run_classifier_tnews.sh | 67 + .../run_classifier_with_tfhub.py | 314 + .../roberta_wwm_ext/run_classifier_xnli.sh | 67 + baselines/models/roberta_wwm_ext/run_ner.py | 844 + .../models/roberta_wwm_ext/run_ner_msra.sh | 20 + .../models/roberta_wwm_ext/run_pretraining.py | 493 + baselines/models/roberta_wwm_ext/run_squad.py | 1283 + .../models/roberta_wwm_ext/tf_metrics.py | 215 + .../models/roberta_wwm_ext/tokenization.py | 399 + .../roberta_wwm_ext/tokenization_test.py | 137 + .../tpu/run_classifier_inews.sh | 21 + .../tpu/run_classifier_jdcomment.sh | 21 + .../tpu/run_classifier_lcqmc.sh | 21 + .../tpu/run_classifier_thucnews.sh | 21 + .../tpu/run_classifier_tnews.sh | 21 + .../tpu/run_classifier_xnli.sh | 21 + .../roberta_wwm_large_ext/CONTRIBUTING.md | 31 + 
.../models/roberta_wwm_large_ext/LICENSE | 202 + .../models/roberta_wwm_large_ext/__init__.py | 15 + .../models/roberta_wwm_large_ext/conlleval.py | 300 + .../create_pretraining_data.py | 469 + .../roberta_wwm_large_ext/extract_features.py | 419 + .../models/roberta_wwm_large_ext/modeling.py | 986 + .../roberta_wwm_large_ext/modeling_test.py | 277 + .../roberta_wwm_large_ext/multilingual.md | 303 + .../roberta_wwm_large_ext/optimization.py | 174 + .../optimization_test.py | 48 + ...ng_movie_reviews_with_bert_on_tf_hub.ipynb | 1231 + .../roberta_wwm_large_ext/requirements.txt | 2 + .../roberta_wwm_large_ext/run_classifier.py | 1585 ++ .../run_classifier_bq.sh | 67 + .../run_classifier_iflydata.sh | 67 + .../run_classifier_inews.sh | 67 + .../run_classifier_lcqmc.sh | 63 + .../run_classifier_thucnews.sh | 67 + .../run_classifier_tnews.sh | 67 + .../run_classifier_with_tfhub.py | 314 + .../run_classifier_xnli.sh | 67 + .../models/roberta_wwm_large_ext/run_ner.py | 844 + .../roberta_wwm_large_ext/run_ner_msra.sh | 20 + .../roberta_wwm_large_ext/run_pretraining.py | 493 + .../models/roberta_wwm_large_ext/run_squad.py | 1283 + .../roberta_wwm_large_ext/tf_metrics.py | 215 + .../roberta_wwm_large_ext/tokenization.py | 399 + .../tokenization_test.py | 137 + .../tpu/run_classifier_inews.sh | 21 + .../tpu/run_classifier_jdcomment.sh | 21 + .../tpu/run_classifier_lcqmc.sh | 21 + .../tpu/run_classifier_thucnews.sh | 21 + .../tpu/run_classifier_tnews.sh | 21 + .../tpu/run_classifier_xnli.sh | 21 + baselines/models/xlnet/__init__.py | 0 baselines/models/xlnet/classifier_utils.py | 246 + .../models/xlnet/cmrc2018_evaluate_drcd.py | 151 + baselines/models/xlnet/data_utils.py | 915 + baselines/models/xlnet/function_builder.py | 362 + baselines/models/xlnet/gpu_utils.py | 69 + baselines/models/xlnet/model_utils.py | 399 + baselines/models/xlnet/modeling.py | 783 + baselines/models/xlnet/prepro_utils.py | 138 + baselines/models/xlnet/run_classifier.py | 1360 + baselines/models/xlnet/run_classifier_bq.sh | 78 + .../models/xlnet/run_classifier_iflydata.sh | 78 + .../models/xlnet/run_classifier_inews.sh | 78 + .../models/xlnet/run_classifier_lcqmc.sh | 74 + .../models/xlnet/run_classifier_thucnews.py | 1380 + .../models/xlnet/run_classifier_thucnews.sh | 80 + .../models/xlnet/run_classifier_tnews.sh | 78 + baselines/models/xlnet/run_classifier_xnli.sh | 78 + baselines/models/xlnet/run_cmrc_drcd.py | 1293 + baselines/models/xlnet/spiece.model | Bin 0 -> 691427 bytes baselines/models/xlnet/squad_utils.py | 327 + baselines/models/xlnet/summary.py | 128 + baselines/models/xlnet/temp.sh | 2 + .../models/xlnet/tpu/run_classifier_inews.sh | 28 + .../models/xlnet/tpu/run_classifier_lcqmc.sh | 28 + .../models/xlnet/tpu/run_classifier_tnews.sh | 28 + .../models/xlnet/tpu/run_classifier_xnli.sh | 28 + baselines/models/xlnet/tpu_estimator.py | 3522 +++ baselines/models/xlnet/xlnet.py | 292 + .../classifier_pytorch/.gitignore | 104 + .../classifier_pytorch/README.md | 118 + .../chineseGLUEdatasets/inews/.gitignore | 104 + .../chineseGLUEdatasets/lcqmc/.gitignore | 104 + .../chineseGLUEdatasets/tnews/.gitignore | 104 + .../chineseGLUEdatasets/xnli/.gitignore | 104 + ...lbert_original_tf_checkpoint_to_pytorch.py | 72 + ..._bert_original_tf_checkpoint_to_pytorch.py | 65 + ...rnie_original_pad_checkpoint_to_pytorch.py | 217 + ...xlnet_original_tf_checkpoint_to_pytorch.py | 104 + .../classifier_pytorch/metrics/__init__.py | 0 .../metrics/glue_compute_metrics.py | 88 + .../outputs/inews_output/.gitignore | 104 + 
.../outputs/lcqmc_output/.gitignore | 104 + .../outputs/tnews_output/.gitignore | 104 + .../outputs/xnli_output/.gitignore | 104 + .../classifier_pytorch/processors/__init__.py | 4 + .../classifier_pytorch/processors/glue.py | 697 + .../classifier_pytorch/processors/utils.py | 120 + .../classifier_pytorch/run_classifier.py | 522 + .../run_classifier_inews.sh | 25 + .../run_classifier_lcqmc.sh | 25 + .../run_classifier_tnews.sh | 25 + .../classifier_pytorch/run_classifier_xnli.sh | 22 + .../classifier_pytorch/tools/common.py | 353 + .../classifier_pytorch/tools/progressbar.py | 59 + .../transformers/__init__.py | 96 + .../transformers/__main__.py | 129 + .../transformers/configuration_auto.py | 137 + .../transformers/configuration_bert.py | 115 + .../transformers/configuration_ctrl.py | 143 + .../transformers/configuration_distilbert.py | 89 + .../transformers/configuration_gpt2.py | 144 + .../transformers/configuration_openai.py | 134 + .../transformers/configuration_roberta.py | 35 + .../transformers/configuration_transfo_xl.py | 168 + .../transformers/configuration_utils.py | 207 + .../transformers/configuration_xlm.py | 181 + .../transformers/configuration_xlnet.py | 170 + .../transformers/file_utils.py | 324 + .../transformers/modeling_albert.py | 1065 + .../transformers/modeling_auto.py | 503 + .../transformers/modeling_bert.py | 1149 + .../transformers/modeling_ctrl.py | 485 + .../transformers/modeling_distilbert.py | 693 + .../transformers/modeling_gpt2.py | 662 + .../transformers/modeling_openai.py | 621 + .../transformers/modeling_roberta.py | 470 + .../transformers/modeling_transfo_xl.py | 890 + .../modeling_transfo_xl_utilities.py | 332 + .../transformers/modeling_utils.py | 817 + .../transformers/modeling_xlm.py | 886 + .../transformers/modeling_xlnet.py | 1368 + .../transformers/optimization.py | 189 + .../transformers/tokenization_auto.py | 124 + .../transformers/tokenization_bert.py | 502 + .../transformers/tokenization_ctrl.py | 187 + .../transformers/tokenization_distilbert.py | 62 + .../transformers/tokenization_gpt2.py | 234 + .../transformers/tokenization_openai.py | 208 + .../transformers/tokenization_roberta.py | 140 + .../transformers/tokenization_transfo_xl.py | 579 + .../transformers/tokenization_utils.py | 1068 + .../transformers/tokenization_xlm.py | 833 + .../transformers/tokenization_xlnet.py | 253 + .../convert_tf_checkpoint_to_pytorch.py | 124 + .../mrc_pytorch/preprocess/DRCD_output.py | 469 + .../mrc_pytorch/preprocess/DRCD_preprocess.py | 345 + .../preprocess/cmrc2018_evaluate.py | 216 + .../mrc_pytorch/preprocess/cmrc2018_output.py | 471 + .../preprocess/cmrc2018_preprocess.py | 362 + .../mrc_pytorch/pytorch_modeling.py | 1205 + .../models_pytorch/mrc_pytorch/run_mrc.py | 287 + .../mrc_pytorch/run_mrc_cmrc2018.sh | 27 + .../mrc_pytorch/run_mrc_drcd.sh | 27 + .../mrc_pytorch/tools/file_utils.py | 238 + .../mrc_pytorch/tools/langconv.py | 273 + .../mrc_pytorch/tools/offical_tokenization.py | 343 + .../mrc_pytorch/tools/pytorch_optimization.py | 197 + .../models_pytorch/mrc_pytorch/tools/utils.py | 146 + .../mrc_pytorch/tools/zh_wiki.py | 8287 ++++++ 356 files changed, 151089 insertions(+) create mode 100644 baselines/README.md create mode 100644 baselines/models/albert/albert_config/albert_config_base.json create mode 100644 baselines/models/albert/albert_config/albert_config_large.json create mode 100644 baselines/models/albert/albert_config/albert_config_tiny.json create mode 100644 baselines/models/albert/albert_config/albert_config_xlarge.json create 
mode 100644 baselines/models/albert/albert_config/albert_config_xxlarge.json create mode 100644 baselines/models/albert/albert_config/bert_config.json create mode 100644 baselines/models/albert/albert_config/vocab.txt create mode 100755 baselines/models/albert/bert_utils.py create mode 100755 baselines/models/albert/create_pretrain_data.sh create mode 100755 baselines/models/albert/create_pretraining_data.py create mode 100644 baselines/models/albert/modeling.py create mode 100755 baselines/models/albert/optimization.py create mode 100755 baselines/models/albert/optimization_finetuning.py create mode 100644 baselines/models/albert/resources/add_data_removing_dropout.jpg create mode 100644 baselines/models/albert/resources/albert_configuration.jpg create mode 100644 baselines/models/albert/resources/albert_performance.jpg create mode 100644 baselines/models/albert/resources/create_pretraining_data_roberta.py create mode 100644 baselines/models/albert/resources/shell_scripts/create_pretrain_data_batch_webtext.sh create mode 100644 baselines/models/albert/resources/state_of_the_art.jpg create mode 100755 baselines/models/albert/run_classifier.py create mode 100755 baselines/models/albert/run_classifier_bq.sh create mode 100644 baselines/models/albert/run_classifier_iflytek.sh create mode 100755 baselines/models/albert/run_classifier_inews.sh create mode 100755 baselines/models/albert/run_classifier_lcqmc.sh create mode 100644 baselines/models/albert/run_classifier_thucnews.sh create mode 100755 baselines/models/albert/run_classifier_tnews.sh create mode 100755 baselines/models/albert/run_classifier_xnli.sh create mode 100755 baselines/models/albert/run_pretraining.py create mode 100755 baselines/models/albert/test_changes.py create mode 100755 baselines/models/albert/tokenization.py create mode 100755 baselines/models/albert/tpu/run_classifier_inews.sh create mode 100755 baselines/models/albert/tpu/run_classifier_inews_tiny.sh create mode 100755 baselines/models/albert/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/albert/tpu/run_classifier_lcqmc_tiny.sh create mode 100755 baselines/models/albert/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/albert/tpu/run_classifier_thucnews_tiny.sh create mode 100755 baselines/models/albert/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/albert/tpu/run_classifier_tnews_tiny.sh create mode 100755 baselines/models/albert/tpu/run_classifier_xnli.sh create mode 100755 baselines/models/albert/tpu/run_classifier_xnli_tiny.sh create mode 100644 baselines/models/bert/.gitignore create mode 100644 baselines/models/bert/CONTRIBUTING.md create mode 100644 baselines/models/bert/LICENSE create mode 100644 baselines/models/bert/__init__.py create mode 100644 baselines/models/bert/conlleval.py create mode 100644 baselines/models/bert/create_pretraining_data.py create mode 100644 baselines/models/bert/extract_features.py create mode 100644 baselines/models/bert/modeling.py create mode 100644 baselines/models/bert/modeling_test.py create mode 100644 baselines/models/bert/multilingual.md create mode 100644 baselines/models/bert/optimization.py create mode 100644 baselines/models/bert/optimization_test.py create mode 100644 baselines/models/bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb create mode 100644 baselines/models/bert/requirements.txt create mode 100644 baselines/models/bert/run_classifier.py create mode 100644 baselines/models/bert/run_classifier_bq.sh create mode 100644 
baselines/models/bert/run_classifier_iflydata.sh create mode 100755 baselines/models/bert/run_classifier_inews.sh create mode 100644 baselines/models/bert/run_classifier_lcqmc.sh create mode 100644 baselines/models/bert/run_classifier_thucnews.sh create mode 100644 baselines/models/bert/run_classifier_tnews.sh create mode 100644 baselines/models/bert/run_classifier_with_tfhub.py create mode 100644 baselines/models/bert/run_classifier_xnli.sh create mode 100644 baselines/models/bert/run_ner.py create mode 100644 baselines/models/bert/run_ner_msra.sh create mode 100644 baselines/models/bert/run_pretraining.py create mode 100644 baselines/models/bert/run_squad.py create mode 100644 baselines/models/bert/sample_text.txt create mode 100644 baselines/models/bert/tf_metrics.py create mode 100644 baselines/models/bert/tokenization.py create mode 100644 baselines/models/bert/tokenization_test.py create mode 100755 baselines/models/bert/tpu/run_classifier_inews.sh create mode 100755 baselines/models/bert/tpu/run_classifier_jdcomment.sh create mode 100755 baselines/models/bert/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/bert/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/bert/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/bert/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/bert_wwm_ext/.gitignore create mode 100644 baselines/models/bert_wwm_ext/CONTRIBUTING.md create mode 100644 baselines/models/bert_wwm_ext/LICENSE create mode 100644 baselines/models/bert_wwm_ext/__init__.py create mode 100644 baselines/models/bert_wwm_ext/conlleval.py create mode 100644 baselines/models/bert_wwm_ext/create_pretraining_data.py create mode 100644 baselines/models/bert_wwm_ext/extract_features.py create mode 100644 baselines/models/bert_wwm_ext/modeling.py create mode 100644 baselines/models/bert_wwm_ext/modeling_test.py create mode 100644 baselines/models/bert_wwm_ext/multilingual.md create mode 100644 baselines/models/bert_wwm_ext/optimization.py create mode 100644 baselines/models/bert_wwm_ext/optimization_test.py create mode 100644 baselines/models/bert_wwm_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb create mode 100644 baselines/models/bert_wwm_ext/requirements.txt create mode 100644 baselines/models/bert_wwm_ext/run_classifier.py create mode 100644 baselines/models/bert_wwm_ext/run_classifier_bq.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_iflydata.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_inews.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_lcqmc.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_thucnews.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_tnews.sh create mode 100644 baselines/models/bert_wwm_ext/run_classifier_with_tfhub.py create mode 100644 baselines/models/bert_wwm_ext/run_classifier_xnli.sh create mode 100644 baselines/models/bert_wwm_ext/run_ner.py create mode 100644 baselines/models/bert_wwm_ext/run_ner_msra.sh create mode 100644 baselines/models/bert_wwm_ext/run_pretraining.py create mode 100644 baselines/models/bert_wwm_ext/run_squad.py create mode 100644 baselines/models/bert_wwm_ext/sample_text.txt create mode 100644 baselines/models/bert_wwm_ext/tf_metrics.py create mode 100644 baselines/models/bert_wwm_ext/tokenization.py create mode 100644 baselines/models/bert_wwm_ext/tokenization_test.py create mode 100755 baselines/models/bert_wwm_ext/tpu/run_classifier_inews.sh create mode 100755 
baselines/models/bert_wwm_ext/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/bert_wwm_ext/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/bert_wwm_ext/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/bert_wwm_ext/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/ernie/.gitignore create mode 100644 baselines/models/ernie/CONTRIBUTING.md create mode 100644 baselines/models/ernie/LICENSE create mode 100644 baselines/models/ernie/__init__.py create mode 100644 baselines/models/ernie/conlleval.py create mode 100644 baselines/models/ernie/create_pretraining_data.py create mode 100644 baselines/models/ernie/extract_features.py create mode 100644 baselines/models/ernie/modeling.py create mode 100644 baselines/models/ernie/modeling_test.py create mode 100644 baselines/models/ernie/multilingual.md create mode 100644 baselines/models/ernie/optimization.py create mode 100644 baselines/models/ernie/optimization_test.py create mode 100644 baselines/models/ernie/predicting_movie_reviews_with_bert_on_tf_hub.ipynb create mode 100644 baselines/models/ernie/requirements.txt create mode 100644 baselines/models/ernie/run_classifier.py create mode 100644 baselines/models/ernie/run_classifier_bq.sh create mode 100644 baselines/models/ernie/run_classifier_iflydata.sh create mode 100644 baselines/models/ernie/run_classifier_inews.sh create mode 100644 baselines/models/ernie/run_classifier_lcqmc.sh create mode 100644 baselines/models/ernie/run_classifier_thucnews.sh create mode 100644 baselines/models/ernie/run_classifier_tnews.sh create mode 100644 baselines/models/ernie/run_classifier_with_tfhub.py create mode 100644 baselines/models/ernie/run_classifier_xnli.sh create mode 100644 baselines/models/ernie/run_ner.py create mode 100644 baselines/models/ernie/run_ner_msra.sh create mode 100644 baselines/models/ernie/run_pretraining.py create mode 100644 baselines/models/ernie/run_squad.py create mode 100644 baselines/models/ernie/sample_text.txt create mode 100644 baselines/models/ernie/tf_metrics.py create mode 100644 baselines/models/ernie/tokenization.py create mode 100644 baselines/models/ernie/tokenization_test.py create mode 100755 baselines/models/ernie/tpu/run_classifier_inews.sh create mode 100755 baselines/models/ernie/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/ernie/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/ernie/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/ernie/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/roberta/conlleval.py create mode 100644 baselines/models/roberta/create_pretrain_data.sh create mode 100644 baselines/models/roberta/create_pretraining_data.py create mode 100644 baselines/models/roberta/modeling.py create mode 100644 baselines/models/roberta/optimization.py create mode 100644 baselines/models/roberta/optimization_finetuning.py create mode 100644 baselines/models/roberta/resources/RoBERTa_zh_Large_Learning_Curve.png create mode 100644 baselines/models/roberta/resources/vocab.txt create mode 100644 baselines/models/roberta/run_classifier.py create mode 100644 baselines/models/roberta/run_classifier_bq.sh create mode 100644 baselines/models/roberta/run_classifier_iflydata.sh create mode 100644 baselines/models/roberta/run_classifier_inews.sh create mode 100644 baselines/models/roberta/run_classifier_lcqmc.sh create mode 100644 baselines/models/roberta/run_classifier_thucnews.sh create mode 100644 baselines/models/roberta/run_classifier_tnews.sh 
create mode 100644 baselines/models/roberta/run_classifier_xnli.sh create mode 100644 baselines/models/roberta/run_ner.py create mode 100644 baselines/models/roberta/run_ner_msra.sh create mode 100644 baselines/models/roberta/run_pretraining.py create mode 100644 baselines/models/roberta/tf_metrics.py create mode 100644 baselines/models/roberta/tokenization.py create mode 100755 baselines/models/roberta/tpu/run_classifier_inews.sh create mode 100755 baselines/models/roberta/tpu/run_classifier_jdcomment.sh create mode 100755 baselines/models/roberta/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/roberta/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/roberta/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/roberta/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/roberta_wwm_ext/CONTRIBUTING.md create mode 100644 baselines/models/roberta_wwm_ext/LICENSE create mode 100644 baselines/models/roberta_wwm_ext/__init__.py create mode 100644 baselines/models/roberta_wwm_ext/conlleval.py create mode 100644 baselines/models/roberta_wwm_ext/create_pretraining_data.py create mode 100644 baselines/models/roberta_wwm_ext/extract_features.py create mode 100644 baselines/models/roberta_wwm_ext/modeling.py create mode 100644 baselines/models/roberta_wwm_ext/modeling_test.py create mode 100644 baselines/models/roberta_wwm_ext/multilingual.md create mode 100644 baselines/models/roberta_wwm_ext/optimization.py create mode 100644 baselines/models/roberta_wwm_ext/optimization_test.py create mode 100644 baselines/models/roberta_wwm_ext/requirements.txt create mode 100644 baselines/models/roberta_wwm_ext/run_classifier.py create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_bq.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_iflydata.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_inews.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_lcqmc.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_thucnews.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_tnews.sh create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_with_tfhub.py create mode 100644 baselines/models/roberta_wwm_ext/run_classifier_xnli.sh create mode 100644 baselines/models/roberta_wwm_ext/run_ner.py create mode 100644 baselines/models/roberta_wwm_ext/run_ner_msra.sh create mode 100644 baselines/models/roberta_wwm_ext/run_pretraining.py create mode 100644 baselines/models/roberta_wwm_ext/run_squad.py create mode 100644 baselines/models/roberta_wwm_ext/tf_metrics.py create mode 100644 baselines/models/roberta_wwm_ext/tokenization.py create mode 100644 baselines/models/roberta_wwm_ext/tokenization_test.py create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_inews.sh create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_jdcomment.sh create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/roberta_wwm_ext/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/roberta_wwm_large_ext/CONTRIBUTING.md create mode 100644 baselines/models/roberta_wwm_large_ext/LICENSE create mode 100644 baselines/models/roberta_wwm_large_ext/__init__.py create mode 100644 baselines/models/roberta_wwm_large_ext/conlleval.py create 
mode 100644 baselines/models/roberta_wwm_large_ext/create_pretraining_data.py create mode 100644 baselines/models/roberta_wwm_large_ext/extract_features.py create mode 100644 baselines/models/roberta_wwm_large_ext/modeling.py create mode 100644 baselines/models/roberta_wwm_large_ext/modeling_test.py create mode 100644 baselines/models/roberta_wwm_large_ext/multilingual.md create mode 100644 baselines/models/roberta_wwm_large_ext/optimization.py create mode 100644 baselines/models/roberta_wwm_large_ext/optimization_test.py create mode 100644 baselines/models/roberta_wwm_large_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb create mode 100644 baselines/models/roberta_wwm_large_ext/requirements.txt create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier.py create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_bq.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_iflydata.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_inews.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_lcqmc.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_thucnews.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_tnews.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_with_tfhub.py create mode 100644 baselines/models/roberta_wwm_large_ext/run_classifier_xnli.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_ner.py create mode 100644 baselines/models/roberta_wwm_large_ext/run_ner_msra.sh create mode 100644 baselines/models/roberta_wwm_large_ext/run_pretraining.py create mode 100644 baselines/models/roberta_wwm_large_ext/run_squad.py create mode 100644 baselines/models/roberta_wwm_large_ext/tf_metrics.py create mode 100644 baselines/models/roberta_wwm_large_ext/tokenization.py create mode 100644 baselines/models/roberta_wwm_large_ext/tokenization_test.py create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_inews.sh create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_jdcomment.sh create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_thucnews.sh create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/roberta_wwm_large_ext/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/xlnet/__init__.py create mode 100644 baselines/models/xlnet/classifier_utils.py create mode 100644 baselines/models/xlnet/cmrc2018_evaluate_drcd.py create mode 100644 baselines/models/xlnet/data_utils.py create mode 100644 baselines/models/xlnet/function_builder.py create mode 100644 baselines/models/xlnet/gpu_utils.py create mode 100644 baselines/models/xlnet/model_utils.py create mode 100644 baselines/models/xlnet/modeling.py create mode 100644 baselines/models/xlnet/prepro_utils.py create mode 100644 baselines/models/xlnet/run_classifier.py create mode 100644 baselines/models/xlnet/run_classifier_bq.sh create mode 100644 baselines/models/xlnet/run_classifier_iflydata.sh create mode 100644 baselines/models/xlnet/run_classifier_inews.sh create mode 100644 baselines/models/xlnet/run_classifier_lcqmc.sh create mode 100644 baselines/models/xlnet/run_classifier_thucnews.py create mode 100644 baselines/models/xlnet/run_classifier_thucnews.sh create mode 100644 baselines/models/xlnet/run_classifier_tnews.sh create mode 
100644 baselines/models/xlnet/run_classifier_xnli.sh create mode 100644 baselines/models/xlnet/run_cmrc_drcd.py create mode 100644 baselines/models/xlnet/spiece.model create mode 100644 baselines/models/xlnet/squad_utils.py create mode 100644 baselines/models/xlnet/summary.py create mode 100644 baselines/models/xlnet/temp.sh create mode 100755 baselines/models/xlnet/tpu/run_classifier_inews.sh create mode 100755 baselines/models/xlnet/tpu/run_classifier_lcqmc.sh create mode 100755 baselines/models/xlnet/tpu/run_classifier_tnews.sh create mode 100755 baselines/models/xlnet/tpu/run_classifier_xnli.sh create mode 100644 baselines/models/xlnet/tpu_estimator.py create mode 100644 baselines/models/xlnet/xlnet.py create mode 100644 baselines/models_pytorch/classifier_pytorch/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/README.md create mode 100644 baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/inews/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/lcqmc/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/tnews/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/xnli/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/convert_albert_original_tf_checkpoint_to_pytorch.py create mode 100644 baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py create mode 100644 baselines/models_pytorch/classifier_pytorch/convert_ernie_original_pad_checkpoint_to_pytorch.py create mode 100644 baselines/models_pytorch/classifier_pytorch/convert_xlnet_original_tf_checkpoint_to_pytorch.py create mode 100644 baselines/models_pytorch/classifier_pytorch/metrics/__init__.py create mode 100644 baselines/models_pytorch/classifier_pytorch/metrics/glue_compute_metrics.py create mode 100644 baselines/models_pytorch/classifier_pytorch/outputs/inews_output/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/outputs/lcqmc_output/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/outputs/xnli_output/.gitignore create mode 100644 baselines/models_pytorch/classifier_pytorch/processors/__init__.py create mode 100644 baselines/models_pytorch/classifier_pytorch/processors/glue.py create mode 100644 baselines/models_pytorch/classifier_pytorch/processors/utils.py create mode 100644 baselines/models_pytorch/classifier_pytorch/run_classifier.py create mode 100644 baselines/models_pytorch/classifier_pytorch/run_classifier_inews.sh create mode 100644 baselines/models_pytorch/classifier_pytorch/run_classifier_lcqmc.sh create mode 100644 baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh create mode 100644 baselines/models_pytorch/classifier_pytorch/run_classifier_xnli.sh create mode 100644 baselines/models_pytorch/classifier_pytorch/tools/common.py create mode 100644 baselines/models_pytorch/classifier_pytorch/tools/progressbar.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/__init__.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/__main__.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_auto.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py create mode 100644 
baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_utils.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlm.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/file_utils.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_albert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_auto.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_bert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_ctrl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_distilbert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_gpt2.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_openai.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_roberta.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl_utilities.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_utils.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlm.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlnet.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/optimization.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_auto.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_bert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_ctrl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_gpt2.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_openai.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_transfo_xl.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_utils.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlm.py create mode 100644 baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlnet.py create mode 100755 baselines/models_pytorch/mrc_pytorch/convert_tf_checkpoint_to_pytorch.py create mode 100644 
baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_output.py create mode 100644 baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_preprocess.py create mode 100644 baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_evaluate.py create mode 100644 baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_output.py create mode 100644 baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_preprocess.py create mode 100755 baselines/models_pytorch/mrc_pytorch/pytorch_modeling.py create mode 100644 baselines/models_pytorch/mrc_pytorch/run_mrc.py create mode 100644 baselines/models_pytorch/mrc_pytorch/run_mrc_cmrc2018.sh create mode 100644 baselines/models_pytorch/mrc_pytorch/run_mrc_drcd.sh create mode 100755 baselines/models_pytorch/mrc_pytorch/tools/file_utils.py create mode 100644 baselines/models_pytorch/mrc_pytorch/tools/langconv.py create mode 100755 baselines/models_pytorch/mrc_pytorch/tools/offical_tokenization.py create mode 100755 baselines/models_pytorch/mrc_pytorch/tools/pytorch_optimization.py create mode 100644 baselines/models_pytorch/mrc_pytorch/tools/utils.py create mode 100644 baselines/models_pytorch/mrc_pytorch/tools/zh_wiki.py
diff --git a/baselines/README.md b/baselines/README.md
new file mode 100644
index 0000000..fd75762
--- /dev/null
+++ b/baselines/README.md
@@ -0,0 +1,167 @@
+
+1. One-click run
+
+   We provide "one-click run" scripts to help you run a specific task on a specified model more quickly.
+
+   For example, to run the "BQ intelligent customer-service question matching" task on the BERT model, run the run_classifier_**bq**.sh script directly under chineseGLUE/baselines/models/**bert**/:
+
+   ```bash
+   cd chineseGLUE/baselines/models/bert/
+   sh run_classifier_bq.sh
+   ```
+
+   The script automatically downloads the "BQ intelligent customer-service question matching" dataset (saved under chineseGLUE/baselines/glue/chineseGLUEdatasets/**bq**/) and the BERT model (saved under chineseGLUE/baselines/models/bert/prev_trained_model/).
+
+   To run another task on another model, find that task's script ( run_classifier_**??**.sh ) in the corresponding model's directory and run it the same way. (A sketch of the hyper-parameters these wrappers pass along follows the XNLI table below.)
+
+2. Benchmark results
+
+   1. TNEWS text classification (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | ALBERT-xlarge | 88.30 | 88.30 | batch_size=32, length=128, epoch=3 |
+      | BERT-base | 89.80 | 89.78 | batch_size=32, length=128, epoch=3 |
+      | BERT-wwm-ext-base | 89.88 | 89.81 | batch_size=32, length=128, epoch=3 |
+      | ERNIE-base | 89.77 | 89.83 | batch_size=32, length=128, epoch=3 |
+      | RoBERTa-large | ***90.00*** | ***89.91*** | batch_size=16, length=128, epoch=3 |
+      | XLNet-mid | 86.14 | 86.26 | batch_size=32, length=128, epoch=3 |
+      | RoBERTa-wwm-ext | 89.82 | 89.79 | batch_size=32, length=128, epoch=3 |
+      | RoBERTa-wwm-large-ext | ***90.05*** | ***90.11*** | batch_size=16, length=128, epoch=3 |
+
+   2. XNLI natural language inference (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | ALBERT-xlarge | 74.0? | 74.0? | batch_size=64, length=128, epoch=2 |
+      | ALBERT-base | 77.0 | 77.1 | batch_size=64, length=128, epoch=2 |
+      | ALBERT-large | 78.0 | 77.5 | batch_size=64, length=128, epoch=2 |
+      | BERT-base | 77.80 | 77.8 | batch_size=64, length=128, epoch=2 |
+      | BERT-wwm-ext-base | 79.4 | 78.7 | batch_size=64, length=128, epoch=2 |
+      | ERNIE-base | 79.7 | 78.6 | batch_size=64, length=128, epoch=2 |
+      | RoBERTa-large | ***80.2*** | ***79.9*** | batch_size=64, length=128, epoch=2 |
+      | XLNet-mid | 79.2 | 78.7 | batch_size=64, length=128, epoch=2 |
+      | RoBERTa-wwm-ext | 79.56 | 79.28 | batch_size=64, length=128, epoch=2 |
+      | RoBERTa-wwm-large-ext | ***80.20*** | ***80.04*** | batch_size=16, length=128, epoch=2 |
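+
+      The "training parameters" column in these tables corresponds to the hyper-parameters each run_classifier_*.sh wrapper passes to its model's run_classifier.py. As a minimal, hypothetical sketch of such an invocation — the flag names are assumed from the upstream BERT run_classifier.py these baselines derive from, and BERT_DIR/DATA_DIR are placeholders for the downloaded checkpoint and dataset:
+
+      ```bash
+      # Hypothetical sketch of the TNEWS row "batch_size=32, length=128, epoch=3".
+      # BERT_DIR and DATA_DIR are placeholders; the real scripts set them for you.
+      export BERT_DIR=./prev_trained_model/bert_base
+      export DATA_DIR=../../glue/chineseGLUEdatasets/tnews
+      python run_classifier.py \
+        --task_name=tnews \
+        --do_train=true \
+        --do_eval=true \
+        --data_dir=$DATA_DIR \
+        --vocab_file=$BERT_DIR/vocab.txt \
+        --bert_config_file=$BERT_DIR/bert_config.json \
+        --init_checkpoint=$BERT_DIR/bert_model.ckpt \
+        --max_seq_length=128 \
+        --train_batch_size=32 \
+        --num_train_epochs=3.0 \
+        --output_dir=./tnews_output/
+      ```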
+
+   3. LCQMC semantic similarity matching (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | ALBERT-xlarge | 89.00 | 86.76 | batch_size=64, length=128, epoch=3 |
+      | BERT-base | 89.4 | 86.9 | batch_size=64, length=128, epoch=3 |
+      | BERT-wwm-ext-base | 89.1 | ***87.3*** | batch_size=64, length=128, epoch=3 |
+      | ERNIE-base | 89.8 | 87.2 | batch_size=64, length=128, epoch=3 |
+      | RoBERTa-large | ***89.9*** | 87.2 | batch_size=64, length=128, epoch=3 |
+      | XLNet-mid | 86.14 | 85.98 | batch_size=32, length=128, epoch=3 |
+      | RoBERTa-wwm-ext | 89.08 | 86.33 | batch_size=64, length=128, epoch=3 |
+      | RoBERTa-wwm-large-ext | 89.79 | 86.82 | batch_size=16, length=128, epoch=3 |
+
+   4. INEWS internet sentiment analysis (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | ALBERT-xlarge | 83.70 | 81.90 | batch_size=32, length=512, epoch=8 |
+      | BERT-base | 81.29 | 82.70 | batch_size=16, length=512, epoch=3 |
+      | BERT-wwm-ext-base | 81.93 | 83.46 | batch_size=16, length=512, epoch=3 |
+      | ERNIE-base | ***84.50*** | ***85.14*** | batch_size=16, length=512, epoch=3 |
+      | RoBERTa-large | 81.90 | 84.00 | batch_size=4, length=512, epoch=3 |
+      | XLNet-mid | 82.00 | 84.00 | batch_size=8, length=512, epoch=3 |
+      | RoBERTa-wwm-ext | 82.98 | 82.28 | batch_size=16, length=512, epoch=3 |
+      | RoBERTa-wwm-large-ext | 83.73 | 82.78 | batch_size=4, length=512, epoch=3 |
+
+   5. DRCD Traditional-Chinese reading comprehension (F1, EM)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | BERT-base | F1:92.30 EM:86.60 | F1:91.46 EM:85.49 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | BERT-wwm-ext-base | F1:93.27 EM:88.00 | F1:92.63 EM:87.15 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | ERNIE-base | F1:92.78 EM:86.85 | F1:92.01 EM:86.03 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | ALBERT-large | F1:93.90 EM:88.88 | F1:93.06 EM:87.52 | batch=32, length=512, epoch=3, lr=2e-5, warmup=0.05 |
+      | ALBERT-xlarge | F1:94.63 EM:89.68 | F1:94.70 EM:89.78 | batch=32, length=512, epoch=3, lr=2.5e-5, warmup=0.06 |
+      | ALBERT-tiny | F1:81.51 EM:71.61 | F1:80.67 EM:70.08 | batch=32, length=512, epoch=3, lr=2e-4, warmup=0.1 |
+      | RoBERTa-large | F1:94.93 EM:90.11 | F1:94.25 EM:89.35 | batch=32, length=256, epoch=2, lr=3e-5, warmup=0.1 |
+      | XLNet-mid | F1:92.08 EM:84.40 | F1:91.44 EM:83.28 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | RoBERTa-wwm-ext | F1:94.26 EM:89.29 | F1:93.53 EM:88.12 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | RoBERTa-wwm-large-ext | ***F1:95.32 EM:90.54*** | ***F1:95.06 EM:90.70*** | batch=32, length=512, epoch=2, lr=2.5e-5, warmup=0.1 |
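+
+      The reading-comprehension rows (DRCD above, CMRC2018 below) are not driven by run_classifier.py; per this patch's file list, the entry points are the MRC scripts instead — run_cmrc_drcd.py for the TensorFlow XLNet baseline and the run_mrc wrappers for the PyTorch baselines. For example (script name taken from the file list; see the script itself for the checkpoint and data paths it expects):
+
+      ```bash
+      # Run the DRCD machine-reading-comprehension baseline (PyTorch).
+      cd baselines/models_pytorch/mrc_pytorch/
+      sh run_mrc_drcd.sh
+      ```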
+
+   6. CMRC2018 reading comprehension (F1, EM)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | BERT-base | F1:85.48 EM:64.77 | F1:87.17 EM:69.72 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | BERT-wwm-ext-base | F1:86.68 EM:66.96 | F1:88.78 EM:73.23 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | ERNIE-base | F1:87.30 EM:66.89 | F1:89.62 EM:73.32 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | ALBERT-large | F1:87.86 EM:67.75 | F1:90.17 EM:73.66 | batch=32, length=512, epoch=3, lr=2e-5, warmup=0.05 |
+      | ALBERT-xlarge | F1:88.66 EM:68.90 | F1:90.92 EM:75.22 | batch=32, length=512, epoch=3, lr=2e-5, warmup=0.1 |
+      | ALBERT-tiny | F1:73.95 EM:48.31 | F1:75.73 EM:53.68 | batch=32, length=512, epoch=3, lr=2e-4, warmup=0.1 |
+      | RoBERTa-large | F1:88.61 EM:69.94 | F1:90.94 EM:76.11 | batch=32, length=256, epoch=2, lr=3e-5, warmup=0.1 |
+      | XLNet-mid | F1:85.63 EM:65.31 | F1:86.09 EM:66.51 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | RoBERTa-wwm-ext | F1:87.28 EM:67.89 | F1:89.74 EM:73.89 | batch=32, length=512, epoch=2, lr=3e-5, warmup=0.1 |
+      | RoBERTa-wwm-large-ext | ***F1:89.42 EM:70.59*** | ***F1:91.56 EM:76.58*** | batch=32, length=512, epoch=2, lr=2.5e-5, warmup=0.1 |
+
+   7. BQ intelligent customer-service question matching (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | BERT-base | 85.86 | 85.08 | batch_size=64, length=128, epoch=3 |
+      | BERT-wwm-ext-base | 86.05 | ***85.21*** | batch_size=64, length=128, epoch=3 |
+      | ERNIE-base | 85.92 | 84.47 | batch_size=64, length=128, epoch=3 |
+      | RoBERTa-large | 85.68 | 85.20 | batch_size=8, length=128, epoch=3 |
+      | XLNet-mid | 79.81 | 77.85 | batch_size=32, length=128, epoch=3 |
+      | ALBERT-xlarge | 85.21 | 84.21 | batch_size=16, length=128, epoch=3 |
+      | ALBERT-tiny | 82.04 | 80.76 | batch_size=64, length=128, epoch=5 |
+      | RoBERTa-wwm-ext | 85.31 | 84.02 | batch_size=64, length=128, epoch=3 |
+      | RoBERTa-wwm-large-ext | ***86.34*** | 84.90 | batch_size=16, length=128, epoch=3 |
+
+   8. MSRANER named entity recognition (F1)
+
+      | Model | Test | Training parameters |
+      | :----: | :----: | :----: |
+      | BERT-base | 95.38 | batch_size=16, length=256, epoch=5, lr=2e-5 |
+      | BERT-wwm-ext-base | 95.26 | batch_size=16, length=256, epoch=5, lr=2e-5 |
+      | ERNIE-base | 95.17 | batch_size=16, length=256, epoch=5, lr=2e-5 |
+      | RoBERTa-large | ***96.07*** | batch_size=8, length=256, epoch=5, lr=2e-5 |
+      | XLNet-mid | - | - |
+      | ALBERT-xlarge | - | - |
+      | ALBERT-tiny | - | - |
+      | RoBERTa-wwm-ext | 95.06 | batch_size=16, length=256, epoch=5, lr=2e-5 |
+      | RoBERTa-wwm-large-ext | 95.32 | batch_size=8, length=256, epoch=5, lr=2e-5 |
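+
+      The NER row likewise has its own entry point: per this patch's file list, each TensorFlow baseline (bert, bert_wwm_ext, ernie, roberta, roberta_wwm_ext, roberta_wwm_large_ext) ships a run_ner_msra.sh wrapper around run_ner.py, run the same way as the classifier scripts:
+
+      ```bash
+      # Run the MSRA NER baseline on the BERT model.
+      cd chineseGLUE/baselines/models/bert/
+      sh run_ner_msra.sh
+      ```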
+
+   9. THUCNEWS long-text classification (Accuracy)
+
+      | Model | Dev | Test | Training parameters |
+      | :----: | :----: | :----: | :----: |
+      | ALBERT-xlarge | 95.74 | 95.45 | batch_size=32, length=512, epoch=8 |
+      | ALBERT-tiny | 92.63 | 93.54 | batch_size=32, length=512, epoch=8 |
+      | BERT-base | 95.28 | 95.35 | batch_size=8, length=128, epoch=3 |
+      | BERT-wwm-ext-base | 95.38 | 95.57 | batch_size=8, length=128, epoch=3 |
+      | ERNIE-base | 94.35 | 94.90 | batch_size=16, length=256, epoch=3 |
+      | RoBERTa-large | 94.52 | 94.56 | batch_size=2, length=256, epoch=3 |
+      | XLNet-mid | - | 94.52 | batch_size=16, length=128, epoch=3 |
+      | RoBERTa-wwm-ext | 95.59 | 95.52 | batch_size=16, length=256, epoch=3 |
+      | RoBERTa-wwm-large-ext | ***96.10*** | ***95.93*** | batch_size=32, length=512, epoch=8 |
+
diff --git a/baselines/models/albert/albert_config/albert_config_base.json b/baselines/models/albert/albert_config/albert_config_base.json
new file mode 100644
index 0000000..57a411b
--- /dev/null
+++ b/baselines/models/albert/albert_config/albert_config_base.json
@@ -0,0 +1,23 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "embedding_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128,
+  "ln_type": "postln"
+
+}
diff --git a/baselines/models/albert/albert_config/albert_config_large.json b/baselines/models/albert/albert_config/albert_config_large.json
new file mode 100644
index 0000000..190511a
--- /dev/null
+++ b/baselines/models/albert/albert_config/albert_config_large.json
@@ -0,0 +1,23 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1024,
+  "embedding_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128,
+  "ln_type": "postln"
+
+}
diff --git a/baselines/models/albert/albert_config/albert_config_tiny.json b/baselines/models/albert/albert_config/albert_config_tiny.json
new file mode 100644
index 0000000..dc97f5b
--- /dev/null
+++ b/baselines/models/albert/albert_config/albert_config_tiny.json
@@ -0,0 +1,23 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 312,
+  "embedding_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 1248,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 4,
+
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128,
+  "ln_type": "postln"
+
+}
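The ALBERT configs above, together with the xlarge and xxlarge ones that follow, differ mainly in hidden_size, intermediate_size, num_attention_heads, num_hidden_layers, and ln_type ("postln" for base/large/tiny, "preln" for xlarge/xxlarge). As a quick, hypothetical way to compare those fields from a checkout, using only the standard library:

```bash
# Print the size-defining fields of the ALBERT configs added in this patch.
cd baselines/models/albert/albert_config/
for f in albert_config_*.json; do
  echo "== $f"
  python -c 'import json, sys; c = json.load(open(sys.argv[1])); print({k: c.get(k) for k in ("hidden_size", "intermediate_size", "num_attention_heads", "num_hidden_layers", "ln_type")})' "$f"
done
```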
diff --git a/baselines/models/albert/albert_config/albert_config_xlarge.json b/baselines/models/albert/albert_config/albert_config_xlarge.json
new file mode 100644
index 0000000..c313629
--- /dev/null
+++ b/baselines/models/albert/albert_config/albert_config_xlarge.json
@@ -0,0 +1,23 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 2048,
+  "embedding_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 8192,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 24,
+
+  "pooler_fc_size": 1024,
+  "pooler_num_attention_heads": 64,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128,
+  "ln_type": "preln"
+
+}
diff --git a/baselines/models/albert/albert_config/albert_config_xxlarge.json b/baselines/models/albert/albert_config/albert_config_xxlarge.json
new file mode 100644
index 0000000..f69dbe2
--- /dev/null
+++ b/baselines/models/albert/albert_config/albert_config_xxlarge.json
@@ -0,0 +1,23 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 4096,
+  "embedding_size": 128,
+  "initializer_range": 0.02,
+  "intermediate_size": 16384,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 64,
+  "num_hidden_layers": 12,
+
+  "pooler_fc_size": 1024,
+  "pooler_num_attention_heads": 64,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128,
+  "ln_type": "preln"
+
+}
diff --git a/baselines/models/albert/albert_config/bert_config.json b/baselines/models/albert/albert_config/bert_config.json
new file mode 100644
index 0000000..bb3fd49
--- /dev/null
+++ b/baselines/models/albert/albert_config/bert_config.json
@@ -0,0 +1,19 @@
+{
+  "attention_probs_dropout_prob": 0.0,
+  "directionality": "bidi",
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "max_position_embeddings": 512,
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "type_vocab_size": 2,
+  "vocab_size": 21128
+}
diff --git a/baselines/models/albert/albert_config/vocab.txt b/baselines/models/albert/albert_config/vocab.txt
new file mode 100644
index 0000000..ca4f978
--- /dev/null
+++ b/baselines/models/albert/albert_config/vocab.txt
@@ -0,0 +1,21128 @@
+[PAD]
+[unused1]
+[unused2]
+[unused3]
+[unused4]
+[unused5]
+[unused6]
+[unused7]
+[unused8]
+[unused9]
+[unused10]
+[unused11]
+[unused12]
+[unused13]
+[unused14]
+[unused15]
+[unused16]
+[unused17]
+[unused18]
+[unused19]
+[unused20]
+[unused21]
+[unused22]
+[unused23]
+[unused24]
+[unused25]
+[unused26]
+[unused27]
+[unused28]
+[unused29]
+[unused30]
+[unused31]
+[unused32]
+[unused33]
+[unused34]
+[unused35]
+[unused36]
+[unused37]
+[unused38]
+[unused39]
+[unused40]
+[unused41]
+[unused42]
+[unused43]
+[unused44]
+[unused45]
+[unused46]
+[unused47]
+[unused48]
+[unused49]
+[unused50]
+[unused51]
+[unused52]
+[unused53]
+[unused54]
+[unused55]
+[unused56]
+[unused57]
+[unused58]
+[unused59]
+[unused60]
+[unused61]
+[unused62]
+[unused63]
+[unused64]
+[unused65]
+[unused66]
+[unused67]
+[unused68]
+[unused69]
+[unused70]
+[unused71]
+[unused72]
+[unused73]
+[unused74]
+[unused75]
+[unused76]
+[unused77]
+[unused78]
+[unused79]
+[unused80]
+[unused81]
+[unused82]
+[unused83]
+[unused84]
+[unused85]
+[unused86]
+[unused87]
+[unused88]
+[unused89] +[unused90] +[unused91] +[unused92] +[unused93] +[unused94] +[unused95] +[unused96] +[unused97] +[unused98] +[unused99] +[UNK] +[CLS] +[SEP] +[MASK] +<S> +<T> +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +[ +\ +] +^ +_ +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +£ +¤ +¥ +§ +© +« +® +° +± +² +³ +µ +· +¹ +º +» +¼ +× +ß +æ +÷ +ø +đ +ŋ +ɔ +ə +ɡ +ʰ +ˇ +ˈ +ˊ +ˋ +ˍ +ː +˙ +˚ +ˢ +α +β +γ +δ +ε +η +θ +ι +κ +λ +μ +ν +ο +π +ρ +ς +σ +τ +υ +φ +χ +ψ +ω +а +б +в +г +д +е +ж +з +и +к +л +м +н +о +п +р +с +т +у +ф +х +ц +ч +ш +ы +ь +я +і +ا +ب +ة +ت +د +ر +س +ع +ل +م +ن +ه +و +ي +۩ +ก +ง +น +ม +ย +ร +อ +า +เ +๑ +་ +ღ +ᄀ +ᄁ +ᄂ +ᄃ +ᄅ +ᄆ +ᄇ +ᄈ +ᄉ +ᄋ +ᄌ +ᄎ +ᄏ +ᄐ +ᄑ +ᄒ +ᅡ +ᅢ +ᅣ +ᅥ +ᅦ +ᅧ +ᅨ +ᅩ +ᅪ +ᅬ +ᅭ +ᅮ +ᅯ +ᅲ +ᅳ +ᅴ +ᅵ +ᆨ +ᆫ +ᆯ +ᆷ +ᆸ +ᆺ +ᆻ +ᆼ +ᗜ +ᵃ +ᵉ +ᵍ +ᵏ +ᵐ +ᵒ +ᵘ +‖ +„ +† +• +‥ +‧ +
 +‰ +′ +″ +‹ +› +※ +‿ +⁄ +ⁱ +⁺ +ⁿ +₁ +₂ +₃ +₄ +€ +℃ +№ +™ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +← +↑ +→ +↓ +↔ +↗ +↘ +⇒ +∀ +− +∕ +∙ +√ +∞ +∟ +∠ +∣ +∥ +∩ +∮ +∶ +∼ +∽ +≈ +≒ +≡ +≤ +≥ +≦ +≧ +≪ +≫ +⊙ +⋅ +⋈ +⋯ +⌒ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +⑴ +⑵ +⑶ +⑷ +⑸ +⒈ +⒉ +⒊ +⒋ +ⓒ +ⓔ +ⓘ +─ +━ +│ +┃ +┅ +┆ +┊ +┌ +└ +├ +┣ +═ +║ +╚ +╞ +╠ +╭ +╮ +╯ +╰ +╱ +╳ +▂ +▃ +▅ +▇ +█ +▉ +▋ +▌ +▍ +▎ +■ +□ +▪ +▫ +▬ +▲ +△ +▶ +► +▼ +▽ +◆ +◇ +○ +◎ +● +◕ +◠ +◢ +◤ +☀ +★ +☆ +☕ +☞ +☺ +☼ +♀ +♂ +♠ +♡ +♣ +♥ +♦ +♪ +♫ +♬ +✈ +✔ +✕ +✖ +✦ +✨ +✪ +✰ +✿ +❀ +❤ +➜ +➤ +⦿ +、 +。 +〃 +々 +〇 +〈 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〓 +〔 +〕 +〖 +〗 +〜 +〝 +〞 +ぁ +あ +ぃ +い +う +ぇ +え +お +か +き +く +け +こ +さ +し +す +せ +そ +た +ち +っ +つ +て +と +な +に +ぬ +ね +の +は +ひ +ふ +へ +ほ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +を +ん +゜ +ゝ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +ソ +タ +チ +ッ +ツ +テ +ト +ナ +ニ +ヌ +ネ +ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヲ +ン +ヶ +・ +ー +ヽ +ㄅ +ㄆ +ㄇ +ㄉ +ㄋ +ㄌ +ㄍ +ㄎ +ㄏ +ㄒ +ㄚ +ㄛ +ㄞ +ㄟ +ㄢ +ㄤ +ㄥ +ㄧ +ㄨ +ㆍ +㈦ +㊣ +㎡ +㗎 +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丐 +丑 +专 +且 +丕 +世 +丘 +丙 +业 +丛 +东 +丝 +丞 +丟 +両 +丢 +两 +严 +並 +丧 +丨 +个 +丫 +中 +丰 +串 +临 +丶 +丸 +丹 +为 +主 +丼 +丽 +举 +丿 +乂 +乃 +久 +么 +义 +之 +乌 +乍 +乎 +乏 +乐 +乒 +乓 +乔 +乖 +乗 +乘 +乙 +乜 +九 +乞 +也 +习 +乡 +书 +乩 +买 +乱 +乳 +乾 +亀 +亂 +了 +予 +争 +事 +二 +于 +亏 +云 +互 +五 +井 +亘 +亙 +亚 +些 +亜 +亞 +亟 +亡 +亢 +交 +亥 +亦 +产 +亨 +亩 +享 +京 +亭 +亮 +亲 +亳 +亵 +人 +亿 +什 +仁 +仃 +仄 +仅 +仆 +仇 +今 +介 +仍 +从 +仏 +仑 +仓 +仔 +仕 +他 +仗 +付 +仙 +仝 +仞 +仟 +代 +令 +以 +仨 +仪 +们 +仮 +仰 +仲 +件 +价 +任 +份 +仿 +企 +伉 +伊 +伍 +伎 +伏 +伐 +休 +伕 +众 +优 +伙 +会 +伝 +伞 +伟 +传 +伢 +伤 +伦 +伪 +伫 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +佇 +佈 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佢 +佣 +佤 +佥 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佻 +佼 +使 +侃 +侄 +來 +侈 +例 +侍 +侏 +侑 +侖 +侗 +供 +依 +侠 +価 +侣 +侥 +侦 +侧 +侨 +侬 +侮 +侯 +侵 +侶 +侷 +便 +係 +促 +俄 +俊 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +保 +俞 +俟 +俠 +信 +俨 +俩 +俪 +俬 +俭 +修 +俯 +俱 +俳 +俸 +俺 +俾 +倆 +倉 +個 +倌 +倍 +倏 +們 +倒 +倔 +倖 +倘 +候 +倚 +倜 +借 +倡 +値 +倦 +倩 +倪 +倫 +倬 +倭 +倶 +债 +值 +倾 +偃 +假 +偈 +偉 +偌 +偎 +偏 +偕 +做 +停 +健 +側 +偵 +偶 +偷 +偻 +偽 +偿 +傀 +傅 +傍 +傑 +傘 +備 +傚 +傢 +傣 +傥 +储 +傩 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +働 +像 +僑 +僕 +僖 +僚 +僥 +僧 +僭 +僮 +僱 +僵 +價 +僻 +儀 +儂 +億 +儆 +儉 +儋 +儒 +儕 +儘 +償 +儡 +優 +儲 +儷 +儼 +儿 +兀 +允 +元 +兄 +充 +兆 +兇 +先 +光 +克 +兌 +免 +児 +兑 +兒 +兔 +兖 +党 +兜 +兢 +入 +內 +全 +兩 +八 +公 +六 +兮 +兰 +共 +兲 +关 +兴 +兵 +其 +具 +典 +兹 +养 +兼 +兽 +冀 +内 +円 +冇 +冈 +冉 +冊 +册 +再 +冏 +冒 +冕 +冗 +写 +军 +农 +冠 +冢 +冤 +冥 +冨 +冪 +冬 +冯 +冰 +冲 +决 +况 +冶 +冷 +冻 +冼 +冽 +冾 +净 +凄 +准 +凇 +凈 +凉 +凋 +凌 +凍 +减 +凑 +凛 +凜 +凝 +几 +凡 +凤 +処 +凪 +凭 +凯 +凰 +凱 +凳 +凶 +凸 +凹 +出 +击 +函 +凿 +刀 +刁 +刃 +分 +切 +刈 +刊 +刍 +刎 +刑 +划 +列 +刘 +则 +刚 +创 +初 +删 +判 +別 +刨 +利 +刪 +别 +刮 +到 +制 +刷 +券 +刹 +刺 +刻 +刽 +剁 +剂 +剃 +則 +剉 +削 +剋 +剌 +前 +剎 +剐 +剑 +剔 +剖 +剛 +剜 +剝 +剣 +剤 +剥 +剧 +剩 +剪 +副 +割 +創 +剷 +剽 +剿 +劃 +劇 +劈 +劉 +劊 +劍 +劏 +劑 +力 +劝 +办 +功 +加 +务 +劣 +动 +助 +努 +劫 +劭 +励 +劲 +劳 +労 +劵 +効 +劾 +势 +勁 +勃 +勇 +勉 +勋 +勐 +勒 +動 +勖 +勘 +務 +勛 +勝 +勞 +募 +勢 +勤 +勧 +勳 +勵 +勸 +勺 +勻 +勾 +勿 +匀 +包 +匆 +匈 +匍 +匐 +匕 +化 +北 +匙 +匝 +匠 +匡 +匣 +匪 +匮 +匯 +匱 +匹 +区 +医 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卍 +华 +协 +卑 +卒 +卓 +協 +单 +卖 +南 +単 +博 +卜 +卞 +卟 +占 +卡 +卢 +卤 +卦 +卧 +卫 +卮 +卯 +印 +危 +即 +却 +卵 +卷 +卸 +卻 +卿 +厂 +厄 +厅 +历 +厉 +压 +厌 +厕 +厘 +厚 +厝 +原 +厢 +厥 +厦 +厨 +厩 +厭 +厮 +厲 +厳 +去 +县 +叁 +参 +參 +又 +叉 +及 +友 +双 +反 +収 +发 +叔 +取 +受 +变 +叙 +叛 +叟 +叠 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +叶 +号 +司 +叹 +叻 +叼 +叽 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吓 +吕 +吖 +吗 +君 +吝 +吞 +吟 +吠 +吡 +否 +吧 +吨 +吩 +含 +听 +吭 +吮 +启 +吱 +吳 +吴 +吵 +吶 +吸 +吹 +吻 +吼 +吽 +吾 +呀 +呂 +呃 +呆 +呈 +告 +呋 +呎 +呐 +呓 +呕 +呗 +员 +呛 +呜 +呢 +呤 +呦 +周 +呱 +呲 +味 +呵 +呷 +呸 +呻 +呼 +命 +咀 +咁 +咂 +咄 +咆 +咋 +和 +咎 +咏 +咐 +咒 +咔 +咕 +咖 +咗 +咘 +咙 +咚 +咛 +咣 +咤 +咦 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咽 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哋 +哌 +响 +哎 +哏 +哐 +哑 +哒 +哔 +哗 +哟 +員 +哥 +哦 +哧 +哨 +哩 +哪 +哭 +哮 
+哲 +哺 +哼 +哽 +唁 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唠 +唤 +唧 +唬 +售 +唯 +唰 +唱 +唳 +唷 +唸 +唾 +啃 +啄 +商 +啉 +啊 +問 +啓 +啕 +啖 +啜 +啞 +啟 +啡 +啤 +啥 +啦 +啧 +啪 +啫 +啬 +啮 +啰 +啱 +啲 +啵 +啶 +啷 +啸 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喉 +喊 +喋 +喎 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喟 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +営 +喷 +喹 +喺 +喻 +喽 +嗅 +嗆 +嗇 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗟 +嗡 +嗣 +嗤 +嗦 +嗨 +嗪 +嗬 +嗯 +嗰 +嗲 +嗳 +嗶 +嗷 +嗽 +嘀 +嘅 +嘆 +嘈 +嘉 +嘌 +嘍 +嘎 +嘔 +嘖 +嘗 +嘘 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘤 +嘧 +嘩 +嘭 +嘮 +嘯 +嘰 +嘱 +嘲 +嘴 +嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噎 +噓 +噔 +噗 +噙 +噜 +噠 +噢 +噤 +器 +噩 +噪 +噬 +噱 +噴 +噶 +噸 +噹 +噻 +噼 +嚀 +嚇 +嚎 +嚏 +嚐 +嚓 +嚕 +嚟 +嚣 +嚥 +嚨 +嚮 +嚴 +嚷 +嚼 +囂 +囉 +囊 +囍 +囑 +囔 +囗 +囚 +四 +囝 +回 +囟 +因 +囡 +团 +団 +囤 +囧 +囪 +囫 +园 +困 +囱 +囲 +図 +围 +囹 +固 +国 +图 +囿 +圃 +圄 +圆 +圈 +國 +圍 +圏 +園 +圓 +圖 +團 +圜 +土 +圣 +圧 +在 +圩 +圭 +地 +圳 +场 +圻 +圾 +址 +坂 +均 +坊 +坍 +坎 +坏 +坐 +坑 +块 +坚 +坛 +坝 +坞 +坟 +坠 +坡 +坤 +坦 +坨 +坪 +坯 +坳 +坵 +坷 +垂 +垃 +垄 +型 +垒 +垚 +垛 +垠 +垢 +垣 +垦 +垩 +垫 +垭 +垮 +垵 +埂 +埃 +埋 +城 +埔 +埕 +埗 +域 +埠 +埤 +埵 +執 +埸 +培 +基 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堑 +堕 +堙 +堡 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +堿 +塊 +塌 +塑 +塔 +塗 +塘 +塚 +塞 +塢 +塩 +填 +塬 +塭 +塵 +塾 +墀 +境 +墅 +墉 +墊 +墒 +墓 +増 +墘 +墙 +墜 +增 +墟 +墨 +墩 +墮 +墳 +墻 +墾 +壁 +壅 +壆 +壇 +壊 +壑 +壓 +壕 +壘 +壞 +壟 +壢 +壤 +壩 +士 +壬 +壮 +壯 +声 +売 +壳 +壶 +壹 +壺 +壽 +处 +备 +変 +复 +夏 +夔 +夕 +外 +夙 +多 +夜 +够 +夠 +夢 +夥 +大 +天 +太 +夫 +夭 +央 +夯 +失 +头 +夷 +夸 +夹 +夺 +夾 +奂 +奄 +奇 +奈 +奉 +奋 +奎 +奏 +奐 +契 +奔 +奕 +奖 +套 +奘 +奚 +奠 +奢 +奥 +奧 +奪 +奬 +奮 +女 +奴 +奶 +奸 +她 +好 +如 +妃 +妄 +妆 +妇 +妈 +妊 +妍 +妒 +妓 +妖 +妘 +妙 +妝 +妞 +妣 +妤 +妥 +妨 +妩 +妪 +妮 +妲 +妳 +妹 +妻 +妾 +姆 +姉 +姊 +始 +姍 +姐 +姑 +姒 +姓 +委 +姗 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姹 +姻 +姿 +威 +娃 +娄 +娅 +娆 +娇 +娉 +娑 +娓 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娱 +娲 +娴 +娶 +娼 +婀 +婁 +婆 +婉 +婊 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婴 +婵 +婶 +婷 +婺 +婿 +媒 +媚 +媛 +媞 +媧 +媲 +媳 +媽 +媾 +嫁 +嫂 +嫉 +嫌 +嫑 +嫔 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫲 +嫵 +嫻 +嬅 +嬉 +嬌 +嬗 +嬛 +嬢 +嬤 +嬪 +嬰 +嬴 +嬷 +嬸 +嬿 +孀 +孃 +子 +孑 +孔 +孕 +孖 +字 +存 +孙 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +学 +孩 +孪 +孫 +孬 +孰 +孱 +孳 +孵 +學 +孺 +孽 +孿 +宁 +它 +宅 +宇 +守 +安 +宋 +完 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +实 +実 +宠 +审 +客 +宣 +室 +宥 +宦 +宪 +宫 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宽 +宾 +宿 +寂 +寄 +寅 +密 +寇 +富 +寐 +寒 +寓 +寛 +寝 +寞 +察 +寡 +寢 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寰 +寵 +寶 +寸 +对 +寺 +寻 +导 +対 +寿 +封 +専 +射 +将 +將 +專 +尉 +尊 +尋 +對 +導 +小 +少 +尔 +尕 +尖 +尘 +尚 +尝 +尤 +尧 +尬 +就 +尴 +尷 +尸 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +屁 +层 +屄 +居 +屆 +屈 +屉 +届 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +展 +屜 +属 +屠 +屡 +屢 +層 +履 +屬 +屯 +山 +屹 +屿 +岀 +岁 +岂 +岌 +岐 +岑 +岔 +岖 +岗 +岘 +岙 +岚 +岛 +岡 +岩 +岫 +岬 +岭 +岱 +岳 +岷 +岸 +峇 +峋 +峒 +峙 +峡 +峤 +峥 +峦 +峨 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峽 +崁 +崂 +崆 +崇 +崎 +崑 +崔 +崖 +崗 +崙 +崛 +崧 +崩 +崭 +崴 +崽 +嵇 +嵊 +嵋 +嵌 +嵐 +嵘 +嵩 +嵬 +嵯 +嶂 +嶄 +嶇 +嶋 +嶙 +嶺 +嶼 +嶽 +巅 +巍 +巒 +巔 +巖 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巩 +巫 +差 +己 +已 +巳 +巴 +巷 +巻 +巽 +巾 +巿 +币 +市 +布 +帅 +帆 +师 +希 +帐 +帑 +帕 +帖 +帘 +帚 +帛 +帜 +帝 +帥 +带 +帧 +師 +席 +帮 +帯 +帰 +帳 +帶 +帷 +常 +帼 +帽 +幀 +幂 +幄 +幅 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +幫 +干 +平 +年 +并 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +广 +庁 +広 +庄 +庆 +庇 +床 +序 +庐 +库 +应 +底 +庖 +店 +庙 +庚 +府 +庞 +废 +庠 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +庹 +庾 +廁 +廂 +廃 +廈 +廉 +廊 +廓 +廖 +廚 +廝 +廟 +廠 +廢 +廣 +廬 +廳 +延 +廷 +建 +廿 +开 +弁 +异 +弃 +弄 +弈 +弊 +弋 +式 +弑 +弒 +弓 +弔 +引 +弗 +弘 +弛 +弟 +张 +弥 +弦 +弧 +弩 +弭 +弯 +弱 +張 +強 +弹 +强 +弼 +弾 +彅 +彆 +彈 +彌 +彎 +归 +当 +录 +彗 +彙 +彝 +形 +彤 +彥 +彦 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彷 +役 +彻 +彼 +彿 +往 +征 +径 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +従 +徕 +得 +徘 +徙 +徜 +從 +徠 +御 +徨 +復 +循 +徬 +微 +徳 +徴 +徵 +德 +徹 +徼 +徽 +心 +必 +忆 +忌 +忍 +忏 +忐 +忑 +忒 +忖 +志 +忘 +忙 +応 +忠 +忡 +忤 +忧 +忪 +快 +忱 +念 +忻 +忽 +忿 +怀 +态 +怂 +怅 +怆 +怎 +怏 +怒 +怔 +怕 +怖 +怙 +怜 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +总 +怼 +恁 +恃 +恆 +恋 +恍 +恐 +恒 +恕 +恙 +恚 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恫 +恬 +恭 +息 +恰 +恳 +恵 +恶 +恸 +恺 +恻 +恼 +恿 +悄 +悅 +悉 +悌 +悍 +悔 +悖 +悚 +悟 +悠 +患 +悦 +您 +悩 +悪 +悬 +悯 +悱 +悲 +悴 +悵 +悶 +悸 +悻 +悼 +悽 +情 +惆 +惇 +惊 +惋 +惑 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惦 +惧 +惨 +惩 +惫 +惬 +惭 +惮 +惯 +惰 +惱 +想 +惴 +惶 +惹 +惺 +愁 +愆 +愈 +愉 +愍 +意 +愕 +愚 +愛 +愜 +感 +愣 +愤 +愧 +愫 +愷 +愿 +慄 +慈 +態 +慌 +慎 +慑 +慕 +慘 +慚 
+慟 +慢 +慣 +慧 +慨 +慫 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憎 +憐 +憑 +憔 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憾 +懂 +懇 +懈 +應 +懊 +懋 +懑 +懒 +懦 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懾 +懿 +戀 +戈 +戊 +戌 +戍 +戎 +戏 +成 +我 +戒 +戕 +或 +战 +戚 +戛 +戟 +戡 +戦 +截 +戬 +戮 +戰 +戲 +戳 +戴 +戶 +户 +戸 +戻 +戾 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +扎 +扑 +扒 +打 +扔 +払 +托 +扛 +扣 +扦 +执 +扩 +扪 +扫 +扬 +扭 +扮 +扯 +扰 +扱 +扳 +扶 +批 +扼 +找 +承 +技 +抄 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抚 +抛 +抜 +択 +抟 +抠 +抡 +抢 +护 +报 +抨 +披 +抬 +抱 +抵 +抹 +押 +抽 +抿 +拂 +拄 +担 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拟 +拡 +拢 +拣 +拥 +拦 +拧 +拨 +择 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拼 +拽 +拾 +拿 +持 +挂 +指 +挈 +按 +挎 +挑 +挖 +挙 +挚 +挛 +挝 +挞 +挟 +挠 +挡 +挣 +挤 +挥 +挨 +挪 +挫 +振 +挲 +挹 +挺 +挽 +挾 +捂 +捅 +捆 +捉 +捋 +捌 +捍 +捎 +捏 +捐 +捕 +捞 +损 +捡 +换 +捣 +捧 +捨 +捩 +据 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掲 +掳 +掴 +掷 +掸 +掺 +揀 +揃 +揄 +揆 +揉 +揍 +描 +提 +插 +揖 +揚 +換 +握 +揣 +揩 +揪 +揭 +揮 +援 +揶 +揸 +揹 +揽 +搀 +搁 +搂 +搅 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搡 +搪 +搬 +搭 +搵 +搶 +携 +搽 +摀 +摁 +摄 +摆 +摇 +摈 +摊 +摒 +摔 +摘 +摞 +摟 +摧 +摩 +摯 +摳 +摸 +摹 +摺 +摻 +撂 +撃 +撅 +撇 +撈 +撐 +撑 +撒 +撓 +撕 +撚 +撞 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撵 +撷 +撸 +撻 +撼 +撿 +擀 +擁 +擂 +擄 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擞 +擠 +擡 +擢 +擦 +擬 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攒 +攔 +攘 +攙 +攜 +攝 +攞 +攢 +攣 +攤 +攥 +攪 +攫 +攬 +支 +收 +攸 +改 +攻 +放 +政 +故 +效 +敌 +敍 +敎 +敏 +救 +敕 +敖 +敗 +敘 +教 +敛 +敝 +敞 +敢 +散 +敦 +敬 +数 +敲 +整 +敵 +敷 +數 +斂 +斃 +文 +斋 +斌 +斎 +斐 +斑 +斓 +斗 +料 +斛 +斜 +斟 +斡 +斤 +斥 +斧 +斩 +斫 +斬 +断 +斯 +新 +斷 +方 +於 +施 +旁 +旃 +旅 +旋 +旌 +旎 +族 +旖 +旗 +无 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旮 +旱 +时 +旷 +旺 +旻 +昀 +昂 +昆 +昇 +昉 +昊 +昌 +明 +昏 +易 +昔 +昕 +昙 +星 +映 +春 +昧 +昨 +昭 +是 +昱 +昴 +昵 +昶 +昼 +显 +晁 +時 +晃 +晉 +晋 +晌 +晏 +晒 +晓 +晔 +晕 +晖 +晗 +晚 +晝 +晞 +晟 +晤 +晦 +晨 +晩 +普 +景 +晰 +晴 +晶 +晷 +智 +晾 +暂 +暄 +暇 +暈 +暉 +暌 +暐 +暑 +暖 +暗 +暝 +暢 +暧 +暨 +暫 +暮 +暱 +暴 +暸 +暹 +曄 +曆 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曦 +曬 +曰 +曲 +曳 +更 +書 +曹 +曼 +曾 +替 +最 +會 +月 +有 +朋 +服 +朐 +朔 +朕 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朮 +术 +朱 +朴 +朵 +机 +朽 +杀 +杂 +权 +杆 +杈 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +杠 +条 +来 +杨 +杭 +杯 +杰 +東 +杳 +杵 +杷 +杼 +松 +板 +极 +构 +枇 +枉 +枋 +析 +枕 +林 +枚 +果 +枝 +枢 +枣 +枪 +枫 +枭 +枯 +枰 +枱 +枳 +架 +枷 +枸 +柄 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柞 +柠 +柢 +查 +柩 +柬 +柯 +柱 +柳 +柴 +柵 +査 +柿 +栀 +栃 +栄 +栅 +标 +栈 +栉 +栋 +栎 +栏 +树 +栓 +栖 +栗 +校 +栩 +株 +样 +核 +根 +格 +栽 +栾 +桀 +桁 +桂 +桃 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桜 +桠 +桡 +桢 +档 +桥 +桦 +桧 +桨 +桩 +桶 +桿 +梁 +梅 +梆 +梏 +梓 +梗 +條 +梟 +梢 +梦 +梧 +梨 +梭 +梯 +械 +梳 +梵 +梶 +检 +棂 +棄 +棉 +棋 +棍 +棒 +棕 +棗 +棘 +棚 +棟 +棠 +棣 +棧 +森 +棱 +棲 +棵 +棹 +棺 +椁 +椅 +椋 +植 +椎 +椒 +検 +椪 +椭 +椰 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楚 +楝 +楞 +楠 +楣 +楨 +楫 +業 +楮 +極 +楷 +楸 +楹 +楼 +楽 +概 +榄 +榆 +榈 +榉 +榔 +榕 +榖 +榛 +榜 +榨 +榫 +榭 +榮 +榱 +榴 +榷 +榻 +槁 +槃 +構 +槌 +槍 +槎 +槐 +槓 +様 +槛 +槟 +槤 +槭 +槲 +槳 +槻 +槽 +槿 +樁 +樂 +樊 +樑 +樓 +標 +樞 +樟 +模 +樣 +権 +横 +樫 +樯 +樱 +樵 +樸 +樹 +樺 +樽 +樾 +橄 +橇 +橋 +橐 +橘 +橙 +機 +橡 +橢 +橫 +橱 +橹 +橼 +檀 +檄 +檎 +檐 +檔 +檗 +檜 +檢 +檬 +檯 +檳 +檸 +檻 +櫃 +櫚 +櫛 +櫥 +櫸 +櫻 +欄 +權 +欒 +欖 +欠 +次 +欢 +欣 +欧 +欲 +欸 +欺 +欽 +款 +歆 +歇 +歉 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歯 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歼 +殁 +殃 +殆 +殇 +殉 +殊 +残 +殒 +殓 +殖 +殘 +殞 +殡 +殤 +殭 +殯 +殲 +殴 +段 +殷 +殺 +殼 +殿 +毀 +毁 +毂 +毅 +毆 +毋 +母 +毎 +每 +毒 +毓 +比 +毕 +毗 +毘 +毙 +毛 +毡 +毫 +毯 +毽 +氈 +氏 +氐 +民 +氓 +气 +氖 +気 +氙 +氛 +氟 +氡 +氢 +氣 +氤 +氦 +氧 +氨 +氪 +氫 +氮 +氯 +氰 +氲 +水 +氷 +永 +氹 +氾 +汀 +汁 +求 +汆 +汇 +汉 +汎 +汐 +汕 +汗 +汙 +汛 +汝 +汞 +江 +池 +污 +汤 +汨 +汩 +汪 +汰 +汲 +汴 +汶 +汹 +決 +汽 +汾 +沁 +沂 +沃 +沅 +沈 +沉 +沌 +沏 +沐 +沒 +沓 +沖 +沙 +沛 +沟 +没 +沢 +沣 +沥 +沦 +沧 +沪 +沫 +沭 +沮 +沱 +河 +沸 +油 +治 +沼 +沽 +沾 +沿 +況 +泄 +泉 +泊 +泌 +泓 +法 +泗 +泛 +泞 +泠 +泡 +波 +泣 +泥 +注 +泪 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +泷 +泸 +泻 +泼 +泽 +泾 +洁 +洄 +洋 +洒 +洗 +洙 +洛 +洞 +津 +洩 +洪 +洮 +洱 +洲 +洵 +洶 +洸 +洹 +活 +洼 +洽 +派 +流 +浃 +浄 +浅 +浆 +浇 +浊 +测 +济 +浏 +浑 +浒 +浓 +浔 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浬 +浮 +浯 +浴 +海 +浸 +涂 +涅 +涇 +消 +涉 +涌 +涎 +涓 +涔 +涕 +涙 +涛 +涝 +涞 +涟 +涠 +涡 +涣 +涤 +润 +涧 +涨 +涩 +涪 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淀 +淄 +淅 
+淆 +淇 +淋 +淌 +淑 +淒 +淖 +淘 +淙 +淚 +淞 +淡 +淤 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +深 +淳 +淵 +混 +淹 +淺 +添 +淼 +清 +済 +渉 +渊 +渋 +渍 +渎 +渐 +渔 +渗 +渙 +渚 +減 +渝 +渠 +渡 +渣 +渤 +渥 +渦 +温 +測 +渭 +港 +渲 +渴 +游 +渺 +渾 +湃 +湄 +湊 +湍 +湖 +湘 +湛 +湟 +湧 +湫 +湮 +湯 +湳 +湾 +湿 +満 +溃 +溅 +溉 +溏 +源 +準 +溜 +溝 +溟 +溢 +溥 +溧 +溪 +溫 +溯 +溱 +溴 +溶 +溺 +溼 +滁 +滂 +滄 +滅 +滇 +滋 +滌 +滑 +滓 +滔 +滕 +滙 +滚 +滝 +滞 +滟 +满 +滢 +滤 +滥 +滦 +滨 +滩 +滬 +滯 +滲 +滴 +滷 +滸 +滾 +滿 +漁 +漂 +漆 +漉 +漏 +漓 +演 +漕 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漸 +漾 +漿 +潆 +潇 +潋 +潍 +潑 +潔 +潘 +潛 +潜 +潞 +潟 +潢 +潤 +潦 +潧 +潭 +潮 +潰 +潴 +潸 +潺 +潼 +澀 +澄 +澆 +澈 +澍 +澎 +澗 +澜 +澡 +澤 +澧 +澱 +澳 +澹 +激 +濁 +濂 +濃 +濑 +濒 +濕 +濘 +濛 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濱 +濺 +濾 +瀅 +瀆 +瀉 +瀋 +瀏 +瀑 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀧 +瀨 +瀬 +瀰 +瀾 +灌 +灏 +灑 +灘 +灝 +灞 +灣 +火 +灬 +灭 +灯 +灰 +灵 +灶 +灸 +灼 +災 +灾 +灿 +炀 +炁 +炅 +炉 +炊 +炎 +炒 +炔 +炕 +炖 +炙 +炜 +炫 +炬 +炭 +炮 +炯 +炳 +炷 +炸 +点 +為 +炼 +炽 +烁 +烂 +烃 +烈 +烊 +烏 +烘 +烙 +烛 +烟 +烤 +烦 +烧 +烨 +烩 +烫 +烬 +热 +烯 +烷 +烹 +烽 +焉 +焊 +焕 +焖 +焗 +焘 +焙 +焚 +焜 +無 +焦 +焯 +焰 +焱 +然 +焼 +煅 +煉 +煊 +煌 +煎 +煒 +煖 +煙 +煜 +煞 +煤 +煥 +煦 +照 +煨 +煩 +煮 +煲 +煸 +煽 +熄 +熊 +熏 +熒 +熔 +熙 +熟 +熠 +熨 +熬 +熱 +熵 +熹 +熾 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燒 +燔 +燕 +燙 +燜 +營 +燥 +燦 +燧 +燭 +燮 +燴 +燻 +燼 +燿 +爆 +爍 +爐 +爛 +爪 +爬 +爭 +爰 +爱 +爲 +爵 +父 +爷 +爸 +爹 +爺 +爻 +爽 +爾 +牆 +片 +版 +牌 +牍 +牒 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牦 +牧 +物 +牯 +牲 +牴 +牵 +特 +牺 +牽 +犀 +犁 +犄 +犊 +犍 +犒 +犢 +犧 +犬 +犯 +状 +犷 +犸 +犹 +狀 +狂 +狄 +狈 +狎 +狐 +狒 +狗 +狙 +狞 +狠 +狡 +狩 +独 +狭 +狮 +狰 +狱 +狸 +狹 +狼 +狽 +猎 +猕 +猖 +猗 +猙 +猛 +猜 +猝 +猥 +猩 +猪 +猫 +猬 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獎 +獐 +獒 +獗 +獠 +獣 +獨 +獭 +獰 +獲 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玄 +率 +玉 +王 +玑 +玖 +玛 +玟 +玠 +玥 +玩 +玫 +玮 +环 +现 +玲 +玳 +玷 +玺 +玻 +珀 +珂 +珅 +珈 +珉 +珊 +珍 +珏 +珐 +珑 +珙 +珞 +珠 +珣 +珥 +珩 +珪 +班 +珮 +珲 +珺 +現 +球 +琅 +理 +琇 +琉 +琊 +琍 +琏 +琐 +琛 +琢 +琥 +琦 +琨 +琪 +琬 +琮 +琰 +琲 +琳 +琴 +琵 +琶 +琺 +琼 +瑀 +瑁 +瑄 +瑋 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑞 +瑟 +瑠 +瑣 +瑤 +瑩 +瑪 +瑯 +瑰 +瑶 +瑾 +璀 +璁 +璃 +璇 +璉 +璋 +璎 +璐 +璜 +璞 +璟 +璧 +璨 +環 +璽 +璿 +瓊 +瓏 +瓒 +瓜 +瓢 +瓣 +瓤 +瓦 +瓮 +瓯 +瓴 +瓶 +瓷 +甄 +甌 +甕 +甘 +甙 +甚 +甜 +生 +產 +産 +甥 +甦 +用 +甩 +甫 +甬 +甭 +甯 +田 +由 +甲 +申 +电 +男 +甸 +町 +画 +甾 +畀 +畅 +界 +畏 +畑 +畔 +留 +畜 +畝 +畢 +略 +畦 +番 +畫 +異 +畲 +畳 +畴 +當 +畸 +畹 +畿 +疆 +疇 +疊 +疏 +疑 +疔 +疖 +疗 +疙 +疚 +疝 +疟 +疡 +疣 +疤 +疥 +疫 +疮 +疯 +疱 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痈 +痉 +痊 +痍 +痒 +痔 +痕 +痘 +痙 +痛 +痞 +痠 +痢 +痣 +痤 +痧 +痨 +痪 +痫 +痰 +痱 +痴 +痹 +痺 +痼 +痿 +瘀 +瘁 +瘋 +瘍 +瘓 +瘘 +瘙 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘪 +瘫 +瘴 +瘸 +瘾 +療 +癇 +癌 +癒 +癖 +癜 +癞 +癡 +癢 +癣 +癥 +癫 +癬 +癮 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皑 +皓 +皖 +皙 +皚 +皮 +皰 +皱 +皴 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盎 +盏 +盐 +监 +盒 +盔 +盖 +盗 +盘 +盛 +盜 +盞 +盟 +盡 +監 +盤 +盥 +盧 +盪 +目 +盯 +盱 +盲 +直 +相 +盹 +盼 +盾 +省 +眈 +眉 +看 +県 +眙 +眞 +真 +眠 +眦 +眨 +眩 +眯 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睁 +睇 +睏 +睐 +睑 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睬 +睹 +睽 +睾 +睿 +瞄 +瞅 +瞇 +瞋 +瞌 +瞎 +瞑 +瞒 +瞓 +瞞 +瞟 +瞠 +瞥 +瞧 +瞩 +瞪 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矢 +矣 +知 +矩 +矫 +短 +矮 +矯 +石 +矶 +矽 +矾 +矿 +码 +砂 +砌 +砍 +砒 +研 +砖 +砗 +砚 +砝 +砣 +砥 +砧 +砭 +砰 +砲 +破 +砷 +砸 +砺 +砼 +砾 +础 +硅 +硐 +硒 +硕 +硝 +硫 +硬 +确 +硯 +硼 +碁 +碇 +碉 +碌 +碍 +碎 +碑 +碓 +碗 +碘 +碚 +碛 +碟 +碣 +碧 +碩 +碰 +碱 +碳 +碴 +確 +碼 +碾 +磁 +磅 +磊 +磋 +磐 +磕 +磚 +磡 +磨 +磬 +磯 +磲 +磷 +磺 +礁 +礎 +礙 +礡 +礦 +礪 +礫 +礴 +示 +礼 +社 +祀 +祁 +祂 +祇 +祈 +祉 +祎 +祐 +祕 +祖 +祗 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祯 +祷 +祸 +祺 +祿 +禀 +禁 +禄 +禅 +禍 +禎 +福 +禛 +禦 +禧 +禪 +禮 +禱 +禹 +禺 +离 +禽 +禾 +禿 +秀 +私 +秃 +秆 +秉 +秋 +种 +科 +秒 +秘 +租 +秣 +秤 +秦 +秧 +秩 +秭 +积 +称 +秸 +移 +秽 +稀 +稅 +程 +稍 +税 +稔 +稗 +稚 +稜 +稞 +稟 +稠 +稣 +種 +稱 +稲 +稳 +稷 +稹 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +穌 +積 +穎 +穗 +穢 +穩 +穫 +穴 +究 +穷 +穹 +空 +穿 +突 +窃 +窄 +窈 +窍 +窑 +窒 +窓 +窕 +窖 +窗 +窘 +窜 +窝 +窟 +窠 +窥 +窦 +窨 +窩 +窪 +窮 +窯 +窺 +窿 +竄 +竅 +竇 +竊 +立 +竖 +站 +竜 +竞 +竟 +章 +竣 +童 +竭 +端 +競 +竹 +竺 +竽 +竿 +笃 +笆 +笈 +笋 +笏 +笑 +笔 +笙 +笛 +笞 +笠 +符 +笨 +第 +笹 +笺 +笼 +筆 +等 +筊 +筋 +筍 +筏 +筐 +筑 +筒 +答 +策 +筛 +筝 +筠 +筱 +筲 +筵 +筷 +筹 +签 +简 +箇 +箋 +箍 +箏 +箐 +箔 +箕 +算 +箝 +管 +箩 +箫 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篑 +篓 +篙 +篝 +篠 +篡 +篤 +篩 +篪 +篮 +篱 +篷 +簇 +簌 +簍 +簡 +簦 +簧 
+簪 +簫 +簷 +簸 +簽 +簾 +簿 +籁 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +籮 +籲 +米 +类 +籼 +籽 +粄 +粉 +粑 +粒 +粕 +粗 +粘 +粟 +粤 +粥 +粧 +粪 +粮 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糅 +糊 +糍 +糕 +糖 +糗 +糙 +糜 +糞 +糟 +糠 +糧 +糬 +糯 +糰 +糸 +系 +糾 +紀 +紂 +約 +紅 +紉 +紊 +紋 +納 +紐 +紓 +純 +紗 +紘 +紙 +級 +紛 +紜 +素 +紡 +索 +紧 +紫 +紮 +累 +細 +紳 +紹 +紺 +終 +絃 +組 +絆 +経 +結 +絕 +絞 +絡 +絢 +給 +絨 +絮 +統 +絲 +絳 +絵 +絶 +絹 +綁 +綏 +綑 +經 +継 +続 +綜 +綠 +綢 +綦 +綫 +綬 +維 +綱 +網 +綴 +綵 +綸 +綺 +綻 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +緘 +線 +緝 +緞 +締 +緣 +編 +緩 +緬 +緯 +練 +緹 +緻 +縁 +縄 +縈 +縛 +縝 +縣 +縫 +縮 +縱 +縴 +縷 +總 +績 +繁 +繃 +繆 +繇 +繋 +織 +繕 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繳 +繹 +繼 +繽 +纂 +續 +纍 +纏 +纓 +纔 +纖 +纜 +纠 +红 +纣 +纤 +约 +级 +纨 +纪 +纫 +纬 +纭 +纯 +纰 +纱 +纲 +纳 +纵 +纶 +纷 +纸 +纹 +纺 +纽 +纾 +线 +绀 +练 +组 +绅 +细 +织 +终 +绊 +绍 +绎 +经 +绑 +绒 +结 +绔 +绕 +绘 +给 +绚 +绛 +络 +绝 +绞 +统 +绡 +绢 +绣 +绥 +绦 +继 +绩 +绪 +绫 +续 +绮 +绯 +绰 +绳 +维 +绵 +绶 +绷 +绸 +绻 +综 +绽 +绾 +绿 +缀 +缄 +缅 +缆 +缇 +缈 +缉 +缎 +缓 +缔 +缕 +编 +缘 +缙 +缚 +缜 +缝 +缠 +缢 +缤 +缥 +缨 +缩 +缪 +缭 +缮 +缰 +缱 +缴 +缸 +缺 +缽 +罂 +罄 +罌 +罐 +网 +罔 +罕 +罗 +罚 +罡 +罢 +罩 +罪 +置 +罰 +署 +罵 +罷 +罹 +羁 +羅 +羈 +羊 +羌 +美 +羔 +羚 +羞 +羟 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羲 +羸 +羹 +羽 +羿 +翁 +翅 +翊 +翌 +翎 +習 +翔 +翘 +翟 +翠 +翡 +翦 +翩 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +耋 +而 +耍 +耐 +耒 +耕 +耗 +耘 +耙 +耦 +耨 +耳 +耶 +耷 +耸 +耻 +耽 +耿 +聂 +聆 +聊 +聋 +职 +聒 +联 +聖 +聘 +聚 +聞 +聪 +聯 +聰 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肃 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肓 +肖 +肘 +肚 +肛 +肝 +肠 +股 +肢 +肤 +肥 +肩 +肪 +肮 +肯 +肱 +育 +肴 +肺 +肽 +肾 +肿 +胀 +胁 +胃 +胄 +胆 +背 +胍 +胎 +胖 +胚 +胛 +胜 +胝 +胞 +胡 +胤 +胥 +胧 +胫 +胭 +胯 +胰 +胱 +胳 +胴 +胶 +胸 +胺 +能 +脂 +脅 +脆 +脇 +脈 +脉 +脊 +脍 +脏 +脐 +脑 +脓 +脖 +脘 +脚 +脛 +脣 +脩 +脫 +脯 +脱 +脲 +脳 +脸 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腩 +腫 +腭 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腻 +腼 +腾 +腿 +膀 +膈 +膊 +膏 +膑 +膘 +膚 +膛 +膜 +膝 +膠 +膦 +膨 +膩 +膳 +膺 +膻 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臓 +臘 +臟 +臣 +臥 +臧 +臨 +自 +臬 +臭 +至 +致 +臺 +臻 +臼 +臾 +舀 +舂 +舅 +舆 +與 +興 +舉 +舊 +舌 +舍 +舎 +舐 +舒 +舔 +舖 +舗 +舛 +舜 +舞 +舟 +航 +舫 +般 +舰 +舱 +舵 +舶 +舷 +舸 +船 +舺 +舾 +艇 +艋 +艘 +艙 +艦 +艮 +良 +艰 +艱 +色 +艳 +艷 +艹 +艺 +艾 +节 +芃 +芈 +芊 +芋 +芍 +芎 +芒 +芙 +芜 +芝 +芡 +芥 +芦 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芷 +芸 +芹 +芻 +芽 +芾 +苁 +苄 +苇 +苋 +苍 +苏 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苞 +苟 +苡 +苣 +若 +苦 +苫 +苯 +英 +苷 +苹 +苻 +茁 +茂 +范 +茄 +茅 +茉 +茎 +茏 +茗 +茜 +茧 +茨 +茫 +茬 +茭 +茯 +茱 +茲 +茴 +茵 +茶 +茸 +茹 +茼 +荀 +荃 +荆 +草 +荊 +荏 +荐 +荒 +荔 +荖 +荘 +荚 +荞 +荟 +荠 +荡 +荣 +荤 +荥 +荧 +荨 +荪 +荫 +药 +荳 +荷 +荸 +荻 +荼 +荽 +莅 +莆 +莉 +莊 +莎 +莒 +莓 +莖 +莘 +莞 +莠 +莢 +莧 +莪 +莫 +莱 +莲 +莴 +获 +莹 +莺 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菊 +菌 +菏 +菓 +菖 +菘 +菜 +菟 +菠 +菡 +菩 +華 +菱 +菲 +菸 +菽 +萁 +萃 +萄 +萊 +萋 +萌 +萍 +萎 +萘 +萝 +萤 +营 +萦 +萧 +萨 +萩 +萬 +萱 +萵 +萸 +萼 +落 +葆 +葉 +著 +葚 +葛 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葱 +葳 +葵 +葷 +葺 +蒂 +蒋 +蒐 +蒔 +蒙 +蒜 +蒞 +蒟 +蒡 +蒨 +蒲 +蒸 +蒹 +蒻 +蒼 +蒿 +蓁 +蓄 +蓆 +蓉 +蓋 +蓑 +蓓 +蓖 +蓝 +蓟 +蓦 +蓬 +蓮 +蓼 +蓿 +蔑 +蔓 +蔔 +蔗 +蔘 +蔚 +蔡 +蔣 +蔥 +蔫 +蔬 +蔭 +蔵 +蔷 +蔺 +蔻 +蔼 +蔽 +蕁 +蕃 +蕈 +蕉 +蕊 +蕎 +蕙 +蕤 +蕨 +蕩 +蕪 +蕭 +蕲 +蕴 +蕻 +蕾 +薄 +薅 +薇 +薈 +薊 +薏 +薑 +薔 +薙 +薛 +薦 +薨 +薩 +薪 +薬 +薯 +薰 +薹 +藉 +藍 +藏 +藐 +藓 +藕 +藜 +藝 +藤 +藥 +藩 +藹 +藻 +藿 +蘆 +蘇 +蘊 +蘋 +蘑 +蘚 +蘭 +蘸 +蘼 +蘿 +虎 +虏 +虐 +虑 +虔 +處 +虚 +虛 +虜 +虞 +號 +虢 +虧 +虫 +虬 +虱 +虹 +虻 +虽 +虾 +蚀 +蚁 +蚂 +蚊 +蚌 +蚓 +蚕 +蚜 +蚝 +蚣 +蚤 +蚩 +蚪 +蚯 +蚱 +蚵 +蛀 +蛆 +蛇 +蛊 +蛋 +蛎 +蛐 +蛔 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛰 +蛳 +蛹 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜊 +蜍 +蜒 +蜓 +蜕 +蜗 +蜘 +蜚 +蜜 +蜡 +蜢 +蜥 +蜱 +蜴 +蜷 +蜻 +蜿 +蝇 +蝈 +蝉 +蝌 +蝎 +蝕 +蝗 +蝙 +蝟 +蝠 +蝦 +蝨 +蝴 +蝶 +蝸 +蝼 +螂 +螃 +融 +螞 +螢 +螨 +螯 +螳 +螺 +蟀 +蟄 +蟆 +蟋 +蟎 +蟑 +蟒 +蟠 +蟬 +蟲 +蟹 +蟻 +蟾 +蠅 +蠍 +蠔 +蠕 +蠛 +蠟 +蠡 +蠢 +蠣 +蠱 +蠶 +蠹 +蠻 +血 +衄 +衅 +衆 +行 +衍 +術 +衔 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +补 +表 +衩 +衫 +衬 +衮 +衰 +衲 +衷 +衹 +衾 +衿 +袁 +袂 +袄 +袅 +袈 +袋 +袍 +袒 +袖 +袜 +袞 +袤 +袪 +被 +袭 +袱 +裁 +裂 +装 +裆 +裊 +裏 +裔 +裕 +裘 +裙 +補 +裝 +裟 +裡 +裤 +裨 +裱 +裳 +裴 +裸 +裹 +製 +裾 +褂 +複 +褐 +褒 +褓 +褔 +褚 +褥 +褪 +褫 +褲 +褶 +褻 +襁 +襄 +襟 +襠 +襪 +襬 +襯 +襲 +西 +要 +覃 +覆 +覇 +見 +規 +覓 +視 +覚 +覦 +覧 +親 +覬 +観 +覷 +覺 +覽 +觀 +见 +观 +规 +觅 +视 +览 +觉 +觊 +觎 +觐 +觑 +角 +觞 +解 +觥 +触 +觸 +言 +訂 +計 +訊 +討 +訓 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詆 +詐 +詔 
+評 +詛 +詞 +詠 +詡 +詢 +詣 +試 +詩 +詫 +詬 +詭 +詮 +詰 +話 +該 +詳 +詹 +詼 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誹 +誼 +調 +諄 +談 +請 +諏 +諒 +論 +諗 +諜 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諳 +諷 +諸 +諺 +諾 +謀 +謁 +謂 +謄 +謊 +謎 +謐 +謔 +謗 +謙 +講 +謝 +謠 +謨 +謬 +謹 +謾 +譁 +證 +譎 +譏 +識 +譙 +譚 +譜 +警 +譬 +譯 +議 +譲 +譴 +護 +譽 +讀 +變 +讓 +讚 +讞 +计 +订 +认 +讥 +讧 +讨 +让 +讪 +讫 +训 +议 +讯 +记 +讲 +讳 +讴 +讶 +讷 +许 +讹 +论 +讼 +讽 +设 +访 +诀 +证 +诃 +评 +诅 +识 +诈 +诉 +诊 +诋 +词 +诏 +译 +试 +诗 +诘 +诙 +诚 +诛 +话 +诞 +诟 +诠 +诡 +询 +诣 +诤 +该 +详 +诧 +诩 +诫 +诬 +语 +误 +诰 +诱 +诲 +说 +诵 +诶 +请 +诸 +诺 +读 +诽 +课 +诿 +谀 +谁 +调 +谄 +谅 +谆 +谈 +谊 +谋 +谌 +谍 +谎 +谏 +谐 +谑 +谒 +谓 +谔 +谕 +谗 +谘 +谙 +谚 +谛 +谜 +谟 +谢 +谣 +谤 +谥 +谦 +谧 +谨 +谩 +谪 +谬 +谭 +谯 +谱 +谲 +谴 +谶 +谷 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豚 +象 +豢 +豪 +豫 +豬 +豹 +豺 +貂 +貅 +貌 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貳 +貴 +貶 +買 +貸 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賓 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +質 +賬 +賭 +賴 +賺 +購 +賽 +贅 +贈 +贊 +贍 +贏 +贓 +贖 +贛 +贝 +贞 +负 +贡 +财 +责 +贤 +败 +账 +货 +质 +贩 +贪 +贫 +贬 +购 +贮 +贯 +贰 +贱 +贲 +贴 +贵 +贷 +贸 +费 +贺 +贻 +贼 +贾 +贿 +赁 +赂 +赃 +资 +赅 +赈 +赊 +赋 +赌 +赎 +赏 +赐 +赓 +赔 +赖 +赘 +赚 +赛 +赝 +赞 +赠 +赡 +赢 +赣 +赤 +赦 +赧 +赫 +赭 +走 +赳 +赴 +赵 +赶 +起 +趁 +超 +越 +趋 +趕 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趸 +趺 +趾 +跃 +跄 +跆 +跋 +跌 +跎 +跑 +跖 +跚 +跛 +距 +跟 +跡 +跤 +跨 +跩 +跪 +路 +跳 +践 +跷 +跹 +跺 +跻 +踉 +踊 +踌 +踏 +踐 +踝 +踞 +踟 +踢 +踩 +踪 +踮 +踱 +踴 +踵 +踹 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹑 +蹒 +蹙 +蹟 +蹣 +蹤 +蹦 +蹩 +蹬 +蹭 +蹲 +蹴 +蹶 +蹺 +蹼 +蹿 +躁 +躇 +躉 +躊 +躋 +躍 +躏 +躪 +身 +躬 +躯 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軼 +軽 +軾 +較 +載 +輒 +輓 +輔 +輕 +輛 +輝 +輟 +輩 +輪 +輯 +輸 +輻 +輾 +輿 +轄 +轅 +轆 +轉 +轍 +轎 +轟 +车 +轧 +轨 +轩 +转 +轭 +轮 +软 +轰 +轲 +轴 +轶 +轻 +轼 +载 +轿 +较 +辄 +辅 +辆 +辇 +辈 +辉 +辊 +辍 +辐 +辑 +输 +辕 +辖 +辗 +辘 +辙 +辛 +辜 +辞 +辟 +辣 +辦 +辨 +辩 +辫 +辭 +辮 +辯 +辰 +辱 +農 +边 +辺 +辻 +込 +辽 +达 +迁 +迂 +迄 +迅 +过 +迈 +迎 +运 +近 +返 +还 +这 +进 +远 +违 +连 +迟 +迢 +迤 +迥 +迦 +迩 +迪 +迫 +迭 +述 +迴 +迷 +迸 +迹 +迺 +追 +退 +送 +适 +逃 +逅 +逆 +选 +逊 +逍 +透 +逐 +递 +途 +逕 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逮 +週 +進 +逵 +逶 +逸 +逻 +逼 +逾 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遏 +遐 +遑 +遒 +道 +達 +違 +遗 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遥 +遨 +適 +遭 +遮 +遲 +遴 +遵 +遶 +遷 +選 +遺 +遼 +遽 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邊 +邋 +邏 +邑 +邓 +邕 +邛 +邝 +邢 +那 +邦 +邨 +邪 +邬 +邮 +邯 +邰 +邱 +邳 +邵 +邸 +邹 +邺 +邻 +郁 +郅 +郊 +郎 +郑 +郜 +郝 +郡 +郢 +郤 +郦 +郧 +部 +郫 +郭 +郴 +郵 +郷 +郸 +都 +鄂 +鄉 +鄒 +鄔 +鄙 +鄞 +鄢 +鄧 +鄭 +鄰 +鄱 +鄲 +鄺 +酉 +酊 +酋 +酌 +配 +酐 +酒 +酗 +酚 +酝 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酱 +酵 +酶 +酷 +酸 +酿 +醃 +醇 +醉 +醋 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +釀 +釁 +采 +釉 +释 +釋 +里 +重 +野 +量 +釐 +金 +釗 +釘 +釜 +針 +釣 +釦 +釧 +釵 +鈀 +鈉 +鈍 +鈎 +鈔 +鈕 +鈞 +鈣 +鈦 +鈪 +鈴 +鈺 +鈾 +鉀 +鉄 +鉅 +鉉 +鉑 +鉗 +鉚 +鉛 +鉤 +鉴 +鉻 +銀 +銃 +銅 +銑 +銓 +銖 +銘 +銜 +銬 +銭 +銮 +銳 +銷 +銹 +鋁 +鋅 +鋒 +鋤 +鋪 +鋰 +鋸 +鋼 +錄 +錐 +錘 +錚 +錠 +錢 +錦 +錨 +錫 +錮 +錯 +録 +錳 +錶 +鍊 +鍋 +鍍 +鍛 +鍥 +鍰 +鍵 +鍺 +鍾 +鎂 +鎊 +鎌 +鎏 +鎔 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎳 +鏈 +鏖 +鏗 +鏘 +鏞 +鏟 +鏡 +鏢 +鏤 +鏽 +鐘 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鑄 +鑊 +鑑 +鑒 +鑣 +鑫 +鑰 +鑲 +鑼 +鑽 +鑾 +鑿 +针 +钉 +钊 +钎 +钏 +钒 +钓 +钗 +钙 +钛 +钜 +钝 +钞 +钟 +钠 +钡 +钢 +钣 +钤 +钥 +钦 +钧 +钨 +钩 +钮 +钯 +钰 +钱 +钳 +钴 +钵 +钺 +钻 +钼 +钾 +钿 +铀 +铁 +铂 +铃 +铄 +铅 +铆 +铉 +铎 +铐 +铛 +铜 +铝 +铠 +铡 +铢 +铣 +铤 +铨 +铩 +铬 +铭 +铮 +铰 +铲 +铵 +银 +铸 +铺 +链 +铿 +销 +锁 +锂 +锄 +锅 +锆 +锈 +锉 +锋 +锌 +锏 +锐 +锑 +错 +锚 +锟 +锡 +锢 +锣 +锤 +锥 +锦 +锭 +键 +锯 +锰 +锲 +锵 +锹 +锺 +锻 +镀 +镁 +镂 +镇 +镉 +镌 +镍 +镐 +镑 +镕 +镖 +镗 +镛 +镜 +镣 +镭 +镯 +镰 +镳 +镶 +長 +长 +門 +閃 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閡 +関 +閣 +閥 +閨 +閩 +閱 +閲 +閹 +閻 +閾 +闆 +闇 +闊 +闌 +闍 +闔 +闕 +闖 +闘 +關 +闡 +闢 +门 +闪 +闫 +闭 +问 +闯 +闰 +闲 +间 +闵 +闷 +闸 +闹 +闺 +闻 +闽 +闾 +阀 +阁 +阂 +阅 +阆 +阇 +阈 +阉 +阎 +阐 +阑 +阔 +阕 +阖 +阙 +阚 +阜 +队 +阡 +阪 +阮 +阱 +防 +阳 +阴 +阵 +阶 +阻 +阿 +陀 +陂 +附 +际 +陆 +陇 +陈 +陋 +陌 +降 +限 +陕 +陛 +陝 +陞 +陟 +陡 +院 +陣 +除 +陨 +险 +陪 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +隍 +階 +随 +隐 +隔 +隕 +隘 +隙 +際 +障 +隠 +隣 +隧 +隨 +險 +隱 +隴 +隶 +隸 +隻 +隼 +隽 +难 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雏 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雞 +離 +難 +雨 +雪 +雯 +雰 +雲 +雳 +零 +雷 +雹 +電 +雾 +需 +霁 +霄 +霆 +震 +霈 +霉 +霊 +霍 
+霎 +霏 +霑 +霓 +霖 +霜 +霞 +霧 +霭 +霰 +露 +霸 +霹 +霽 +霾 +靂 +靄 +靈 +青 +靓 +靖 +静 +靚 +靛 +靜 +非 +靠 +靡 +面 +靥 +靦 +革 +靳 +靴 +靶 +靼 +鞅 +鞋 +鞍 +鞏 +鞑 +鞘 +鞠 +鞣 +鞦 +鞭 +韆 +韋 +韌 +韓 +韜 +韦 +韧 +韩 +韬 +韭 +音 +韵 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +頗 +領 +頜 +頡 +頤 +頫 +頭 +頰 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顔 +願 +顛 +類 +顧 +顫 +顯 +顱 +顴 +页 +顶 +顷 +项 +顺 +须 +顼 +顽 +顾 +顿 +颁 +颂 +预 +颅 +领 +颇 +颈 +颉 +颊 +颌 +颍 +颐 +频 +颓 +颔 +颖 +颗 +题 +颚 +颛 +颜 +额 +颞 +颠 +颡 +颢 +颤 +颦 +颧 +風 +颯 +颱 +颳 +颶 +颼 +飄 +飆 +风 +飒 +飓 +飕 +飘 +飙 +飚 +飛 +飞 +食 +飢 +飨 +飩 +飪 +飯 +飲 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餒 +餓 +餘 +餚 +餛 +餞 +餡 +館 +餮 +餵 +餾 +饅 +饈 +饋 +饌 +饍 +饑 +饒 +饕 +饗 +饞 +饥 +饨 +饪 +饬 +饭 +饮 +饯 +饰 +饱 +饲 +饴 +饵 +饶 +饷 +饺 +饼 +饽 +饿 +馀 +馁 +馄 +馅 +馆 +馈 +馋 +馍 +馏 +馒 +馔 +首 +馗 +香 +馥 +馨 +馬 +馭 +馮 +馳 +馴 +駁 +駄 +駅 +駆 +駐 +駒 +駕 +駛 +駝 +駭 +駱 +駿 +騁 +騎 +騏 +験 +騙 +騨 +騰 +騷 +驀 +驅 +驊 +驍 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驥 +马 +驭 +驮 +驯 +驰 +驱 +驳 +驴 +驶 +驷 +驸 +驹 +驻 +驼 +驾 +驿 +骁 +骂 +骄 +骅 +骆 +骇 +骈 +骊 +骋 +验 +骏 +骐 +骑 +骗 +骚 +骛 +骜 +骞 +骠 +骡 +骤 +骥 +骧 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髂 +髅 +髋 +髏 +髒 +髓 +體 +髖 +高 +髦 +髪 +髮 +髯 +髻 +鬃 +鬆 +鬍 +鬓 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬱 +鬼 +魁 +魂 +魄 +魅 +魇 +魍 +魏 +魔 +魘 +魚 +魯 +魷 +鮑 +鮨 +鮪 +鮭 +鮮 +鯉 +鯊 +鯖 +鯛 +鯨 +鯰 +鯽 +鰍 +鰓 +鰭 +鰲 +鰻 +鰾 +鱈 +鱉 +鱔 +鱗 +鱷 +鱸 +鱼 +鱿 +鲁 +鲈 +鲍 +鲑 +鲛 +鲜 +鲟 +鲢 +鲤 +鲨 +鲫 +鲱 +鲲 +鲶 +鲷 +鲸 +鳃 +鳄 +鳅 +鳌 +鳍 +鳕 +鳖 +鳗 +鳝 +鳞 +鳥 +鳩 +鳳 +鳴 +鳶 +鴉 +鴕 +鴛 +鴦 +鴨 +鴻 +鴿 +鵑 +鵜 +鵝 +鵡 +鵬 +鵰 +鵲 +鶘 +鶩 +鶯 +鶴 +鷗 +鷲 +鷹 +鷺 +鸚 +鸞 +鸟 +鸠 +鸡 +鸢 +鸣 +鸥 +鸦 +鸨 +鸪 +鸭 +鸯 +鸳 +鸵 +鸽 +鸾 +鸿 +鹂 +鹃 +鹄 +鹅 +鹈 +鹉 +鹊 +鹌 +鹏 +鹑 +鹕 +鹘 +鹜 +鹞 +鹤 +鹦 +鹧 +鹫 +鹭 +鹰 +鹳 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麋 +麒 +麓 +麗 +麝 +麟 +麥 +麦 +麩 +麴 +麵 +麸 +麺 +麻 +麼 +麽 +麾 +黃 +黄 +黍 +黎 +黏 +黑 +黒 +黔 +默 +黛 +黜 +黝 +點 +黠 +黨 +黯 +黴 +鼋 +鼎 +鼐 +鼓 +鼠 +鼬 +鼹 +鼻 +鼾 +齁 +齊 +齋 +齐 +齒 +齡 +齢 +齣 +齦 +齿 +龄 +龅 +龈 +龊 +龋 +龌 +龍 +龐 +龔 +龕 +龙 +龚 +龛 +龜 +龟 +︰ +︱ +︶ +︿ +﹁ +﹂ +﹍ +﹏ +﹐ +﹑ +﹒ +﹔ +﹕ +﹖ +﹗ +﹙ +﹚ +﹝ +﹞ +﹡ +﹣ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +。 +「 +」 +、 +・ +ッ +ー +イ +ク +シ +ス +ト +ノ +フ +ラ +ル +ン +゙ +゚ + ̄ +¥ +👍 +🔥 +😂 +😎 +... 
+yam +10 +2017 +12 +11 +2016 +20 +30 +15 +06 +lofter +##s +2015 +by +16 +14 +18 +13 +24 +17 +2014 +21 +##0 +22 +19 +25 +23 +com +100 +00 +05 +2013 +##a +03 +09 +08 +28 +##2 +50 +01 +04 +##1 +27 +02 +2012 +##3 +26 +##e +07 +##8 +##5 +##6 +##4 +##9 +##7 +29 +2011 +40 +##t +2010 +##o +##d +##i +2009 +##n +app +www +the +##m +31 +##c +##l +##y +##r +##g +2008 +60 +http +200 +qq +##p +80 +##f +google +pixnet +90 +cookies +tripadvisor +500 +##er +##k +35 +##h +facebook +2007 +2000 +70 +##b +of +##x +##u +45 +300 +iphone +32 +1000 +2006 +48 +ip +36 +in +38 +3d +##w +##ing +55 +ctrip +##on +##v +33 +##の +to +34 +400 +id +2005 +it +37 +windows +llc +top +99 +42 +39 +000 +led +at +##an +41 +51 +52 +46 +49 +43 +53 +44 +##z +android +58 +and +59 +2004 +56 +vr +##か +5000 +2003 +47 +blogthis +twitter +54 +##le +150 +ok +2018 +57 +75 +cn +no +ios +##in +##mm +##00 +800 +on +te +3000 +65 +2001 +360 +95 +ig +lv +120 +##ng +##を +##us +##に +pc +てす +── +600 +##te +85 +2002 +88 +##ed +html +ncc +wifi +email +64 +blog +is +##10 +##て +mail +online +##al +dvd +##ic +studio +##は +##℃ +##ia +##と +line +vip +72 +##q +98 +##ce +##en +for +##is +##ra +##es +##j +usb +net +cp +1999 +asia +4g +##cm +diy +new +3c +##お +ta +66 +language +vs +apple +tw +86 +web +##ne +ipad +62 +you +##re +101 +68 +##tion +ps +de +bt +pony +atm +##2017 +1998 +67 +##ch +ceo +##or +go +##na +av +pro +cafe +96 +pinterest +97 +63 +pixstyleme3c +##ta +more +said +##2016 +1997 +mp3 +700 +##ll +nba +jun +##20 +92 +tv +1995 +pm +61 +76 +nbsp +250 +##ie +linux +##ma +cd +110 +hd +##17 +78 +##ion +77 +6000 +am +##th +##st +94 +##se +##et +69 +180 +gdp +my +105 +81 +abc +89 +flash +79 +one +93 +1990 +1996 +##ck +gps +##も +##ly +web885 +106 +2020 +91 +##ge +4000 +1500 +xd +boss +isbn +1994 +org +##ry +me +love +##11 +0fork +73 +##12 +3g +##ter +##ar +71 +82 +##la +hotel +130 +1970 +pk +83 +87 +140 +ie +##os +##30 +##el +74 +##50 +seo +cpu +##ml +p2p +84 +may +##る +sun +tue +internet +cc +posted +youtube +##at +##ン +##man +ii +##ル +##15 +abs +nt +pdf +yahoo +ago +1980 +##it +news +mac +104 +##てす +##me +##り +java +1992 +spa +##de +##nt +hk +all +plus +la +1993 +##mb +##16 +##ve +west +##da +160 +air +##い +##ps +から +##to +1989 +logo +htc +php +https +fi +momo +##son +sat +##ke +##80 +ebd +suv +wi +day +apk +##88 +##um +mv +galaxy +wiki +or +brake +##ス +1200 +する +this +1991 +mon +##こ +❤2017 +po +##ない +javascript +life +home +june +##ss +system +900 +##ー +##0 +pp +1988 +world +fb +4k +br +##as +ic +ai +leonardo +safari +##60 +live +free +xx +wed +win7 +kiehl +##co +lg +o2o +##go +us +235 +1949 +mm +しい +vfm +kanye +##90 +##2015 +##id +jr +##ey +123 +rss +##sa +##ro +##am +##no +thu +fri +350 +##sh +##ki +103 +comments +name +##のて +##pe +##ine +max +1987 +8000 +uber +##mi +##ton +wordpress +office +1986 +1985 +##ment +107 +bd +win10 +##ld +##li +gmail +bb +dior +##rs +##ri +##rd +##ます +up +cad +##® +dr +して +read +##21 +をお +##io +##99 +url +1984 +pvc +paypal +show +policy +##40 +##ty +##18 +with +##★ +##01 +txt +102 +##ba +dna +from +post +mini +ar +taiwan +john +##ga +privacy +agoda +##13 +##ny +word +##24 +##22 +##by +##ur +##hz +1982 +##ang +265 +cookie +netscape +108 +##ka +##~ +##ad +house +share +note +ibm +code +hello +nike +sim +survey +##016 +1979 +1950 +wikia +##32 +##017 +5g +cbc +##tor +##kg +1983 +##rt +##14 +campaign +store +2500 +os +##ct +##ts +##° +170 +api +##ns +365 +excel +##な +##ao +##ら +##し +~~ +##nd +university +163 +には +518 +##70 +##ya +##il +##25 +pierre +ipo +0020 +897 +##23 +hotels +##ian +のお +125 +years +6606 +##ers +##26 +high 
+##day +time +##ay +bug +##line +##く +##す +##be +xp +talk2yam +yamservice +10000 +coco +##dy +sony +##ies +1978 +microsoft +david +people +##ha +1960 +instagram +intel +その +##ot +iso +1981 +##va +115 +##mo +##land +xxx +man +co +ltxsw +##ation +baby +220 +##pa +##ol +1945 +7000 +tag +450 +##ue +msn +##31 +oppo +##ト +##ca +control +##om +st +chrome +##ure +##ん +be +##き +lol +##19 +した +##bo +240 +lady +##100 +##way +##から +4600 +##ko +##do +##un +4s +corporation +168 +##ni +herme +##28 +cp +978 +##up +##06 +ui +##ds +ppt +admin +three +します +bbc +re +128 +##48 +ca +##015 +##35 +hp +##ee +tpp +##た +##ive +×× +root +##cc +##ました +##ble +##ity +adobe +park +114 +et +oled +city +##ex +##ler +##ap +china +##book +20000 +view +##ice +global +##km +your +hong +##mg +out +##ms +ng +ebay +##29 +menu +ubuntu +##cy +rom +##view +open +ktv +do +server +##lo +if +english +##ね +##5 +##oo +1600 +##02 +step1 +kong +club +135 +july +inc +1976 +mr +hi +##net +touch +##ls +##ii +michael +lcd +##05 +##33 +phone +james +step2 +1300 +ios9 +##box +dc +##2 +##ley +samsung +111 +280 +pokemon +css +##ent +##les +いいえ +##1 +s8 +atom +play +bmw +##said +sa +etf +ctrl +♥yoyo♥ +##55 +2025 +##2014 +##66 +adidas +amazon +1958 +##ber +##ner +visa +##77 +##der +1800 +connectivity +##hi +firefox +109 +118 +hr +so +style +mark +pop +ol +skip +1975 +as +##27 +##ir +##61 +190 +mba +##う +##ai +le +##ver +1900 +cafe2017 +lte +super +113 +129 +##ron +amd +like +##☆ +are +##ster +we +##sk +paul +data +international +##ft +longchamp +ssd +good +##ート +##ti +reply +##my +↓↓↓ +apr +star +##ker +source +136 +js +112 +get +force +photo +##one +126 +##2013 +##ow +link +bbs +1972 +goods +##lin +python +119 +##ip +game +##ics +##ません +blue +##● +520 +##45 +page +itunes +##03 +1955 +260 +1968 +gt +gif +618 +##ff +##47 +group +くたさい +about +bar +ganji +##nce +music +lee +not +1977 +1971 +1973 +##per +an +faq +comment +##って +days +##ock +116 +##bs +1974 +1969 +v1 +player +1956 +xbox +sql +fm +f1 +139 +##ah +210 +##lv +##mp +##000 +melody +1957 +##3 +550 +17life +199 +1966 +xml +market +##au +##71 +999 +##04 +what +gl +##95 +##age +tips +##68 +book +##ting +mysql +can +1959 +230 +##ung +wonderland +watch +10℃ +##ction +9000 +mar +mobile +1946 +1962 +article +##db +part +▲top +party +って +1967 +1964 +1948 +##07 +##ore +##op +この +dj +##78 +##38 +010 +main +225 +1965 +##ong +art +320 +ad +134 +020 +##73 +117 +pm2 +japan +228 +##08 +ts +1963 +##ica +der +sm +##36 +2019 +##wa +ct +##7 +##や +##64 +1937 +homemesh +search +##85 +##れは +##tv +##di +macbook +##9 +##くたさい +service +##♥ +type +った +750 +##ier +##si +##75 +##います +##ok +best +##ット +goris +lock +##った +cf +3m +big +##ut +ftp +carol +##vi +10 +1961 +happy +sd +##ac +122 +anti +pe +cnn +iii +1920 +138 +##ラ +1940 +esp +jan +tags +##98 +##51 +august +vol +##86 +154 +##™ +##fs +##れ +##sion +design +ac +##ム +press +jordan +ppp +that +key +check +##6 +##tt +##㎡ +1080p +##lt +power +##42 +1952 +##bc +vivi +##ック +he +133 +121 +jpg +##rry +201 +175 +3500 +1947 +nb +##ted +##rn +しています +1954 +usd +##t00 +master +##ンク +001 +model +##58 +al +##09 +1953 +##34 +ram +goo +ても +##ui +127 +1930 +red +##ary +rpg +item +##pm +##41 +270 +##za +project +##2012 +hot +td +blogabstract +##ger +##62 +650 +##44 +gr2 +##します +##m +black +electronic +nfc +year +asus +また +html5 +cindy +##hd +m3 +132 +esc +##od +booking +##53 +fed +tvb +##81 +##ina +mit +165 +##いる +chan +192 +distribution +next +になる +peter +bios +steam +cm +1941 +にも +pk10 +##ix +##65 +##91 +dec +nasa +##ana +icecat +00z +b1 +will +##46 +li +se +##ji +##み +##ard +oct 
+##ain +jp +##ze +##bi +cio +##56 +smart +h5 +##39 +##port +curve +vpn +##nm +##dia +utc +##あり +12345678910 +##52 +rmvb +chanel +a4 +miss +##and +##im +media +who +##63 +she +girl +5s +124 +vera +##して +class +vivo +king +##フ +##ei +national +ab +1951 +5cm +888 +145 +ipod +ap +1100 +5mm +211 +ms +2756 +##69 +mp4 +msci +##po +##89 +131 +mg +index +380 +##bit +##out +##zz +##97 +##67 +158 +apec +##8 +photoshop +opec +¥799 +ては +##96 +##tes +##ast +2g +○○ +##ール +¥2899 +##ling +##よ +##ory +1938 +##ical +kitty +content +##43 +step3 +##cn +win8 +155 +vc +1400 +iphone7 +robert +##した +tcl +137 +beauty +##87 +en +dollars +##ys +##oc +step +pay +yy +a1 +##2011 +##lly +##ks +##♪ +1939 +188 +download +1944 +sep +exe +ph +います +school +gb +center +pr +street +##board +uv +##37 +##lan +winrar +##que +##ua +##com +1942 +1936 +480 +gpu +##4 +ettoday +fu +tom +##54 +##ren +##via +149 +##72 +b2b +144 +##79 +##tch +rose +arm +mb +##49 +##ial +##nn +nvidia +step4 +mvp +00㎡ +york +156 +##イ +how +cpi +591 +2765 +gov +kg +joe +##xx +mandy +pa +##ser +copyright +fashion +1935 +don +##け +ecu +##ist +##art +erp +wap +have +##lm +talk +##ek +##ning +##if +ch +##ite +video +1943 +cs +san +iot +look +##84 +##2010 +##ku +october +##ux +trump +##hs +##ide +box +141 +first +##ins +april +##ight +##83 +185 +angel +protected +aa +151 +162 +x1 +m2 +##fe +##× +##ho +size +143 +min +ofo +fun +gomaji +ex +hdmi +food +dns +march +chris +kevin +##のか +##lla +##pp +##ec +ag +ems +6s +720p +##rm +##ham +off +##92 +asp +team +fandom +ed +299 +▌♥ +##ell +info +されています +##82 +sina +4066 +161 +##able +##ctor +330 +399 +315 +dll +rights +ltd +idc +jul +3kg +1927 +142 +ma +surface +##76 +##ク +~~~ +304 +mall +eps +146 +green +##59 +map +space +donald +v2 +sodu +##light +1931 +148 +1700 +まて +310 +reserved +htm +##han +##57 +2d +178 +mod +##ise +##tions +152 +ti +##shi +doc +1933 +icp +055 +wang +##ram +shopping +aug +##pi +##well +now +wam +b2 +からお +##hu +236 +1928 +##gb +266 +f2 +##93 +153 +mix +##ef +##uan +bwl +##plus +##res +core +##ess +tea +5℃ +hktvmall +nhk +##ate +list +##ese +301 +feb +4m +inn +ての +nov +159 +12345 +daniel +##ci +pass +##bet +##nk +coffee +202 +ssl +airbnb +##ute +fbi +woshipm +skype +ea +cg +sp +##fc +##www +yes +edge +alt +007 +##94 +fpga +##ght +##gs +iso9001 +さい +##ile +##wood +##uo +image +lin +icon +american +##em +1932 +set +says +##king +##tive +blogger +##74 +なと +256 +147 +##ox +##zy +##red +##ium +##lf +nokia +claire +##リ +##ding +november +lohas +##500 +##tic +##マ +##cs +##ある +##che +##ire +##gy +##ult +db +january +win +##カ +166 +road +ptt +##ま +##つ +198 +##fa +##mer +anna +pchome +はい +udn +ef +420 +##time +##tte +2030 +##ア +g20 +white +かかります +1929 +308 +garden +eleven +di +##おります +chen +309b +777 +172 +young +cosplay +ちてない +4500 +bat +##123 +##tra +##ては +kindle +npc +steve +etc +##ern +##| +call +xperia +ces +travel +sk +s7 +##ous +1934 +##int +みいたたけます +183 +edu +file +cho +qr +##car +##our +186 +##ant +##d +eric +1914 +rends +##jo +##する +mastercard +##2000 +kb +##min +290 +##ino +vista +##ris +##ud +jack +2400 +##set +169 +pos +1912 +##her +##ou +taipei +しく +205 +beta +##ませんか +232 +##fi +express +255 +body +##ill +aphojoy +user +december +meiki +##ick +tweet +richard +##av +##ᆫ +iphone6 +##dd +ちてすか +views +##mark +321 +pd +##00 +times +##▲ +level +##ash +10g +point +5l +##ome +208 +koreanmall +##ak +george +q2 +206 +wma +tcp +##200 +スタッフ +full +mlb +##lle +##watch +tm +run +179 +911 +smith +business +##und +1919 +color +##tal +222 +171 +##less +moon +4399 +##rl +update +pcb +shop +499 +157 +little +なし 
+end +##mhz +van +dsp +easy +660 +##house +##key +history +##o +oh +##001 +##hy +##web +oem +let +was +##2009 +##gg +review +##wan +182 +##°c +203 +uc +title +##val +united +233 +2021 +##ons +doi +trivago +overdope +sbs +##ance +##ち +grand +special +573032185 +imf +216 +wx17house +##so +##ーム +audi +##he +london +william +##rp +##ake +science +beach +cfa +amp +ps4 +880 +##800 +##link +##hp +crm +ferragamo +bell +make +##eng +195 +under +zh +photos +2300 +##style +##ント +via +176 +da +##gi +company +i7 +##ray +thomas +370 +ufo +i5 +##max +plc +ben +back +research +8g +173 +mike +##pc +##ッフ +september +189 +##ace +vps +february +167 +pantos +wp +lisa +1921 +★★ +jquery +night +long +offer +##berg +##news +1911 +##いて +ray +fks +wto +せます +over +164 +340 +##all +##rus +1924 +##888 +##works +blogtitle +loftpermalink +##→ +187 +martin +test +ling +km +##め +15000 +fda +v3 +##ja +##ロ +wedding +かある +outlet +family +##ea +をこ +##top +story +##ness +salvatore +##lu +204 +swift +215 +room +している +oracle +##ul +1925 +sam +b2c +week +pi +rock +##のは +##a +##けと +##ean +##300 +##gle +cctv +after +chinese +##back +powered +x2 +##tan +1918 +##nes +##イン +canon +only +181 +##zi +##las +say +##oe +184 +##sd +221 +##bot +##world +##zo +sky +made +top100 +just +1926 +pmi +802 +234 +gap +##vr +177 +les +174 +▲topoct +ball +vogue +vi +ing +ofweek +cos +##list +##ort +▲topmay +##なら +##lon +として +last +##tc +##of +##bus +##gen +real +eva +##コ +a3 +nas +##lie +##ria +##coin +##bt +▲topapr +his +212 +cat +nata +vive +health +⋯⋯ +drive +sir +▲topmar +du +cup +##カー +##ook +##よう +##sy +alex +msg +tour +しました +3ce +##word +193 +ebooks +r8 +block +318 +##より +2200 +nice +pvp +207 +months +1905 +rewards +##ther +1917 +0800 +##xi +##チ +##sc +micro +850 +gg +blogfp +op +1922 +daily +m1 +264 +true +##bb +ml +##tar +##のお +##ky +anthony +196 +253 +##yo +state +218 +##ara +##aa +##rc +##tz +##ston +より +gear +##eo +##ade +ge +see +1923 +##win +##ura +ss +heart +##den +##ita +down +##sm +el +png +2100 +610 +rakuten +whatsapp +bay +dream +add +##use +680 +311 +pad +gucci +mpv +##ode +##fo +island +▲topjun +##▼ +223 +jason +214 +chicago +##❤ +しの +##hone +io +##れる +##ことか +sogo +be2 +##ology +990 +cloud +vcd +##con +2~3 +##ford +##joy +##kb +##こさいます +##rade +but +##ach +docker +##ful +rfid +ul +##ase +hit +ford +##star +580 +##○ +11 +a2 +sdk +reading +edited +##are +cmos +##mc +238 +siri +light +##ella +##ため +bloomberg +##read +pizza +##ison +jimmy +##vm +college +node +journal +ba +18k +##play +245 +##cer +20 +magic +##yu +191 +jump +288 +tt +##ings +asr +##lia +3200 +step5 +network +##cd +mc +いします +1234 +pixstyleme +273 +##600 +2800 +money +★★★★★ +1280 +12 +430 +bl +みの +act +##tus +tokyo +##rial +##life +emba +##ae +saas +tcs +##rk +##wang +summer +##sp +ko +##ving +390 +premium +##その +netflix +##ヒ +uk +mt +##lton +right +frank +two +209 +える +##ple +##cal +021 +##んな +##sen +##ville +hold +nexus +dd +##ius +てお +##mah +##なく +tila +zero +820 +ce +##tin +resort +##ws +charles +old +p10 +5d +report +##360 +##ru +##には +bus +vans +lt +##est +pv +##レ +links +rebecca +##ツ +##dm +azure +##365 +きな +limited +bit +4gb +##mon +1910 +moto +##eam +213 +1913 +var +eos +なとの +226 +blogspot +された +699 +e3 +dos +dm +fc +##ments +##ik +##kw +boy +##bin +##ata +960 +er +##せ +219 +##vin +##tu +##ula +194 +##∥ +station +##ろ +##ature +835 +files +zara +hdr +top10 +nature +950 +magazine +s6 +marriott +##シ +avira +case +##っと +tab +##ran +tony +##home +oculus +im +##ral +jean +saint +cry +307 +rosie +##force +##ini +ice +##bert +のある +##nder +##mber +pet +2600 +##◆ +plurk 
+▲topdec +##sis +00kg +▲topnov +720 +##ence +tim +##ω +##nc +##ても +##name +log +ips +great +ikea +malaysia +unix +##イト +3600 +##ncy +##nie +12000 +akb48 +##ye +##oid +404 +##chi +##いた +oa +xuehai +##1000 +##orm +##rf +275 +さん +##ware +##リー +980 +ho +##pro +text +##era +560 +bob +227 +##ub +##2008 +8891 +scp +avi +##zen +2022 +mi +wu +museum +qvod +apache +lake +jcb +▲topaug +★★★ +ni +##hr +hill +302 +ne +weibo +490 +ruby +##ーシ +##ヶ +##row +4d +▲topjul +iv +##ish +github +306 +mate +312 +##スト +##lot +##ane +andrew +のハイト +##tina +t1 +rf +ed2k +##vel +##900 +way +final +りの +ns +5a +705 +197 +##メ +sweet +bytes +##ene +▲topjan +231 +##cker +##2007 +##px +100g +topapp +229 +helpapp +rs +low +14k +g4g +care +630 +ldquo +あり +##fork +leave +rm +edition +##gan +##zon +##qq +▲topsep +##google +##ism +gold +224 +explorer +##zer +toyota +category +select +visual +##labels +restaurant +##md +posts +s1 +##ico +もっと +angelababy +123456 +217 +sports +s3 +mbc +1915 +してくたさい +shell +x86 +candy +##new +kbs +face +xl +470 +##here +4a +swissinfo +v8 +▲topfeb +dram +##ual +##vice +3a +##wer +sport +q1 +ios10 +public +int +card +##c +ep +au +rt +##れた +1080 +bill +##mll +kim +30 +460 +wan +##uk +##ミ +x3 +298 +0t +scott +##ming +239 +e5 +##3d +h7n9 +worldcat +brown +##あります +##vo +##led +##580 +##ax +249 +410 +##ert +paris +##~6 +polo +925 +##lr +599 +##ナ +capital +##hing +bank +cv +1g +##chat +##s +##たい +adc +##ule +2m +##e +digital +hotmail +268 +##pad +870 +bbq +quot +##ring +before +wali +##まて +mcu +2k +2b +という +costco +316 +north +333 +switch +##city +##p +philips +##mann +management +panasonic +##cl +##vd +##ping +##rge +alice +##lk +##ましょう +css3 +##ney +vision +alpha +##ular +##400 +##tter +lz +にお +##ありません +mode +gre +1916 +pci +##tm +237 +1~2 +##yan +##そ +について +##let +##キ +work +war +coach +ah +mary +##ᅵ +huang +##pt +a8 +pt +follow +##berry +1895 +##ew +a5 +ghost +##ション +##wn +##og +south +##code +girls +##rid +action +villa +git +r11 +table +games +##cket +error +##anonymoussaid +##ag +here +##ame +##gc +qa +##■ +##lis +gmp +##gin +vmalife +##cher +yu +wedding +##tis +demo +dragon +530 +soho +social +bye +##rant +river +orz +acer +325 +##↑ +##ース +##ats +261 +del +##ven +440 +ups +##ように +##ター +305 +value +macd +yougou +##dn +661 +##ano +ll +##urt +##rent +continue +script +##wen +##ect +paper +263 +319 +shift +##chel +##フト +##cat +258 +x5 +fox +243 +##さん +car +aaa +##blog +loading +##yn +##tp +kuso +799 +si +sns +イカせるテンマ +ヒンクテンマ3 +rmb +vdc +forest +central +prime +help +ultra +##rmb +##ような +241 +square +688 +##しい +のないフロクに +##field +##reen +##ors +##ju +c1 +start +510 +##air +##map +cdn +##wo +cba +stephen +m8 +100km +##get +opera +##base +##ood +vsa +com™ +##aw +##ail +251 +なのて +count +t2 +##ᅡ +##een +2700 +hop +##gp +vsc +tree +##eg +##ose +816 +285 +##ories +##shop +alphago +v4 +1909 +simon +##ᆼ +fluke62max +zip +スホンサー +##sta +louis +cr +bas +##~10 +bc +##yer +hadoop +##ube +##wi +1906 +0755 +hola +##low +place +centre +5v +d3 +##fer +252 +##750 +##media +281 +540 +0l +exchange +262 +series +##ハー +##san +eb +##bank +##k +q3 +##nge +##mail +take +##lp +259 +1888 +client +east +cache +event +vincent +##ールを +きを +##nse +sui +855 +adchoice +##и +##stry +##なたの +246 +##zone +ga +apps +sea +##ab +248 +cisco +##タ +##rner +kymco +##care +dha +##pu +##yi +minkoff +royal +p1 +への +annie +269 +collection +kpi +playstation +257 +になります +866 +bh +##bar +queen +505 +radio +1904 +andy +armani +##xy +manager +iherb +##ery +##share +spring +raid +johnson +1908 +##ob +volvo +hall +##ball +v6 +our +taylor +##hk +bi +242 +##cp 
+kate +bo +water +technology +##rie +サイトは +277 +##ona +##sl +hpv +303 +gtx +hip +rdquo +jayz +stone +##lex +##rum +namespace +##やり +620 +##ale +##atic +des +##erson +##ql +##ves +##type +enter +##この +##てきます +d2 +##168 +##mix +##bian +との +a9 +jj +ky +##lc +access +movie +##hc +リストに +tower +##ration +##mit +ます +##nch +ua +tel +prefix +##o2 +1907 +##point +1901 +ott +~10 +##http +##ury +baidu +##ink +member +##logy +bigbang +nownews +##js +##shot +##tb +##こと +247 +eba +##tics +##lus +ける +v5 +spark +##ama +there +##ions +god +##lls +##down +hiv +##ress +burberry +day2 +##kv +◆◆ +jeff +related +film +edit +joseph +283 +##ark +cx +32gb +order +g9 +30000 +##ans +##tty +s5 +##bee +かあります +thread +xr +buy +sh +005 +land +spotify +mx +##ari +276 +##verse +×email +sf +why +##ことて +244 +7headlines +nego +sunny +dom +exo +401 +666 +positioning +fit +rgb +##tton +278 +kiss +alexa +adam +lp +みリストを +##g +mp +##ties +##llow +amy +##du +np +002 +institute +271 +##rth +##lar +2345 +590 +##des +sidebar +15 +imax +site +##cky +##kit +##ime +##009 +season +323 +##fun +##ンター +##ひ +gogoro +a7 +pu +lily +fire +twd600 +##ッセーシを +いて +##vis +30ml +##cture +##をお +information +##オ +close +friday +##くれる +yi +nick +てすか +##tta +##tel +6500 +##lock +cbd +economy +254 +かお +267 +tinker +double +375 +8gb +voice +##app +oops +channel +today +985 +##right +raw +xyz +##+ +jim +edm +##cent +7500 +supreme +814 +ds +##its +##asia +dropbox +##てすか +##tti +books +272 +100ml +##tle +##ller +##ken +##more +##boy +sex +309 +##dom +t3 +##ider +##なります +##unch +1903 +810 +feel +5500 +##かった +##put +により +s2 +mo +##gh +men +ka +amoled +div +##tr +##n1 +port +howard +##tags +ken +dnf +##nus +adsense +##а +ide +##へ +buff +thunder +##town +##ique +has +##body +auto +pin +##erry +tee +てした +295 +number +##the +##013 +object +psp +cool +udnbkk +16gb +##mic +miui +##tro +most +r2 +##alk +##nity +1880 +±0 +##いました +428 +s4 +law +version +##oa +n1 +sgs +docomo +##tf +##ack +henry +fc2 +##ded +##sco +##014 +##rite +286 +0mm +linkedin +##ada +##now +wii +##ndy +ucbug +##◎ +sputniknews +legalminer +##ika +##xp +2gb +##bu +q10 +oo +b6 +come +##rman +cheese +ming +maker +##gm +nikon +##fig +ppi +kelly +##ります +jchere +てきます +ted +md +003 +fgo +tech +##tto +dan +soc +##gl +##len +hair +earth +640 +521 +img +##pper +##a1 +##てきる +##ロク +acca +##ition +##ference +suite +##ig +outlook +##mond +##cation +398 +##pr +279 +101vip +358 +##999 +282 +64gb +3800 +345 +airport +##over +284 +##おり +jones +##ith +lab +##su +##いるのて +co2 +town +piece +##llo +no1 +vmware +24h +##qi +focus +reader +##admin +##ora +tb +false +##log +1898 +know +lan +838 +##ces +f4 +##ume +motel +stop +##oper +na +flickr +netcomponents +##af +##─ +pose +williams +local +##ound +##cg +##site +##iko +いお +274 +5m +gsm +con +##ath +1902 +friends +##hip +cell +317 +##rey +780 +cream +##cks +012 +##dp +facebooktwitterpinterestgoogle +sso +324 +shtml +song +swiss +##mw +##キンク +lumia +xdd +string +tiffany +522 +marc +られた +insee +russell +sc +dell +##ations +ok +camera +289 +##vs +##flow +##late +classic +287 +##nter +stay +g1 +mtv +512 +##ever +##lab +##nger +qe +sata +ryan +d1 +50ml +cms +##cing +su +292 +3300 +editor +296 +##nap +security +sunday +association +##ens +##700 +##bra +acg +##かり +sofascore +とは +mkv +##ign +jonathan +gary +build +labels +##oto +tesla +moba +qi +gohappy +general +ajax +1024 +##かる +サイト +society +##test +##urs +wps +fedora +##ich +mozilla +328 +##480 +##dr +usa +urn +##lina +##r +grace +##die +##try +##ader +1250 +##なり +elle +570 +##chen +##ᆯ +price +##ten +uhz +##ough +eq +##hen 
+states +push +session +balance +wow +506 +##cus +##py +when +##ward +##ep +34e +wong +library +prada +##サイト +##cle +running +##ree +313 +ck +date +q4 +##ctive +##ool +##> +mk +##ira +##163 +388 +die +secret +rq +dota +buffet +は1ヶ +e6 +##ez +pan +368 +ha +##card +##cha +2a +##さ +alan +day3 +eye +f3 +##end +france +keep +adi +rna +tvbs +##ala +solo +nova +##え +##tail +##ょう +support +##ries +##なる +##ved +base +copy +iis +fps +##ways +hero +hgih +profile +fish +mu +ssh +entertainment +chang +##wd +click +cake +##ond +pre +##tom +kic +pixel +##ov +##fl +product +6a +##pd +dear +##gate +es +yumi +audio +##² +##sky +echo +bin +where +##ture +329 +##ape +find +sap +isis +##なと +nand +##101 +##load +##ream +band +a6 +525 +never +##post +festival +50cm +##we +555 +guide +314 +zenfone +##ike +335 +gd +forum +jessica +strong +alexander +##ould +software +allen +##ious +program +360° +else +lohasthree +##gar +することかてきます +please +##れます +rc +##ggle +##ric +bim +50000 +##own +eclipse +355 +brian +3ds +##side +061 +361 +##other +##ける +##tech +##ator +485 +engine +##ged +##t +plaza +##fit +cia +ngo +westbrook +shi +tbs +50mm +##みませんか +sci +291 +reuters +##ily +contextlink +##hn +af +##cil +bridge +very +##cel +1890 +cambridge +##ize +15g +##aid +##data +790 +frm +##head +award +butler +##sun +meta +##mar +america +ps3 +puma +pmid +##すか +lc +670 +kitchen +##lic +オーフン5 +きなしソフトサーヒス +そして +day1 +future +★★★★ +##text +##page +##rris +pm1 +##ket +fans +##っています +1001 +christian +bot +kids +trackback +##hai +c3 +display +##hl +n2 +1896 +idea +さんも +##sent +airmail +##ug +##men +pwm +けます +028 +##lution +369 +852 +awards +schemas +354 +asics +wikipedia +font +##tional +##vy +c2 +293 +##れている +##dget +##ein +っている +contact +pepper +スキル +339 +##~5 +294 +##uel +##ument +730 +##hang +みてす +q5 +##sue +rain +##ndi +wei +swatch +##cept +わせ +331 +popular +##ste +##tag +p2 +501 +trc +1899 +##west +##live +justin +honda +ping +messenger +##rap +v9 +543 +##とは +unity +appqq +はすへて +025 +leo +##tone +##テ +##ass +uniqlo +##010 +502 +her +jane +memory +moneydj +##tical +human +12306 +していると +##m2 +coc +miacare +##mn +tmt +##core +vim +kk +##may +fan +target +use +too +338 +435 +2050 +867 +737 +fast +##2c +services +##ope +omega +energy +##わ +pinkoi +1a +##なから +##rain +jackson +##ement +##シャンルの +374 +366 +そんな +p9 +rd +##ᆨ +1111 +##tier +##vic +zone +##│ +385 +690 +dl +isofix +cpa +m4 +322 +kimi +めて +davis +##lay +lulu +##uck +050 +weeks +qs +##hop +920 +##n +ae +##ear +~5 +eia +405 +##fly +korea +jpeg +boost +##ship +small +##リア +1860 +eur +297 +425 +valley +##iel +simple +##ude +rn +k2 +##ena +されます +non +patrick +しているから +##ナー +feed +5757 +30g +process +well +qqmei +##thing +they +aws +lu +pink +##ters +##kin +または +board +##vertisement +wine +##ien +unicode +##dge +r1 +359 +##tant +いを +##twitter +##3c +cool1 +される +##れて +##l +isp +##012 +standard +45㎡2 +402 +##150 +matt +##fu +326 +##iner +googlemsn +pixnetfacebookyahoo +##ラン +x7 +886 +##uce +メーカー +sao +##ev +##きました +##file +9678 +403 +xddd +shirt +6l +##rio +##hat +3mm +givenchy +ya +bang +##lio +monday +crystal +ロクイン +##abc +336 +head +890 +ubuntuforumwikilinuxpastechat +##vc +##~20 +##rity +cnc +7866 +ipv6 +null +1897 +##ost +yang +imsean +tiger +##fet +##ンス +352 +##= +dji +327 +ji +maria +##come +##んて +foundation +3100 +##beth +##なった +1m +601 +active +##aft +##don +3p +sr +349 +emma +##khz +living +415 +353 +1889 +341 +709 +457 +sas +x6 +##face +pptv +x4 +##mate +han +sophie +##jing +337 +fifa +##mand +other +sale +inwedding +##gn +てきちゃいます +##mmy +##pmlast +bad +nana +nbc +してみてくたさいね 
+なとはお +##wu +##かあります +##あ +note7 +single +##340 +せからこ +してくたさい♪この +しにはとんとんワークケートを +するとあなたにもっとマッチした +ならワークケートへ +もみつかっちゃうかも +ワークケートの +##bel +window +##dio +##ht +union +age +382 +14 +##ivity +##y +コメント +domain +neo +##isa +##lter +5k +f5 +steven +##cts +powerpoint +tft +self +g2 +ft +##テル +zol +##act +mwc +381 +343 +もう +nbapop +408 +てある +eds +ace +##room +previous +author +tomtom +il +##ets +hu +financial +☆☆☆ +っています +bp +5t +chi +1gb +##hg +fairmont +cross +008 +gay +h2 +function +##けて +356 +also +1b +625 +##ータ +##raph +1894 +3~5 +##ils +i3 +334 +avenue +##host +による +##bon +##tsu +message +navigation +50g +fintech +h6 +##ことを +8cm +##ject +##vas +##firm +credit +##wf +xxxx +form +##nor +##space +huawei +plan +json +sbl +##dc +machine +921 +392 +wish +##120 +##sol +windows7 +edward +##ために +development +washington +##nsis +lo +818 +##sio +##ym +##bor +planet +##~8 +##wt +ieee +gpa +##めて +camp +ann +gm +##tw +##oka +connect +##rss +##work +##atus +wall +chicken +soul +2mm +##times +fa +##ather +##cord +009 +##eep +hitachi +gui +harry +##pan +e1 +disney +##press +##ーション +wind +386 +frigidaire +##tl +liu +hsu +332 +basic +von +ev +いた +てきる +スホンサーサイト +learning +##ull +expedia +archives +change +##wei +santa +cut +ins +6gb +turbo +brand +cf1 +508 +004 +return +747 +##rip +h1 +##nis +##をこ +128gb +##にお +3t +application +しており +emc +rx +##oon +384 +quick +412 +15058 +wilson +wing +chapter +##bug +beyond +##cms +##dar +##oh +zoom +e2 +trip +sb +##nba +rcep +342 +aspx +ci +080 +gc +gnu +める +##count +advanced +dance +dv +##url +##ging +367 +8591 +am09 +shadow +battle +346 +##i +##cia +##という +emily +##のてす +##tation +host +ff +techorz +sars +##mini +##mporary +##ering +nc +4200 +798 +##next +cma +##mbps +##gas +##ift +##dot +##ィ +455 +##~17 +amana +##りの +426 +##ros +ir +00㎡1 +##eet +##ible +##↓ +710 +ˋ▽ˊ +##aka +dcs +iq +##v +l1 +##lor +maggie +##011 +##iu +588 +##~1 +830 +##gt +1tb +articles +create +##burg +##iki +database +fantasy +##rex +##cam +dlc +dean +##you +hard +path +gaming +victoria +maps +cb +##lee +##itor +overchicstoretvhome +systems +##xt +416 +p3 +sarah +760 +##nan +407 +486 +x9 +install +second +626 +##ann +##ph +##rcle +##nic +860 +##nar +ec +##とう +768 +metro +chocolate +##rian +~4 +##table +##しています +skin +##sn +395 +mountain +##0mm +inparadise +6m +7x24 +ib +4800 +##jia +eeworld +creative +g5 +g3 +357 +parker +ecfa +village +からの +18000 +sylvia +サーヒス +hbl +##ques +##onsored +##x2 +##きます +##v4 +##tein +ie6 +383 +##stack +389 +ver +##ads +##baby +sound +bbe +##110 +##lone +##uid +ads +022 +gundam +351 +thinkpad +006 +scrum +match +##ave +mems +##470 +##oy +##なりました +##talk +glass +lamigo +span +##eme +job +##a5 +jay +wade +kde +498 +##lace +ocean +tvg +##covery +##r3 +##ners +##rea +junior +think +##aine +cover +##ision +##sia +↓↓ +##bow +msi +413 +458 +406 +##love +711 +801 +soft +z2 +##pl +456 +1840 +mobil +mind +##uy +427 +nginx +##oi +めた +##rr +6221 +##mple +##sson +##ーシてす +371 +##nts +91tv +comhd +crv3000 +##uard +1868 +397 +deep +lost +field +gallery +##bia +rate +spf +redis +traction +930 +icloud +011 +なら +fe +jose +372 +##tory +into +sohu +fx +899 +379 +kicstart2 +##hia +すく +##~3 +##sit +ra +24 +##walk +##xure +500g +##pact +pacific +xa +natural +carlo +##250 +##walker +1850 +##can +cto +gigi +516 +##サー +pen +##hoo +ob +matlab +##b +##yy +13913459 +##iti +mango +##bbs +sense +c5 +oxford +##ニア +walker +jennifer +##ola +course +##bre +701 +##pus +##rder +lucky +075 +##ぁ +ivy +なお +##nia +sotheby +side +##ugh +joy +##orage +##ush +##bat +##dt +364 +r9 +##2d +##gio +511 +country +wear 
+##lax +##~7 +##moon +393 +seven +study +411 +348 +lonzo +8k +##ェ +evolution +##イフ +##kk +gs +kd +##レス +arduino +344 +b12 +##lux +arpg +##rdon +cook +##x5 +dark +five +##als +##ida +とても +sign +362 +##ちの +something +20mm +##nda +387 +##posted +fresh +tf +1870 +422 +cam +##mine +##skip +##form +##ssion +education +394 +##tee +dyson +stage +##jie +want +##night +epson +pack +あります +##ppy +テリヘル +##█ +wd +##eh +##rence +left +##lvin +golden +mhz +discovery +##trix +##n2 +loft +##uch +##dra +##sse +speed +~1 +1mdb +sorry +welcome +##urn +wave +gaga +##lmer +teddy +##160 +トラックハック +せよ +611 +##f2016 +378 +rp +##sha +rar +##あなたに +##きた +840 +holiday +##ュー +373 +074 +##vg +##nos +##rail +gartner +gi +6p +##dium +kit +488 +b3 +eco +##ろう +20g +sean +##stone +autocad +nu +##np +f16 +write +029 +m5 +##ias +images +atp +##dk +fsm +504 +1350 +ve +52kb +##xxx +##のに +##cake +414 +unit +lim +ru +1v +##ification +published +angela +16g +analytics +ak +##q +##nel +gmt +##icon +again +##₂ +##bby +ios11 +445 +かこさいます +waze +いてす +##ハ +9985 +##ust +##ティー +framework +##007 +iptv +delete +52sykb +cl +wwdc +027 +30cm +##fw +##ての +1389 +##xon +brandt +##ses +##dragon +tc +vetements +anne +monte +modern +official +##へて +##ere +##nne +##oud +もちろん +50 +etnews +##a2 +##graphy +421 +863 +##ちゃん +444 +##rtex +##てお +l2 +##gma +mount +ccd +たと +archive +morning +tan +ddos +e7 +##ホ +day4 +##ウ +gis +453 +its +495 +factory +bruce +pg +##ito +ってくたさい +guest +cdma +##lling +536 +n3 +しかし +3~4 +mega +eyes +ro +13 +women +dac +church +##jun +singapore +##facebook +6991 +starbucks +##tos +##stin +##shine +zen +##mu +tina +20℃ +1893 +##たけて +503 +465 +request +##gence +qt +##っ +1886 +347 +363 +q7 +##zzi +diary +##tore +409 +##ead +468 +cst +##osa +canada +agent +va +##jiang +##ちは +##ーク +##lam +sg +##nix +##sday +##よって +g6 +##master +bing +##zl +charlie +16 +8mm +nb40 +##ーン +thai +##ルフ +ln284ct +##itz +##2f +bonnie +##food +##lent +originals +##stro +##lts +418 +∟∣ +##bscribe +children +ntd +yesstyle +##かも +hmv +##tment +d5 +2cm +arts +sms +##pn +##я +##いい +topios9 +539 +lifestyle +virtual +##ague +xz +##deo +muji +024 +unt +##nnis +##ᅩ +faq1 +1884 +396 +##ette +fly +64㎡ +はしめまして +441 +curry +##pop +のこ +release +##← +##◆◆ +##cast +073 +ありな +500ml +##ews +5c +##stle +ios7 +##ima +787 +dog +lenovo +##r4 +roger +013 +cbs +vornado +100m +417 +##desk +##クok +##ald +1867 +9595 +2900 +##van +oil +##x +some +break +common +##jy +##lines +g7 +twice +419 +ella +nano +belle +にこ +##mes +##self +##note +jb +##ことかてきます +benz +##との +##ova +451 +save +##wing +##ますのて +kai +りは +##hua +##rect +rainer +##unge +448 +##0m +adsl +##かな +guestname +##uma +##kins +##zu +tokichoi +##price +county +##med +##mus +rmk +391 +address +vm +えて +openload +##group +##hin +##iginal +amg +urban +##oz +jobs +emi +##public +beautiful +##sch +album +##dden +##bell +jerry +works +hostel +miller +##drive +##rmin +##10 +376 +boot +828 +##370 +##fx +##cm~ +1885 +##nome +##ctionary +##oman +##lish +##cr +##hm +433 +##how +432 +francis +xi +c919 +b5 +evernote +##uc +vga +##3000 +coupe +##urg +##cca +##uality +019 +6g +れる +multi +##また +##ett +em +hey +##ani +##tax +##rma +inside +than +740 +leonnhurt +##jin +ict +れた +bird +notes +200mm +くの +##dical +##lli +result +442 +iu +ee +438 +smap +gopro +##last +yin +pure +998 +32g +けた +5kg +##dan +##rame +mama +##oot +bean +marketing +##hur +2l +bella +sync +xuite +##ground +515 +discuz +##getrelax +##ince +##bay +##5s +cj +##イス +gmat +apt +##pass +jing +##rix +c4 +rich +##とても +niusnews +##ello +bag +770 +##eting +##mobile +18 +culture +015 +##のてすか 
+377 +1020 +area +##ience +616 +details +gp +universal +silver +dit +はお +private +ddd +u11 +kanshu +##ified +fung +##nny +dx +##520 +tai +475 +023 +##fr +##lean +3s +##pin +429 +##rin +25000 +ly +rick +##bility +usb3 +banner +##baru +##gion +metal +dt +vdf +1871 +karl +qualcomm +bear +1010 +oldid +ian +jo +##tors +population +##ernel +1882 +mmorpg +##mv +##bike +603 +##© +ww +friend +##ager +exhibition +##del +##pods +fpx +structure +##free +##tings +kl +##rley +##copyright +##mma +california +3400 +orange +yoga +4l +canmake +honey +##anda +##コメント +595 +nikkie +##ルハイト +dhl +publishing +##mall +##gnet +20cm +513 +##クセス +##┅ +e88 +970 +##dog +fishbase +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##+ +##, +##- +##. +##/ +##: +##; +##< +##= +##> +##? +##@ +##[ +##\ +##] +##^ +##_ +##{ +##| +##} +##~ +##£ +##¤ +##¥ +##§ +##« +##± +##³ +##µ +##· +##¹ +##º +##» +##¼ +##ß +##æ +##÷ +##ø +##đ +##ŋ +##ɔ +##ə +##ɡ +##ʰ +##ˇ +##ˈ +##ˊ +##ˋ +##ˍ +##ː +##˙ +##˚ +##ˢ +##α +##β +##γ +##δ +##ε +##η +##θ +##ι +##κ +##λ +##μ +##ν +##ο +##π +##ρ +##ς +##σ +##τ +##υ +##φ +##χ +##ψ +##б +##в +##г +##д +##е +##ж +##з +##к +##л +##м +##н +##о +##п +##р +##с +##т +##у +##ф +##х +##ц +##ч +##ш +##ы +##ь +##і +##ا +##ب +##ة +##ت +##د +##ر +##س +##ع +##ل +##م +##ن +##ه +##و +##ي +##۩ +##ก +##ง +##น +##ม +##ย +##ร +##อ +##า +##เ +##๑ +##་ +##ღ +##ᄀ +##ᄁ +##ᄂ +##ᄃ +##ᄅ +##ᄆ +##ᄇ +##ᄈ +##ᄉ +##ᄋ +##ᄌ +##ᄎ +##ᄏ +##ᄐ +##ᄑ +##ᄒ +##ᅢ +##ᅣ +##ᅥ +##ᅦ +##ᅧ +##ᅨ +##ᅪ +##ᅬ +##ᅭ +##ᅮ +##ᅯ +##ᅲ +##ᅳ +##ᅴ +##ᆷ +##ᆸ +##ᆺ +##ᆻ +##ᗜ +##ᵃ +##ᵉ +##ᵍ +##ᵏ +##ᵐ +##ᵒ +##ᵘ +##‖ +##„ +##† +##• +##‥ +##‧ +##
 +##‰ +##′ +##″ +##‹ +##› +##※ +##‿ +##⁄ +##ⁱ +##⁺ +##ⁿ +##₁ +##₃ +##₄ +##€ +##№ +##ⅰ +##ⅱ +##ⅲ +##ⅳ +##ⅴ +##↔ +##↗ +##↘ +##⇒ +##∀ +##− +##∕ +##∙ +##√ +##∞ +##∟ +##∠ +##∣ +##∩ +##∮ +##∶ +##∼ +##∽ +##≈ +##≒ +##≡ +##≤ +##≥ +##≦ +##≧ +##≪ +##≫ +##⊙ +##⋅ +##⋈ +##⋯ +##⌒ +##① +##② +##③ +##④ +##⑤ +##⑥ +##⑦ +##⑧ +##⑨ +##⑩ +##⑴ +##⑵ +##⑶ +##⑷ +##⑸ +##⒈ +##⒉ +##⒊ +##⒋ +##ⓒ +##ⓔ +##ⓘ +##━ +##┃ +##┆ +##┊ +##┌ +##└ +##├ +##┣ +##═ +##║ +##╚ +##╞ +##╠ +##╭ +##╮ +##╯ +##╰ +##╱ +##╳ +##▂ +##▃ +##▅ +##▇ +##▉ +##▋ +##▌ +##▍ +##▎ +##□ +##▪ +##▫ +##▬ +##△ +##▶ +##► +##▽ +##◇ +##◕ +##◠ +##◢ +##◤ +##☀ +##☕ +##☞ +##☺ +##☼ +##♀ +##♂ +##♠ +##♡ +##♣ +##♦ +##♫ +##♬ +##✈ +##✔ +##✕ +##✖ +##✦ +##✨ +##✪ +##✰ +##✿ +##❀ +##➜ +##➤ +##⦿ +##、 +##。 +##〃 +##々 +##〇 +##〈 +##〉 +##《 +##》 +##「 +##」 +##『 +##』 +##【 +##】 +##〓 +##〔 +##〕 +##〖 +##〗 +##〜 +##〝 +##〞 +##ぃ +##ぇ +##ぬ +##ふ +##ほ +##む +##ゃ +##ゅ +##ゆ +##ょ +##゜ +##ゝ +##ァ +##ゥ +##エ +##ォ +##ケ +##サ +##セ +##ソ +##ッ +##ニ +##ヌ +##ネ +##ノ +##ヘ +##モ +##ャ +##ヤ +##ュ +##ユ +##ョ +##ヨ +##ワ +##ヲ +##・ +##ヽ +##ㄅ +##ㄆ +##ㄇ +##ㄉ +##ㄋ +##ㄌ +##ㄍ +##ㄎ +##ㄏ +##ㄒ +##ㄚ +##ㄛ +##ㄞ +##ㄟ +##ㄢ +##ㄤ +##ㄥ +##ㄧ +##ㄨ +##ㆍ +##㈦ +##㊣ +##㗎 +##一 +##丁 +##七 +##万 +##丈 +##三 +##上 +##下 +##不 +##与 +##丐 +##丑 +##专 +##且 +##丕 +##世 +##丘 +##丙 +##业 +##丛 +##东 +##丝 +##丞 +##丟 +##両 +##丢 +##两 +##严 +##並 +##丧 +##丨 +##个 +##丫 +##中 +##丰 +##串 +##临 +##丶 +##丸 +##丹 +##为 +##主 +##丼 +##丽 +##举 +##丿 +##乂 +##乃 +##久 +##么 +##义 +##之 +##乌 +##乍 +##乎 +##乏 +##乐 +##乒 +##乓 +##乔 +##乖 +##乗 +##乘 +##乙 +##乜 +##九 +##乞 +##也 +##习 +##乡 +##书 +##乩 +##买 +##乱 +##乳 +##乾 +##亀 +##亂 +##了 +##予 +##争 +##事 +##二 +##于 +##亏 +##云 +##互 +##五 +##井 +##亘 +##亙 +##亚 +##些 +##亜 +##亞 +##亟 +##亡 +##亢 +##交 +##亥 +##亦 +##产 +##亨 +##亩 +##享 +##京 +##亭 +##亮 +##亲 +##亳 +##亵 +##人 +##亿 +##什 +##仁 +##仃 +##仄 +##仅 +##仆 +##仇 +##今 +##介 +##仍 +##从 +##仏 +##仑 +##仓 +##仔 +##仕 +##他 +##仗 +##付 +##仙 +##仝 +##仞 +##仟 +##代 +##令 +##以 +##仨 +##仪 +##们 +##仮 +##仰 +##仲 +##件 +##价 +##任 +##份 +##仿 +##企 +##伉 +##伊 +##伍 +##伎 +##伏 +##伐 +##休 +##伕 +##众 +##优 +##伙 +##会 +##伝 +##伞 +##伟 +##传 +##伢 +##伤 +##伦 +##伪 +##伫 +##伯 +##估 +##伴 +##伶 +##伸 +##伺 +##似 +##伽 +##佃 +##但 +##佇 +##佈 +##位 +##低 +##住 +##佐 +##佑 +##体 +##佔 +##何 +##佗 +##佘 +##余 +##佚 +##佛 +##作 +##佝 +##佞 +##佟 +##你 +##佢 +##佣 +##佤 +##佥 +##佩 +##佬 +##佯 +##佰 +##佳 +##併 +##佶 +##佻 +##佼 +##使 +##侃 +##侄 +##來 +##侈 +##例 +##侍 +##侏 +##侑 +##侖 +##侗 +##供 +##依 +##侠 +##価 +##侣 +##侥 +##侦 +##侧 +##侨 +##侬 +##侮 +##侯 +##侵 +##侶 +##侷 +##便 +##係 +##促 +##俄 +##俊 +##俎 +##俏 +##俐 +##俑 +##俗 +##俘 +##俚 +##保 +##俞 +##俟 +##俠 +##信 +##俨 +##俩 +##俪 +##俬 +##俭 +##修 +##俯 +##俱 +##俳 +##俸 +##俺 +##俾 +##倆 +##倉 +##個 +##倌 +##倍 +##倏 +##們 +##倒 +##倔 +##倖 +##倘 +##候 +##倚 +##倜 +##借 +##倡 +##値 +##倦 +##倩 +##倪 +##倫 +##倬 +##倭 +##倶 +##债 +##值 +##倾 +##偃 +##假 +##偈 +##偉 +##偌 +##偎 +##偏 +##偕 +##做 +##停 +##健 +##側 +##偵 +##偶 +##偷 +##偻 +##偽 +##偿 +##傀 +##傅 +##傍 +##傑 +##傘 +##備 +##傚 +##傢 +##傣 +##傥 +##储 +##傩 +##催 +##傭 +##傲 +##傳 +##債 +##傷 +##傻 +##傾 +##僅 +##働 +##像 +##僑 +##僕 +##僖 +##僚 +##僥 +##僧 +##僭 +##僮 +##僱 +##僵 +##價 +##僻 +##儀 +##儂 +##億 +##儆 +##儉 +##儋 +##儒 +##儕 +##儘 +##償 +##儡 +##優 +##儲 +##儷 +##儼 +##儿 +##兀 +##允 +##元 +##兄 +##充 +##兆 +##兇 +##先 +##光 +##克 +##兌 +##免 +##児 +##兑 +##兒 +##兔 +##兖 +##党 +##兜 +##兢 +##入 +##內 +##全 +##兩 +##八 +##公 +##六 +##兮 +##兰 +##共 +##兲 +##关 +##兴 +##兵 +##其 +##具 +##典 +##兹 +##养 +##兼 +##兽 +##冀 +##内 +##円 +##冇 +##冈 +##冉 +##冊 +##册 +##再 +##冏 +##冒 +##冕 +##冗 +##写 +##军 +##农 +##冠 +##冢 +##冤 +##冥 +##冨 +##冪 +##冬 +##冯 +##冰 +##冲 +##决 +##况 +##冶 +##冷 +##冻 +##冼 +##冽 +##冾 +##净 +##凄 +##准 +##凇 +##凈 +##凉 +##凋 +##凌 +##凍 +##减 +##凑 +##凛 +##凜 +##凝 +##几 +##凡 +##凤 +##処 +##凪 +##凭 +##凯 +##凰 +##凱 +##凳 +##凶 +##凸 +##凹 +##出 +##击 +##函 +##凿 +##刀 +##刁 +##刃 +##分 +##切 +##刈 +##刊 +##刍 +##刎 +##刑 +##划 +##列 +##刘 
+##则 +##刚 +##创 +##初 +##删 +##判 +##別 +##刨 +##利 +##刪 +##别 +##刮 +##到 +##制 +##刷 +##券 +##刹 +##刺 +##刻 +##刽 +##剁 +##剂 +##剃 +##則 +##剉 +##削 +##剋 +##剌 +##前 +##剎 +##剐 +##剑 +##剔 +##剖 +##剛 +##剜 +##剝 +##剣 +##剤 +##剥 +##剧 +##剩 +##剪 +##副 +##割 +##創 +##剷 +##剽 +##剿 +##劃 +##劇 +##劈 +##劉 +##劊 +##劍 +##劏 +##劑 +##力 +##劝 +##办 +##功 +##加 +##务 +##劣 +##动 +##助 +##努 +##劫 +##劭 +##励 +##劲 +##劳 +##労 +##劵 +##効 +##劾 +##势 +##勁 +##勃 +##勇 +##勉 +##勋 +##勐 +##勒 +##動 +##勖 +##勘 +##務 +##勛 +##勝 +##勞 +##募 +##勢 +##勤 +##勧 +##勳 +##勵 +##勸 +##勺 +##勻 +##勾 +##勿 +##匀 +##包 +##匆 +##匈 +##匍 +##匐 +##匕 +##化 +##北 +##匙 +##匝 +##匠 +##匡 +##匣 +##匪 +##匮 +##匯 +##匱 +##匹 +##区 +##医 +##匾 +##匿 +##區 +##十 +##千 +##卅 +##升 +##午 +##卉 +##半 +##卍 +##华 +##协 +##卑 +##卒 +##卓 +##協 +##单 +##卖 +##南 +##単 +##博 +##卜 +##卞 +##卟 +##占 +##卡 +##卢 +##卤 +##卦 +##卧 +##卫 +##卮 +##卯 +##印 +##危 +##即 +##却 +##卵 +##卷 +##卸 +##卻 +##卿 +##厂 +##厄 +##厅 +##历 +##厉 +##压 +##厌 +##厕 +##厘 +##厚 +##厝 +##原 +##厢 +##厥 +##厦 +##厨 +##厩 +##厭 +##厮 +##厲 +##厳 +##去 +##县 +##叁 +##参 +##參 +##又 +##叉 +##及 +##友 +##双 +##反 +##収 +##发 +##叔 +##取 +##受 +##变 +##叙 +##叛 +##叟 +##叠 +##叡 +##叢 +##口 +##古 +##句 +##另 +##叨 +##叩 +##只 +##叫 +##召 +##叭 +##叮 +##可 +##台 +##叱 +##史 +##右 +##叵 +##叶 +##号 +##司 +##叹 +##叻 +##叼 +##叽 +##吁 +##吃 +##各 +##吆 +##合 +##吉 +##吊 +##吋 +##同 +##名 +##后 +##吏 +##吐 +##向 +##吒 +##吓 +##吕 +##吖 +##吗 +##君 +##吝 +##吞 +##吟 +##吠 +##吡 +##否 +##吧 +##吨 +##吩 +##含 +##听 +##吭 +##吮 +##启 +##吱 +##吳 +##吴 +##吵 +##吶 +##吸 +##吹 +##吻 +##吼 +##吽 +##吾 +##呀 +##呂 +##呃 +##呆 +##呈 +##告 +##呋 +##呎 +##呐 +##呓 +##呕 +##呗 +##员 +##呛 +##呜 +##呢 +##呤 +##呦 +##周 +##呱 +##呲 +##味 +##呵 +##呷 +##呸 +##呻 +##呼 +##命 +##咀 +##咁 +##咂 +##咄 +##咆 +##咋 +##和 +##咎 +##咏 +##咐 +##咒 +##咔 +##咕 +##咖 +##咗 +##咘 +##咙 +##咚 +##咛 +##咣 +##咤 +##咦 +##咧 +##咨 +##咩 +##咪 +##咫 +##咬 +##咭 +##咯 +##咱 +##咲 +##咳 +##咸 +##咻 +##咽 +##咿 +##哀 +##品 +##哂 +##哄 +##哆 +##哇 +##哈 +##哉 +##哋 +##哌 +##响 +##哎 +##哏 +##哐 +##哑 +##哒 +##哔 +##哗 +##哟 +##員 +##哥 +##哦 +##哧 +##哨 +##哩 +##哪 +##哭 +##哮 +##哲 +##哺 +##哼 +##哽 +##唁 +##唄 +##唆 +##唇 +##唉 +##唏 +##唐 +##唑 +##唔 +##唠 +##唤 +##唧 +##唬 +##售 +##唯 +##唰 +##唱 +##唳 +##唷 +##唸 +##唾 +##啃 +##啄 +##商 +##啉 +##啊 +##問 +##啓 +##啕 +##啖 +##啜 +##啞 +##啟 +##啡 +##啤 +##啥 +##啦 +##啧 +##啪 +##啫 +##啬 +##啮 +##啰 +##啱 +##啲 +##啵 +##啶 +##啷 +##啸 +##啻 +##啼 +##啾 +##喀 +##喂 +##喃 +##善 +##喆 +##喇 +##喉 +##喊 +##喋 +##喎 +##喏 +##喔 +##喘 +##喙 +##喚 +##喜 +##喝 +##喟 +##喧 +##喪 +##喫 +##喬 +##單 +##喰 +##喱 +##喲 +##喳 +##喵 +##営 +##喷 +##喹 +##喺 +##喻 +##喽 +##嗅 +##嗆 +##嗇 +##嗎 +##嗑 +##嗒 +##嗓 +##嗔 +##嗖 +##嗚 +##嗜 +##嗝 +##嗟 +##嗡 +##嗣 +##嗤 +##嗦 +##嗨 +##嗪 +##嗬 +##嗯 +##嗰 +##嗲 +##嗳 +##嗶 +##嗷 +##嗽 +##嘀 +##嘅 +##嘆 +##嘈 +##嘉 +##嘌 +##嘍 +##嘎 +##嘔 +##嘖 +##嘗 +##嘘 +##嘚 +##嘛 +##嘜 +##嘞 +##嘟 +##嘢 +##嘣 +##嘤 +##嘧 +##嘩 +##嘭 +##嘮 +##嘯 +##嘰 +##嘱 +##嘲 +##嘴 +##嘶 +##嘸 +##嘹 +##嘻 +##嘿 +##噁 +##噌 +##噎 +##噓 +##噔 +##噗 +##噙 +##噜 +##噠 +##噢 +##噤 +##器 +##噩 +##噪 +##噬 +##噱 +##噴 +##噶 +##噸 +##噹 +##噻 +##噼 +##嚀 +##嚇 +##嚎 +##嚏 +##嚐 +##嚓 +##嚕 +##嚟 +##嚣 +##嚥 +##嚨 +##嚮 +##嚴 +##嚷 +##嚼 +##囂 +##囉 +##囊 +##囍 +##囑 +##囔 +##囗 +##囚 +##四 +##囝 +##回 +##囟 +##因 +##囡 +##团 +##団 +##囤 +##囧 +##囪 +##囫 +##园 +##困 +##囱 +##囲 +##図 +##围 +##囹 +##固 +##国 +##图 +##囿 +##圃 +##圄 +##圆 +##圈 +##國 +##圍 +##圏 +##園 +##圓 +##圖 +##團 +##圜 +##土 +##圣 +##圧 +##在 +##圩 +##圭 +##地 +##圳 +##场 +##圻 +##圾 +##址 +##坂 +##均 +##坊 +##坍 +##坎 +##坏 +##坐 +##坑 +##块 +##坚 +##坛 +##坝 +##坞 +##坟 +##坠 +##坡 +##坤 +##坦 +##坨 +##坪 +##坯 +##坳 +##坵 +##坷 +##垂 +##垃 +##垄 +##型 +##垒 +##垚 +##垛 +##垠 +##垢 +##垣 +##垦 +##垩 +##垫 +##垭 +##垮 +##垵 +##埂 +##埃 +##埋 +##城 +##埔 +##埕 +##埗 +##域 +##埠 +##埤 +##埵 +##執 +##埸 +##培 +##基 +##埼 +##堀 +##堂 +##堃 +##堅 +##堆 +##堇 +##堑 +##堕 +##堙 +##堡 +##堤 +##堪 +##堯 +##堰 +##報 +##場 +##堵 +##堺 +##堿 +##塊 +##塌 +##塑 +##塔 +##塗 +##塘 +##塚 +##塞 +##塢 +##塩 +##填 +##塬 +##塭 +##塵 +##塾 +##墀 +##境 +##墅 +##墉 +##墊 +##墒 
+##墓 +##増 +##墘 +##墙 +##墜 +##增 +##墟 +##墨 +##墩 +##墮 +##墳 +##墻 +##墾 +##壁 +##壅 +##壆 +##壇 +##壊 +##壑 +##壓 +##壕 +##壘 +##壞 +##壟 +##壢 +##壤 +##壩 +##士 +##壬 +##壮 +##壯 +##声 +##売 +##壳 +##壶 +##壹 +##壺 +##壽 +##处 +##备 +##変 +##复 +##夏 +##夔 +##夕 +##外 +##夙 +##多 +##夜 +##够 +##夠 +##夢 +##夥 +##大 +##天 +##太 +##夫 +##夭 +##央 +##夯 +##失 +##头 +##夷 +##夸 +##夹 +##夺 +##夾 +##奂 +##奄 +##奇 +##奈 +##奉 +##奋 +##奎 +##奏 +##奐 +##契 +##奔 +##奕 +##奖 +##套 +##奘 +##奚 +##奠 +##奢 +##奥 +##奧 +##奪 +##奬 +##奮 +##女 +##奴 +##奶 +##奸 +##她 +##好 +##如 +##妃 +##妄 +##妆 +##妇 +##妈 +##妊 +##妍 +##妒 +##妓 +##妖 +##妘 +##妙 +##妝 +##妞 +##妣 +##妤 +##妥 +##妨 +##妩 +##妪 +##妮 +##妲 +##妳 +##妹 +##妻 +##妾 +##姆 +##姉 +##姊 +##始 +##姍 +##姐 +##姑 +##姒 +##姓 +##委 +##姗 +##姚 +##姜 +##姝 +##姣 +##姥 +##姦 +##姨 +##姪 +##姫 +##姬 +##姹 +##姻 +##姿 +##威 +##娃 +##娄 +##娅 +##娆 +##娇 +##娉 +##娑 +##娓 +##娘 +##娛 +##娜 +##娟 +##娠 +##娣 +##娥 +##娩 +##娱 +##娲 +##娴 +##娶 +##娼 +##婀 +##婁 +##婆 +##婉 +##婊 +##婕 +##婚 +##婢 +##婦 +##婧 +##婪 +##婭 +##婴 +##婵 +##婶 +##婷 +##婺 +##婿 +##媒 +##媚 +##媛 +##媞 +##媧 +##媲 +##媳 +##媽 +##媾 +##嫁 +##嫂 +##嫉 +##嫌 +##嫑 +##嫔 +##嫖 +##嫘 +##嫚 +##嫡 +##嫣 +##嫦 +##嫩 +##嫲 +##嫵 +##嫻 +##嬅 +##嬉 +##嬌 +##嬗 +##嬛 +##嬢 +##嬤 +##嬪 +##嬰 +##嬴 +##嬷 +##嬸 +##嬿 +##孀 +##孃 +##子 +##孑 +##孔 +##孕 +##孖 +##字 +##存 +##孙 +##孚 +##孛 +##孜 +##孝 +##孟 +##孢 +##季 +##孤 +##学 +##孩 +##孪 +##孫 +##孬 +##孰 +##孱 +##孳 +##孵 +##學 +##孺 +##孽 +##孿 +##宁 +##它 +##宅 +##宇 +##守 +##安 +##宋 +##完 +##宏 +##宓 +##宕 +##宗 +##官 +##宙 +##定 +##宛 +##宜 +##宝 +##实 +##実 +##宠 +##审 +##客 +##宣 +##室 +##宥 +##宦 +##宪 +##宫 +##宮 +##宰 +##害 +##宴 +##宵 +##家 +##宸 +##容 +##宽 +##宾 +##宿 +##寂 +##寄 +##寅 +##密 +##寇 +##富 +##寐 +##寒 +##寓 +##寛 +##寝 +##寞 +##察 +##寡 +##寢 +##寥 +##實 +##寧 +##寨 +##審 +##寫 +##寬 +##寮 +##寰 +##寵 +##寶 +##寸 +##对 +##寺 +##寻 +##导 +##対 +##寿 +##封 +##専 +##射 +##将 +##將 +##專 +##尉 +##尊 +##尋 +##對 +##導 +##小 +##少 +##尔 +##尕 +##尖 +##尘 +##尚 +##尝 +##尤 +##尧 +##尬 +##就 +##尴 +##尷 +##尸 +##尹 +##尺 +##尻 +##尼 +##尽 +##尾 +##尿 +##局 +##屁 +##层 +##屄 +##居 +##屆 +##屈 +##屉 +##届 +##屋 +##屌 +##屍 +##屎 +##屏 +##屐 +##屑 +##展 +##屜 +##属 +##屠 +##屡 +##屢 +##層 +##履 +##屬 +##屯 +##山 +##屹 +##屿 +##岀 +##岁 +##岂 +##岌 +##岐 +##岑 +##岔 +##岖 +##岗 +##岘 +##岙 +##岚 +##岛 +##岡 +##岩 +##岫 +##岬 +##岭 +##岱 +##岳 +##岷 +##岸 +##峇 +##峋 +##峒 +##峙 +##峡 +##峤 +##峥 +##峦 +##峨 +##峪 +##峭 +##峯 +##峰 +##峴 +##島 +##峻 +##峽 +##崁 +##崂 +##崆 +##崇 +##崎 +##崑 +##崔 +##崖 +##崗 +##崙 +##崛 +##崧 +##崩 +##崭 +##崴 +##崽 +##嵇 +##嵊 +##嵋 +##嵌 +##嵐 +##嵘 +##嵩 +##嵬 +##嵯 +##嶂 +##嶄 +##嶇 +##嶋 +##嶙 +##嶺 +##嶼 +##嶽 +##巅 +##巍 +##巒 +##巔 +##巖 +##川 +##州 +##巡 +##巢 +##工 +##左 +##巧 +##巨 +##巩 +##巫 +##差 +##己 +##已 +##巳 +##巴 +##巷 +##巻 +##巽 +##巾 +##巿 +##币 +##市 +##布 +##帅 +##帆 +##师 +##希 +##帐 +##帑 +##帕 +##帖 +##帘 +##帚 +##帛 +##帜 +##帝 +##帥 +##带 +##帧 +##師 +##席 +##帮 +##帯 +##帰 +##帳 +##帶 +##帷 +##常 +##帼 +##帽 +##幀 +##幂 +##幄 +##幅 +##幌 +##幔 +##幕 +##幟 +##幡 +##幢 +##幣 +##幫 +##干 +##平 +##年 +##并 +##幸 +##幹 +##幺 +##幻 +##幼 +##幽 +##幾 +##广 +##庁 +##広 +##庄 +##庆 +##庇 +##床 +##序 +##庐 +##库 +##应 +##底 +##庖 +##店 +##庙 +##庚 +##府 +##庞 +##废 +##庠 +##度 +##座 +##庫 +##庭 +##庵 +##庶 +##康 +##庸 +##庹 +##庾 +##廁 +##廂 +##廃 +##廈 +##廉 +##廊 +##廓 +##廖 +##廚 +##廝 +##廟 +##廠 +##廢 +##廣 +##廬 +##廳 +##延 +##廷 +##建 +##廿 +##开 +##弁 +##异 +##弃 +##弄 +##弈 +##弊 +##弋 +##式 +##弑 +##弒 +##弓 +##弔 +##引 +##弗 +##弘 +##弛 +##弟 +##张 +##弥 +##弦 +##弧 +##弩 +##弭 +##弯 +##弱 +##張 +##強 +##弹 +##强 +##弼 +##弾 +##彅 +##彆 +##彈 +##彌 +##彎 +##归 +##当 +##录 +##彗 +##彙 +##彝 +##形 +##彤 +##彥 +##彦 +##彧 +##彩 +##彪 +##彫 +##彬 +##彭 +##彰 +##影 +##彷 +##役 +##彻 +##彼 +##彿 +##往 +##征 +##径 +##待 +##徇 +##很 +##徉 +##徊 +##律 +##後 +##徐 +##徑 +##徒 +##従 +##徕 +##得 +##徘 +##徙 +##徜 +##從 +##徠 +##御 +##徨 +##復 +##循 +##徬 +##微 +##徳 +##徴 +##徵 +##德 +##徹 +##徼 +##徽 +##心 +##必 +##忆 +##忌 +##忍 +##忏 +##忐 +##忑 +##忒 +##忖 +##志 +##忘 +##忙 +##応 +##忠 +##忡 +##忤 +##忧 +##忪 +##快 +##忱 +##念 +##忻 +##忽 +##忿 +##怀 
+##态 +##怂 +##怅 +##怆 +##怎 +##怏 +##怒 +##怔 +##怕 +##怖 +##怙 +##怜 +##思 +##怠 +##怡 +##急 +##怦 +##性 +##怨 +##怪 +##怯 +##怵 +##总 +##怼 +##恁 +##恃 +##恆 +##恋 +##恍 +##恐 +##恒 +##恕 +##恙 +##恚 +##恢 +##恣 +##恤 +##恥 +##恨 +##恩 +##恪 +##恫 +##恬 +##恭 +##息 +##恰 +##恳 +##恵 +##恶 +##恸 +##恺 +##恻 +##恼 +##恿 +##悄 +##悅 +##悉 +##悌 +##悍 +##悔 +##悖 +##悚 +##悟 +##悠 +##患 +##悦 +##您 +##悩 +##悪 +##悬 +##悯 +##悱 +##悲 +##悴 +##悵 +##悶 +##悸 +##悻 +##悼 +##悽 +##情 +##惆 +##惇 +##惊 +##惋 +##惑 +##惕 +##惘 +##惚 +##惜 +##惟 +##惠 +##惡 +##惦 +##惧 +##惨 +##惩 +##惫 +##惬 +##惭 +##惮 +##惯 +##惰 +##惱 +##想 +##惴 +##惶 +##惹 +##惺 +##愁 +##愆 +##愈 +##愉 +##愍 +##意 +##愕 +##愚 +##愛 +##愜 +##感 +##愣 +##愤 +##愧 +##愫 +##愷 +##愿 +##慄 +##慈 +##態 +##慌 +##慎 +##慑 +##慕 +##慘 +##慚 +##慟 +##慢 +##慣 +##慧 +##慨 +##慫 +##慮 +##慰 +##慳 +##慵 +##慶 +##慷 +##慾 +##憂 +##憊 +##憋 +##憎 +##憐 +##憑 +##憔 +##憚 +##憤 +##憧 +##憨 +##憩 +##憫 +##憬 +##憲 +##憶 +##憾 +##懂 +##懇 +##懈 +##應 +##懊 +##懋 +##懑 +##懒 +##懦 +##懲 +##懵 +##懶 +##懷 +##懸 +##懺 +##懼 +##懾 +##懿 +##戀 +##戈 +##戊 +##戌 +##戍 +##戎 +##戏 +##成 +##我 +##戒 +##戕 +##或 +##战 +##戚 +##戛 +##戟 +##戡 +##戦 +##截 +##戬 +##戮 +##戰 +##戲 +##戳 +##戴 +##戶 +##户 +##戸 +##戻 +##戾 +##房 +##所 +##扁 +##扇 +##扈 +##扉 +##手 +##才 +##扎 +##扑 +##扒 +##打 +##扔 +##払 +##托 +##扛 +##扣 +##扦 +##执 +##扩 +##扪 +##扫 +##扬 +##扭 +##扮 +##扯 +##扰 +##扱 +##扳 +##扶 +##批 +##扼 +##找 +##承 +##技 +##抄 +##抉 +##把 +##抑 +##抒 +##抓 +##投 +##抖 +##抗 +##折 +##抚 +##抛 +##抜 +##択 +##抟 +##抠 +##抡 +##抢 +##护 +##报 +##抨 +##披 +##抬 +##抱 +##抵 +##抹 +##押 +##抽 +##抿 +##拂 +##拄 +##担 +##拆 +##拇 +##拈 +##拉 +##拋 +##拌 +##拍 +##拎 +##拐 +##拒 +##拓 +##拔 +##拖 +##拗 +##拘 +##拙 +##拚 +##招 +##拜 +##拟 +##拡 +##拢 +##拣 +##拥 +##拦 +##拧 +##拨 +##择 +##括 +##拭 +##拮 +##拯 +##拱 +##拳 +##拴 +##拷 +##拼 +##拽 +##拾 +##拿 +##持 +##挂 +##指 +##挈 +##按 +##挎 +##挑 +##挖 +##挙 +##挚 +##挛 +##挝 +##挞 +##挟 +##挠 +##挡 +##挣 +##挤 +##挥 +##挨 +##挪 +##挫 +##振 +##挲 +##挹 +##挺 +##挽 +##挾 +##捂 +##捅 +##捆 +##捉 +##捋 +##捌 +##捍 +##捎 +##捏 +##捐 +##捕 +##捞 +##损 +##捡 +##换 +##捣 +##捧 +##捨 +##捩 +##据 +##捱 +##捲 +##捶 +##捷 +##捺 +##捻 +##掀 +##掂 +##掃 +##掇 +##授 +##掉 +##掌 +##掏 +##掐 +##排 +##掖 +##掘 +##掙 +##掛 +##掠 +##採 +##探 +##掣 +##接 +##控 +##推 +##掩 +##措 +##掬 +##掰 +##掲 +##掳 +##掴 +##掷 +##掸 +##掺 +##揀 +##揃 +##揄 +##揆 +##揉 +##揍 +##描 +##提 +##插 +##揖 +##揚 +##換 +##握 +##揣 +##揩 +##揪 +##揭 +##揮 +##援 +##揶 +##揸 +##揹 +##揽 +##搀 +##搁 +##搂 +##搅 +##損 +##搏 +##搐 +##搓 +##搔 +##搖 +##搗 +##搜 +##搞 +##搡 +##搪 +##搬 +##搭 +##搵 +##搶 +##携 +##搽 +##摀 +##摁 +##摄 +##摆 +##摇 +##摈 +##摊 +##摒 +##摔 +##摘 +##摞 +##摟 +##摧 +##摩 +##摯 +##摳 +##摸 +##摹 +##摺 +##摻 +##撂 +##撃 +##撅 +##撇 +##撈 +##撐 +##撑 +##撒 +##撓 +##撕 +##撚 +##撞 +##撤 +##撥 +##撩 +##撫 +##撬 +##播 +##撮 +##撰 +##撲 +##撵 +##撷 +##撸 +##撻 +##撼 +##撿 +##擀 +##擁 +##擂 +##擄 +##擅 +##擇 +##擊 +##擋 +##操 +##擎 +##擒 +##擔 +##擘 +##據 +##擞 +##擠 +##擡 +##擢 +##擦 +##擬 +##擰 +##擱 +##擲 +##擴 +##擷 +##擺 +##擼 +##擾 +##攀 +##攏 +##攒 +##攔 +##攘 +##攙 +##攜 +##攝 +##攞 +##攢 +##攣 +##攤 +##攥 +##攪 +##攫 +##攬 +##支 +##收 +##攸 +##改 +##攻 +##放 +##政 +##故 +##效 +##敌 +##敍 +##敎 +##敏 +##救 +##敕 +##敖 +##敗 +##敘 +##教 +##敛 +##敝 +##敞 +##敢 +##散 +##敦 +##敬 +##数 +##敲 +##整 +##敵 +##敷 +##數 +##斂 +##斃 +##文 +##斋 +##斌 +##斎 +##斐 +##斑 +##斓 +##斗 +##料 +##斛 +##斜 +##斟 +##斡 +##斤 +##斥 +##斧 +##斩 +##斫 +##斬 +##断 +##斯 +##新 +##斷 +##方 +##於 +##施 +##旁 +##旃 +##旅 +##旋 +##旌 +##旎 +##族 +##旖 +##旗 +##无 +##既 +##日 +##旦 +##旧 +##旨 +##早 +##旬 +##旭 +##旮 +##旱 +##时 +##旷 +##旺 +##旻 +##昀 +##昂 +##昆 +##昇 +##昉 +##昊 +##昌 +##明 +##昏 +##易 +##昔 +##昕 +##昙 +##星 +##映 +##春 +##昧 +##昨 +##昭 +##是 +##昱 +##昴 +##昵 +##昶 +##昼 +##显 +##晁 +##時 +##晃 +##晉 +##晋 +##晌 +##晏 +##晒 +##晓 +##晔 +##晕 +##晖 +##晗 +##晚 +##晝 +##晞 +##晟 +##晤 +##晦 +##晨 +##晩 +##普 +##景 +##晰 +##晴 +##晶 +##晷 +##智 +##晾 +##暂 +##暄 +##暇 +##暈 +##暉 +##暌 +##暐 +##暑 +##暖 +##暗 +##暝 +##暢 +##暧 +##暨 +##暫 +##暮 +##暱 +##暴 +##暸 +##暹 +##曄 +##曆 +##曇 +##曉 +##曖 +##曙 +##曜 +##曝 +##曠 +##曦 +##曬 +##曰 
+##曲 +##曳 +##更 +##書 +##曹 +##曼 +##曾 +##替 +##最 +##會 +##月 +##有 +##朋 +##服 +##朐 +##朔 +##朕 +##朗 +##望 +##朝 +##期 +##朦 +##朧 +##木 +##未 +##末 +##本 +##札 +##朮 +##术 +##朱 +##朴 +##朵 +##机 +##朽 +##杀 +##杂 +##权 +##杆 +##杈 +##杉 +##李 +##杏 +##材 +##村 +##杓 +##杖 +##杜 +##杞 +##束 +##杠 +##条 +##来 +##杨 +##杭 +##杯 +##杰 +##東 +##杳 +##杵 +##杷 +##杼 +##松 +##板 +##极 +##构 +##枇 +##枉 +##枋 +##析 +##枕 +##林 +##枚 +##果 +##枝 +##枢 +##枣 +##枪 +##枫 +##枭 +##枯 +##枰 +##枱 +##枳 +##架 +##枷 +##枸 +##柄 +##柏 +##某 +##柑 +##柒 +##染 +##柔 +##柘 +##柚 +##柜 +##柞 +##柠 +##柢 +##查 +##柩 +##柬 +##柯 +##柱 +##柳 +##柴 +##柵 +##査 +##柿 +##栀 +##栃 +##栄 +##栅 +##标 +##栈 +##栉 +##栋 +##栎 +##栏 +##树 +##栓 +##栖 +##栗 +##校 +##栩 +##株 +##样 +##核 +##根 +##格 +##栽 +##栾 +##桀 +##桁 +##桂 +##桃 +##桅 +##框 +##案 +##桉 +##桌 +##桎 +##桐 +##桑 +##桓 +##桔 +##桜 +##桠 +##桡 +##桢 +##档 +##桥 +##桦 +##桧 +##桨 +##桩 +##桶 +##桿 +##梁 +##梅 +##梆 +##梏 +##梓 +##梗 +##條 +##梟 +##梢 +##梦 +##梧 +##梨 +##梭 +##梯 +##械 +##梳 +##梵 +##梶 +##检 +##棂 +##棄 +##棉 +##棋 +##棍 +##棒 +##棕 +##棗 +##棘 +##棚 +##棟 +##棠 +##棣 +##棧 +##森 +##棱 +##棲 +##棵 +##棹 +##棺 +##椁 +##椅 +##椋 +##植 +##椎 +##椒 +##検 +##椪 +##椭 +##椰 +##椹 +##椽 +##椿 +##楂 +##楊 +##楓 +##楔 +##楚 +##楝 +##楞 +##楠 +##楣 +##楨 +##楫 +##業 +##楮 +##極 +##楷 +##楸 +##楹 +##楼 +##楽 +##概 +##榄 +##榆 +##榈 +##榉 +##榔 +##榕 +##榖 +##榛 +##榜 +##榨 +##榫 +##榭 +##榮 +##榱 +##榴 +##榷 +##榻 +##槁 +##槃 +##構 +##槌 +##槍 +##槎 +##槐 +##槓 +##様 +##槛 +##槟 +##槤 +##槭 +##槲 +##槳 +##槻 +##槽 +##槿 +##樁 +##樂 +##樊 +##樑 +##樓 +##標 +##樞 +##樟 +##模 +##樣 +##権 +##横 +##樫 +##樯 +##樱 +##樵 +##樸 +##樹 +##樺 +##樽 +##樾 +##橄 +##橇 +##橋 +##橐 +##橘 +##橙 +##機 +##橡 +##橢 +##橫 +##橱 +##橹 +##橼 +##檀 +##檄 +##檎 +##檐 +##檔 +##檗 +##檜 +##檢 +##檬 +##檯 +##檳 +##檸 +##檻 +##櫃 +##櫚 +##櫛 +##櫥 +##櫸 +##櫻 +##欄 +##權 +##欒 +##欖 +##欠 +##次 +##欢 +##欣 +##欧 +##欲 +##欸 +##欺 +##欽 +##款 +##歆 +##歇 +##歉 +##歌 +##歎 +##歐 +##歓 +##歙 +##歛 +##歡 +##止 +##正 +##此 +##步 +##武 +##歧 +##歩 +##歪 +##歯 +##歲 +##歳 +##歴 +##歷 +##歸 +##歹 +##死 +##歼 +##殁 +##殃 +##殆 +##殇 +##殉 +##殊 +##残 +##殒 +##殓 +##殖 +##殘 +##殞 +##殡 +##殤 +##殭 +##殯 +##殲 +##殴 +##段 +##殷 +##殺 +##殼 +##殿 +##毀 +##毁 +##毂 +##毅 +##毆 +##毋 +##母 +##毎 +##每 +##毒 +##毓 +##比 +##毕 +##毗 +##毘 +##毙 +##毛 +##毡 +##毫 +##毯 +##毽 +##氈 +##氏 +##氐 +##民 +##氓 +##气 +##氖 +##気 +##氙 +##氛 +##氟 +##氡 +##氢 +##氣 +##氤 +##氦 +##氧 +##氨 +##氪 +##氫 +##氮 +##氯 +##氰 +##氲 +##水 +##氷 +##永 +##氹 +##氾 +##汀 +##汁 +##求 +##汆 +##汇 +##汉 +##汎 +##汐 +##汕 +##汗 +##汙 +##汛 +##汝 +##汞 +##江 +##池 +##污 +##汤 +##汨 +##汩 +##汪 +##汰 +##汲 +##汴 +##汶 +##汹 +##決 +##汽 +##汾 +##沁 +##沂 +##沃 +##沅 +##沈 +##沉 +##沌 +##沏 +##沐 +##沒 +##沓 +##沖 +##沙 +##沛 +##沟 +##没 +##沢 +##沣 +##沥 +##沦 +##沧 +##沪 +##沫 +##沭 +##沮 +##沱 +##河 +##沸 +##油 +##治 +##沼 +##沽 +##沾 +##沿 +##況 +##泄 +##泉 +##泊 +##泌 +##泓 +##法 +##泗 +##泛 +##泞 +##泠 +##泡 +##波 +##泣 +##泥 +##注 +##泪 +##泫 +##泮 +##泯 +##泰 +##泱 +##泳 +##泵 +##泷 +##泸 +##泻 +##泼 +##泽 +##泾 +##洁 +##洄 +##洋 +##洒 +##洗 +##洙 +##洛 +##洞 +##津 +##洩 +##洪 +##洮 +##洱 +##洲 +##洵 +##洶 +##洸 +##洹 +##活 +##洼 +##洽 +##派 +##流 +##浃 +##浄 +##浅 +##浆 +##浇 +##浊 +##测 +##济 +##浏 +##浑 +##浒 +##浓 +##浔 +##浙 +##浚 +##浜 +##浣 +##浦 +##浩 +##浪 +##浬 +##浮 +##浯 +##浴 +##海 +##浸 +##涂 +##涅 +##涇 +##消 +##涉 +##涌 +##涎 +##涓 +##涔 +##涕 +##涙 +##涛 +##涝 +##涞 +##涟 +##涠 +##涡 +##涣 +##涤 +##润 +##涧 +##涨 +##涩 +##涪 +##涮 +##涯 +##液 +##涵 +##涸 +##涼 +##涿 +##淀 +##淄 +##淅 +##淆 +##淇 +##淋 +##淌 +##淑 +##淒 +##淖 +##淘 +##淙 +##淚 +##淞 +##淡 +##淤 +##淦 +##淨 +##淩 +##淪 +##淫 +##淬 +##淮 +##深 +##淳 +##淵 +##混 +##淹 +##淺 +##添 +##淼 +##清 +##済 +##渉 +##渊 +##渋 +##渍 +##渎 +##渐 +##渔 +##渗 +##渙 +##渚 +##減 +##渝 +##渠 +##渡 +##渣 +##渤 +##渥 +##渦 +##温 +##測 +##渭 +##港 +##渲 +##渴 +##游 +##渺 +##渾 +##湃 +##湄 +##湊 +##湍 +##湖 +##湘 +##湛 +##湟 +##湧 +##湫 +##湮 +##湯 +##湳 +##湾 +##湿 +##満 +##溃 +##溅 +##溉 +##溏 +##源 +##準 +##溜 +##溝 +##溟 +##溢 +##溥 +##溧 +##溪 +##溫 +##溯 +##溱 +##溴 +##溶 +##溺 +##溼 +##滁 +##滂 +##滄 +##滅 +##滇 +##滋 +##滌 +##滑 +##滓 
+##滔 +##滕 +##滙 +##滚 +##滝 +##滞 +##滟 +##满 +##滢 +##滤 +##滥 +##滦 +##滨 +##滩 +##滬 +##滯 +##滲 +##滴 +##滷 +##滸 +##滾 +##滿 +##漁 +##漂 +##漆 +##漉 +##漏 +##漓 +##演 +##漕 +##漠 +##漢 +##漣 +##漩 +##漪 +##漫 +##漬 +##漯 +##漱 +##漲 +##漳 +##漸 +##漾 +##漿 +##潆 +##潇 +##潋 +##潍 +##潑 +##潔 +##潘 +##潛 +##潜 +##潞 +##潟 +##潢 +##潤 +##潦 +##潧 +##潭 +##潮 +##潰 +##潴 +##潸 +##潺 +##潼 +##澀 +##澄 +##澆 +##澈 +##澍 +##澎 +##澗 +##澜 +##澡 +##澤 +##澧 +##澱 +##澳 +##澹 +##激 +##濁 +##濂 +##濃 +##濑 +##濒 +##濕 +##濘 +##濛 +##濟 +##濠 +##濡 +##濤 +##濫 +##濬 +##濮 +##濯 +##濱 +##濺 +##濾 +##瀅 +##瀆 +##瀉 +##瀋 +##瀏 +##瀑 +##瀕 +##瀘 +##瀚 +##瀛 +##瀝 +##瀞 +##瀟 +##瀧 +##瀨 +##瀬 +##瀰 +##瀾 +##灌 +##灏 +##灑 +##灘 +##灝 +##灞 +##灣 +##火 +##灬 +##灭 +##灯 +##灰 +##灵 +##灶 +##灸 +##灼 +##災 +##灾 +##灿 +##炀 +##炁 +##炅 +##炉 +##炊 +##炎 +##炒 +##炔 +##炕 +##炖 +##炙 +##炜 +##炫 +##炬 +##炭 +##炮 +##炯 +##炳 +##炷 +##炸 +##点 +##為 +##炼 +##炽 +##烁 +##烂 +##烃 +##烈 +##烊 +##烏 +##烘 +##烙 +##烛 +##烟 +##烤 +##烦 +##烧 +##烨 +##烩 +##烫 +##烬 +##热 +##烯 +##烷 +##烹 +##烽 +##焉 +##焊 +##焕 +##焖 +##焗 +##焘 +##焙 +##焚 +##焜 +##無 +##焦 +##焯 +##焰 +##焱 +##然 +##焼 +##煅 +##煉 +##煊 +##煌 +##煎 +##煒 +##煖 +##煙 +##煜 +##煞 +##煤 +##煥 +##煦 +##照 +##煨 +##煩 +##煮 +##煲 +##煸 +##煽 +##熄 +##熊 +##熏 +##熒 +##熔 +##熙 +##熟 +##熠 +##熨 +##熬 +##熱 +##熵 +##熹 +##熾 +##燁 +##燃 +##燄 +##燈 +##燉 +##燊 +##燎 +##燒 +##燔 +##燕 +##燙 +##燜 +##營 +##燥 +##燦 +##燧 +##燭 +##燮 +##燴 +##燻 +##燼 +##燿 +##爆 +##爍 +##爐 +##爛 +##爪 +##爬 +##爭 +##爰 +##爱 +##爲 +##爵 +##父 +##爷 +##爸 +##爹 +##爺 +##爻 +##爽 +##爾 +##牆 +##片 +##版 +##牌 +##牍 +##牒 +##牙 +##牛 +##牝 +##牟 +##牠 +##牡 +##牢 +##牦 +##牧 +##物 +##牯 +##牲 +##牴 +##牵 +##特 +##牺 +##牽 +##犀 +##犁 +##犄 +##犊 +##犍 +##犒 +##犢 +##犧 +##犬 +##犯 +##状 +##犷 +##犸 +##犹 +##狀 +##狂 +##狄 +##狈 +##狎 +##狐 +##狒 +##狗 +##狙 +##狞 +##狠 +##狡 +##狩 +##独 +##狭 +##狮 +##狰 +##狱 +##狸 +##狹 +##狼 +##狽 +##猎 +##猕 +##猖 +##猗 +##猙 +##猛 +##猜 +##猝 +##猥 +##猩 +##猪 +##猫 +##猬 +##献 +##猴 +##猶 +##猷 +##猾 +##猿 +##獄 +##獅 +##獎 +##獐 +##獒 +##獗 +##獠 +##獣 +##獨 +##獭 +##獰 +##獲 +##獵 +##獷 +##獸 +##獺 +##獻 +##獼 +##獾 +##玄 +##率 +##玉 +##王 +##玑 +##玖 +##玛 +##玟 +##玠 +##玥 +##玩 +##玫 +##玮 +##环 +##现 +##玲 +##玳 +##玷 +##玺 +##玻 +##珀 +##珂 +##珅 +##珈 +##珉 +##珊 +##珍 +##珏 +##珐 +##珑 +##珙 +##珞 +##珠 +##珣 +##珥 +##珩 +##珪 +##班 +##珮 +##珲 +##珺 +##現 +##球 +##琅 +##理 +##琇 +##琉 +##琊 +##琍 +##琏 +##琐 +##琛 +##琢 +##琥 +##琦 +##琨 +##琪 +##琬 +##琮 +##琰 +##琲 +##琳 +##琴 +##琵 +##琶 +##琺 +##琼 +##瑀 +##瑁 +##瑄 +##瑋 +##瑕 +##瑗 +##瑙 +##瑚 +##瑛 +##瑜 +##瑞 +##瑟 +##瑠 +##瑣 +##瑤 +##瑩 +##瑪 +##瑯 +##瑰 +##瑶 +##瑾 +##璀 +##璁 +##璃 +##璇 +##璉 +##璋 +##璎 +##璐 +##璜 +##璞 +##璟 +##璧 +##璨 +##環 +##璽 +##璿 +##瓊 +##瓏 +##瓒 +##瓜 +##瓢 +##瓣 +##瓤 +##瓦 +##瓮 +##瓯 +##瓴 +##瓶 +##瓷 +##甄 +##甌 +##甕 +##甘 +##甙 +##甚 +##甜 +##生 +##產 +##産 +##甥 +##甦 +##用 +##甩 +##甫 +##甬 +##甭 +##甯 +##田 +##由 +##甲 +##申 +##电 +##男 +##甸 +##町 +##画 +##甾 +##畀 +##畅 +##界 +##畏 +##畑 +##畔 +##留 +##畜 +##畝 +##畢 +##略 +##畦 +##番 +##畫 +##異 +##畲 +##畳 +##畴 +##當 +##畸 +##畹 +##畿 +##疆 +##疇 +##疊 +##疏 +##疑 +##疔 +##疖 +##疗 +##疙 +##疚 +##疝 +##疟 +##疡 +##疣 +##疤 +##疥 +##疫 +##疮 +##疯 +##疱 +##疲 +##疳 +##疵 +##疸 +##疹 +##疼 +##疽 +##疾 +##痂 +##病 +##症 +##痈 +##痉 +##痊 +##痍 +##痒 +##痔 +##痕 +##痘 +##痙 +##痛 +##痞 +##痠 +##痢 +##痣 +##痤 +##痧 +##痨 +##痪 +##痫 +##痰 +##痱 +##痴 +##痹 +##痺 +##痼 +##痿 +##瘀 +##瘁 +##瘋 +##瘍 +##瘓 +##瘘 +##瘙 +##瘟 +##瘠 +##瘡 +##瘢 +##瘤 +##瘦 +##瘧 +##瘩 +##瘪 +##瘫 +##瘴 +##瘸 +##瘾 +##療 +##癇 +##癌 +##癒 +##癖 +##癜 +##癞 +##癡 +##癢 +##癣 +##癥 +##癫 +##癬 +##癮 +##癱 +##癲 +##癸 +##発 +##登 +##發 +##白 +##百 +##皂 +##的 +##皆 +##皇 +##皈 +##皋 +##皎 +##皑 +##皓 +##皖 +##皙 +##皚 +##皮 +##皰 +##皱 +##皴 +##皺 +##皿 +##盂 +##盃 +##盅 +##盆 +##盈 +##益 +##盎 +##盏 +##盐 +##监 +##盒 +##盔 +##盖 +##盗 +##盘 +##盛 +##盜 +##盞 +##盟 +##盡 +##監 +##盤 +##盥 +##盧 +##盪 +##目 +##盯 +##盱 +##盲 +##直 +##相 +##盹 +##盼 +##盾 +##省 +##眈 +##眉 +##看 +##県 +##眙 +##眞 +##真 +##眠 +##眦 +##眨 +##眩 +##眯 +##眶 +##眷 +##眸 +##眺 +##眼 +##眾 +##着 +##睁 +##睇 
+##睏 +##睐 +##睑 +##睛 +##睜 +##睞 +##睡 +##睢 +##督 +##睥 +##睦 +##睨 +##睪 +##睫 +##睬 +##睹 +##睽 +##睾 +##睿 +##瞄 +##瞅 +##瞇 +##瞋 +##瞌 +##瞎 +##瞑 +##瞒 +##瞓 +##瞞 +##瞟 +##瞠 +##瞥 +##瞧 +##瞩 +##瞪 +##瞬 +##瞭 +##瞰 +##瞳 +##瞻 +##瞼 +##瞿 +##矇 +##矍 +##矗 +##矚 +##矛 +##矜 +##矢 +##矣 +##知 +##矩 +##矫 +##短 +##矮 +##矯 +##石 +##矶 +##矽 +##矾 +##矿 +##码 +##砂 +##砌 +##砍 +##砒 +##研 +##砖 +##砗 +##砚 +##砝 +##砣 +##砥 +##砧 +##砭 +##砰 +##砲 +##破 +##砷 +##砸 +##砺 +##砼 +##砾 +##础 +##硅 +##硐 +##硒 +##硕 +##硝 +##硫 +##硬 +##确 +##硯 +##硼 +##碁 +##碇 +##碉 +##碌 +##碍 +##碎 +##碑 +##碓 +##碗 +##碘 +##碚 +##碛 +##碟 +##碣 +##碧 +##碩 +##碰 +##碱 +##碳 +##碴 +##確 +##碼 +##碾 +##磁 +##磅 +##磊 +##磋 +##磐 +##磕 +##磚 +##磡 +##磨 +##磬 +##磯 +##磲 +##磷 +##磺 +##礁 +##礎 +##礙 +##礡 +##礦 +##礪 +##礫 +##礴 +##示 +##礼 +##社 +##祀 +##祁 +##祂 +##祇 +##祈 +##祉 +##祎 +##祐 +##祕 +##祖 +##祗 +##祚 +##祛 +##祜 +##祝 +##神 +##祟 +##祠 +##祢 +##祥 +##票 +##祭 +##祯 +##祷 +##祸 +##祺 +##祿 +##禀 +##禁 +##禄 +##禅 +##禍 +##禎 +##福 +##禛 +##禦 +##禧 +##禪 +##禮 +##禱 +##禹 +##禺 +##离 +##禽 +##禾 +##禿 +##秀 +##私 +##秃 +##秆 +##秉 +##秋 +##种 +##科 +##秒 +##秘 +##租 +##秣 +##秤 +##秦 +##秧 +##秩 +##秭 +##积 +##称 +##秸 +##移 +##秽 +##稀 +##稅 +##程 +##稍 +##税 +##稔 +##稗 +##稚 +##稜 +##稞 +##稟 +##稠 +##稣 +##種 +##稱 +##稲 +##稳 +##稷 +##稹 +##稻 +##稼 +##稽 +##稿 +##穀 +##穂 +##穆 +##穌 +##積 +##穎 +##穗 +##穢 +##穩 +##穫 +##穴 +##究 +##穷 +##穹 +##空 +##穿 +##突 +##窃 +##窄 +##窈 +##窍 +##窑 +##窒 +##窓 +##窕 +##窖 +##窗 +##窘 +##窜 +##窝 +##窟 +##窠 +##窥 +##窦 +##窨 +##窩 +##窪 +##窮 +##窯 +##窺 +##窿 +##竄 +##竅 +##竇 +##竊 +##立 +##竖 +##站 +##竜 +##竞 +##竟 +##章 +##竣 +##童 +##竭 +##端 +##競 +##竹 +##竺 +##竽 +##竿 +##笃 +##笆 +##笈 +##笋 +##笏 +##笑 +##笔 +##笙 +##笛 +##笞 +##笠 +##符 +##笨 +##第 +##笹 +##笺 +##笼 +##筆 +##等 +##筊 +##筋 +##筍 +##筏 +##筐 +##筑 +##筒 +##答 +##策 +##筛 +##筝 +##筠 +##筱 +##筲 +##筵 +##筷 +##筹 +##签 +##简 +##箇 +##箋 +##箍 +##箏 +##箐 +##箔 +##箕 +##算 +##箝 +##管 +##箩 +##箫 +##箭 +##箱 +##箴 +##箸 +##節 +##篁 +##範 +##篆 +##篇 +##築 +##篑 +##篓 +##篙 +##篝 +##篠 +##篡 +##篤 +##篩 +##篪 +##篮 +##篱 +##篷 +##簇 +##簌 +##簍 +##簡 +##簦 +##簧 +##簪 +##簫 +##簷 +##簸 +##簽 +##簾 +##簿 +##籁 +##籃 +##籌 +##籍 +##籐 +##籟 +##籠 +##籤 +##籬 +##籮 +##籲 +##米 +##类 +##籼 +##籽 +##粄 +##粉 +##粑 +##粒 +##粕 +##粗 +##粘 +##粟 +##粤 +##粥 +##粧 +##粪 +##粮 +##粱 +##粲 +##粳 +##粵 +##粹 +##粼 +##粽 +##精 +##粿 +##糅 +##糊 +##糍 +##糕 +##糖 +##糗 +##糙 +##糜 +##糞 +##糟 +##糠 +##糧 +##糬 +##糯 +##糰 +##糸 +##系 +##糾 +##紀 +##紂 +##約 +##紅 +##紉 +##紊 +##紋 +##納 +##紐 +##紓 +##純 +##紗 +##紘 +##紙 +##級 +##紛 +##紜 +##素 +##紡 +##索 +##紧 +##紫 +##紮 +##累 +##細 +##紳 +##紹 +##紺 +##終 +##絃 +##組 +##絆 +##経 +##結 +##絕 +##絞 +##絡 +##絢 +##給 +##絨 +##絮 +##統 +##絲 +##絳 +##絵 +##絶 +##絹 +##綁 +##綏 +##綑 +##經 +##継 +##続 +##綜 +##綠 +##綢 +##綦 +##綫 +##綬 +##維 +##綱 +##網 +##綴 +##綵 +##綸 +##綺 +##綻 +##綽 +##綾 +##綿 +##緊 +##緋 +##総 +##緑 +##緒 +##緘 +##線 +##緝 +##緞 +##締 +##緣 +##編 +##緩 +##緬 +##緯 +##練 +##緹 +##緻 +##縁 +##縄 +##縈 +##縛 +##縝 +##縣 +##縫 +##縮 +##縱 +##縴 +##縷 +##總 +##績 +##繁 +##繃 +##繆 +##繇 +##繋 +##織 +##繕 +##繚 +##繞 +##繡 +##繩 +##繪 +##繫 +##繭 +##繳 +##繹 +##繼 +##繽 +##纂 +##續 +##纍 +##纏 +##纓 +##纔 +##纖 +##纜 +##纠 +##红 +##纣 +##纤 +##约 +##级 +##纨 +##纪 +##纫 +##纬 +##纭 +##纯 +##纰 +##纱 +##纲 +##纳 +##纵 +##纶 +##纷 +##纸 +##纹 +##纺 +##纽 +##纾 +##线 +##绀 +##练 +##组 +##绅 +##细 +##织 +##终 +##绊 +##绍 +##绎 +##经 +##绑 +##绒 +##结 +##绔 +##绕 +##绘 +##给 +##绚 +##绛 +##络 +##绝 +##绞 +##统 +##绡 +##绢 +##绣 +##绥 +##绦 +##继 +##绩 +##绪 +##绫 +##续 +##绮 +##绯 +##绰 +##绳 +##维 +##绵 +##绶 +##绷 +##绸 +##绻 +##综 +##绽 +##绾 +##绿 +##缀 +##缄 +##缅 +##缆 +##缇 +##缈 +##缉 +##缎 +##缓 +##缔 +##缕 +##编 +##缘 +##缙 +##缚 +##缜 +##缝 +##缠 +##缢 +##缤 +##缥 +##缨 +##缩 +##缪 +##缭 +##缮 +##缰 +##缱 +##缴 +##缸 +##缺 +##缽 +##罂 +##罄 +##罌 +##罐 +##网 +##罔 +##罕 +##罗 +##罚 +##罡 +##罢 +##罩 +##罪 +##置 +##罰 +##署 +##罵 +##罷 +##罹 +##羁 +##羅 +##羈 +##羊 +##羌 +##美 +##羔 +##羚 +##羞 +##羟 +##羡 +##羣 +##群 +##羥 +##羧 +##羨 +##義 +##羯 +##羲 +##羸 +##羹 +##羽 +##羿 +##翁 +##翅 +##翊 
+##翌 +##翎 +##習 +##翔 +##翘 +##翟 +##翠 +##翡 +##翦 +##翩 +##翰 +##翱 +##翳 +##翹 +##翻 +##翼 +##耀 +##老 +##考 +##耄 +##者 +##耆 +##耋 +##而 +##耍 +##耐 +##耒 +##耕 +##耗 +##耘 +##耙 +##耦 +##耨 +##耳 +##耶 +##耷 +##耸 +##耻 +##耽 +##耿 +##聂 +##聆 +##聊 +##聋 +##职 +##聒 +##联 +##聖 +##聘 +##聚 +##聞 +##聪 +##聯 +##聰 +##聲 +##聳 +##聴 +##聶 +##職 +##聽 +##聾 +##聿 +##肃 +##肄 +##肅 +##肆 +##肇 +##肉 +##肋 +##肌 +##肏 +##肓 +##肖 +##肘 +##肚 +##肛 +##肝 +##肠 +##股 +##肢 +##肤 +##肥 +##肩 +##肪 +##肮 +##肯 +##肱 +##育 +##肴 +##肺 +##肽 +##肾 +##肿 +##胀 +##胁 +##胃 +##胄 +##胆 +##背 +##胍 +##胎 +##胖 +##胚 +##胛 +##胜 +##胝 +##胞 +##胡 +##胤 +##胥 +##胧 +##胫 +##胭 +##胯 +##胰 +##胱 +##胳 +##胴 +##胶 +##胸 +##胺 +##能 +##脂 +##脅 +##脆 +##脇 +##脈 +##脉 +##脊 +##脍 +##脏 +##脐 +##脑 +##脓 +##脖 +##脘 +##脚 +##脛 +##脣 +##脩 +##脫 +##脯 +##脱 +##脲 +##脳 +##脸 +##脹 +##脾 +##腆 +##腈 +##腊 +##腋 +##腌 +##腎 +##腐 +##腑 +##腓 +##腔 +##腕 +##腥 +##腦 +##腩 +##腫 +##腭 +##腮 +##腰 +##腱 +##腳 +##腴 +##腸 +##腹 +##腺 +##腻 +##腼 +##腾 +##腿 +##膀 +##膈 +##膊 +##膏 +##膑 +##膘 +##膚 +##膛 +##膜 +##膝 +##膠 +##膦 +##膨 +##膩 +##膳 +##膺 +##膻 +##膽 +##膾 +##膿 +##臀 +##臂 +##臃 +##臆 +##臉 +##臊 +##臍 +##臓 +##臘 +##臟 +##臣 +##臥 +##臧 +##臨 +##自 +##臬 +##臭 +##至 +##致 +##臺 +##臻 +##臼 +##臾 +##舀 +##舂 +##舅 +##舆 +##與 +##興 +##舉 +##舊 +##舌 +##舍 +##舎 +##舐 +##舒 +##舔 +##舖 +##舗 +##舛 +##舜 +##舞 +##舟 +##航 +##舫 +##般 +##舰 +##舱 +##舵 +##舶 +##舷 +##舸 +##船 +##舺 +##舾 +##艇 +##艋 +##艘 +##艙 +##艦 +##艮 +##良 +##艰 +##艱 +##色 +##艳 +##艷 +##艹 +##艺 +##艾 +##节 +##芃 +##芈 +##芊 +##芋 +##芍 +##芎 +##芒 +##芙 +##芜 +##芝 +##芡 +##芥 +##芦 +##芩 +##芪 +##芫 +##芬 +##芭 +##芮 +##芯 +##花 +##芳 +##芷 +##芸 +##芹 +##芻 +##芽 +##芾 +##苁 +##苄 +##苇 +##苋 +##苍 +##苏 +##苑 +##苒 +##苓 +##苔 +##苕 +##苗 +##苛 +##苜 +##苞 +##苟 +##苡 +##苣 +##若 +##苦 +##苫 +##苯 +##英 +##苷 +##苹 +##苻 +##茁 +##茂 +##范 +##茄 +##茅 +##茉 +##茎 +##茏 +##茗 +##茜 +##茧 +##茨 +##茫 +##茬 +##茭 +##茯 +##茱 +##茲 +##茴 +##茵 +##茶 +##茸 +##茹 +##茼 +##荀 +##荃 +##荆 +##草 +##荊 +##荏 +##荐 +##荒 +##荔 +##荖 +##荘 +##荚 +##荞 +##荟 +##荠 +##荡 +##荣 +##荤 +##荥 +##荧 +##荨 +##荪 +##荫 +##药 +##荳 +##荷 +##荸 +##荻 +##荼 +##荽 +##莅 +##莆 +##莉 +##莊 +##莎 +##莒 +##莓 +##莖 +##莘 +##莞 +##莠 +##莢 +##莧 +##莪 +##莫 +##莱 +##莲 +##莴 +##获 +##莹 +##莺 +##莽 +##莿 +##菀 +##菁 +##菅 +##菇 +##菈 +##菊 +##菌 +##菏 +##菓 +##菖 +##菘 +##菜 +##菟 +##菠 +##菡 +##菩 +##華 +##菱 +##菲 +##菸 +##菽 +##萁 +##萃 +##萄 +##萊 +##萋 +##萌 +##萍 +##萎 +##萘 +##萝 +##萤 +##营 +##萦 +##萧 +##萨 +##萩 +##萬 +##萱 +##萵 +##萸 +##萼 +##落 +##葆 +##葉 +##著 +##葚 +##葛 +##葡 +##董 +##葦 +##葩 +##葫 +##葬 +##葭 +##葯 +##葱 +##葳 +##葵 +##葷 +##葺 +##蒂 +##蒋 +##蒐 +##蒔 +##蒙 +##蒜 +##蒞 +##蒟 +##蒡 +##蒨 +##蒲 +##蒸 +##蒹 +##蒻 +##蒼 +##蒿 +##蓁 +##蓄 +##蓆 +##蓉 +##蓋 +##蓑 +##蓓 +##蓖 +##蓝 +##蓟 +##蓦 +##蓬 +##蓮 +##蓼 +##蓿 +##蔑 +##蔓 +##蔔 +##蔗 +##蔘 +##蔚 +##蔡 +##蔣 +##蔥 +##蔫 +##蔬 +##蔭 +##蔵 +##蔷 +##蔺 +##蔻 +##蔼 +##蔽 +##蕁 +##蕃 +##蕈 +##蕉 +##蕊 +##蕎 +##蕙 +##蕤 +##蕨 +##蕩 +##蕪 +##蕭 +##蕲 +##蕴 +##蕻 +##蕾 +##薄 +##薅 +##薇 +##薈 +##薊 +##薏 +##薑 +##薔 +##薙 +##薛 +##薦 +##薨 +##薩 +##薪 +##薬 +##薯 +##薰 +##薹 +##藉 +##藍 +##藏 +##藐 +##藓 +##藕 +##藜 +##藝 +##藤 +##藥 +##藩 +##藹 +##藻 +##藿 +##蘆 +##蘇 +##蘊 +##蘋 +##蘑 +##蘚 +##蘭 +##蘸 +##蘼 +##蘿 +##虎 +##虏 +##虐 +##虑 +##虔 +##處 +##虚 +##虛 +##虜 +##虞 +##號 +##虢 +##虧 +##虫 +##虬 +##虱 +##虹 +##虻 +##虽 +##虾 +##蚀 +##蚁 +##蚂 +##蚊 +##蚌 +##蚓 +##蚕 +##蚜 +##蚝 +##蚣 +##蚤 +##蚩 +##蚪 +##蚯 +##蚱 +##蚵 +##蛀 +##蛆 +##蛇 +##蛊 +##蛋 +##蛎 +##蛐 +##蛔 +##蛙 +##蛛 +##蛟 +##蛤 +##蛭 +##蛮 +##蛰 +##蛳 +##蛹 +##蛻 +##蛾 +##蜀 +##蜂 +##蜃 +##蜆 +##蜇 +##蜈 +##蜊 +##蜍 +##蜒 +##蜓 +##蜕 +##蜗 +##蜘 +##蜚 +##蜜 +##蜡 +##蜢 +##蜥 +##蜱 +##蜴 +##蜷 +##蜻 +##蜿 +##蝇 +##蝈 +##蝉 +##蝌 +##蝎 +##蝕 +##蝗 +##蝙 +##蝟 +##蝠 +##蝦 +##蝨 +##蝴 +##蝶 +##蝸 +##蝼 +##螂 +##螃 +##融 +##螞 +##螢 +##螨 +##螯 +##螳 +##螺 +##蟀 +##蟄 +##蟆 +##蟋 +##蟎 +##蟑 +##蟒 +##蟠 +##蟬 +##蟲 +##蟹 +##蟻 +##蟾 +##蠅 +##蠍 +##蠔 +##蠕 +##蠛 +##蠟 +##蠡 +##蠢 +##蠣 +##蠱 +##蠶 +##蠹 +##蠻 +##血 +##衄 +##衅 +##衆 +##行 +##衍 +##術 +##衔 +##街 +##衙 +##衛 +##衝 +##衞 +##衡 +##衢 +##衣 
+##补 +##表 +##衩 +##衫 +##衬 +##衮 +##衰 +##衲 +##衷 +##衹 +##衾 +##衿 +##袁 +##袂 +##袄 +##袅 +##袈 +##袋 +##袍 +##袒 +##袖 +##袜 +##袞 +##袤 +##袪 +##被 +##袭 +##袱 +##裁 +##裂 +##装 +##裆 +##裊 +##裏 +##裔 +##裕 +##裘 +##裙 +##補 +##裝 +##裟 +##裡 +##裤 +##裨 +##裱 +##裳 +##裴 +##裸 +##裹 +##製 +##裾 +##褂 +##複 +##褐 +##褒 +##褓 +##褔 +##褚 +##褥 +##褪 +##褫 +##褲 +##褶 +##褻 +##襁 +##襄 +##襟 +##襠 +##襪 +##襬 +##襯 +##襲 +##西 +##要 +##覃 +##覆 +##覇 +##見 +##規 +##覓 +##視 +##覚 +##覦 +##覧 +##親 +##覬 +##観 +##覷 +##覺 +##覽 +##觀 +##见 +##观 +##规 +##觅 +##视 +##览 +##觉 +##觊 +##觎 +##觐 +##觑 +##角 +##觞 +##解 +##觥 +##触 +##觸 +##言 +##訂 +##計 +##訊 +##討 +##訓 +##訕 +##訖 +##託 +##記 +##訛 +##訝 +##訟 +##訣 +##訥 +##訪 +##設 +##許 +##訳 +##訴 +##訶 +##診 +##註 +##証 +##詆 +##詐 +##詔 +##評 +##詛 +##詞 +##詠 +##詡 +##詢 +##詣 +##試 +##詩 +##詫 +##詬 +##詭 +##詮 +##詰 +##話 +##該 +##詳 +##詹 +##詼 +##誅 +##誇 +##誉 +##誌 +##認 +##誓 +##誕 +##誘 +##語 +##誠 +##誡 +##誣 +##誤 +##誥 +##誦 +##誨 +##說 +##説 +##読 +##誰 +##課 +##誹 +##誼 +##調 +##諄 +##談 +##請 +##諏 +##諒 +##論 +##諗 +##諜 +##諡 +##諦 +##諧 +##諫 +##諭 +##諮 +##諱 +##諳 +##諷 +##諸 +##諺 +##諾 +##謀 +##謁 +##謂 +##謄 +##謊 +##謎 +##謐 +##謔 +##謗 +##謙 +##講 +##謝 +##謠 +##謨 +##謬 +##謹 +##謾 +##譁 +##證 +##譎 +##譏 +##識 +##譙 +##譚 +##譜 +##警 +##譬 +##譯 +##議 +##譲 +##譴 +##護 +##譽 +##讀 +##變 +##讓 +##讚 +##讞 +##计 +##订 +##认 +##讥 +##讧 +##讨 +##让 +##讪 +##讫 +##训 +##议 +##讯 +##记 +##讲 +##讳 +##讴 +##讶 +##讷 +##许 +##讹 +##论 +##讼 +##讽 +##设 +##访 +##诀 +##证 +##诃 +##评 +##诅 +##识 +##诈 +##诉 +##诊 +##诋 +##词 +##诏 +##译 +##试 +##诗 +##诘 +##诙 +##诚 +##诛 +##话 +##诞 +##诟 +##诠 +##诡 +##询 +##诣 +##诤 +##该 +##详 +##诧 +##诩 +##诫 +##诬 +##语 +##误 +##诰 +##诱 +##诲 +##说 +##诵 +##诶 +##请 +##诸 +##诺 +##读 +##诽 +##课 +##诿 +##谀 +##谁 +##调 +##谄 +##谅 +##谆 +##谈 +##谊 +##谋 +##谌 +##谍 +##谎 +##谏 +##谐 +##谑 +##谒 +##谓 +##谔 +##谕 +##谗 +##谘 +##谙 +##谚 +##谛 +##谜 +##谟 +##谢 +##谣 +##谤 +##谥 +##谦 +##谧 +##谨 +##谩 +##谪 +##谬 +##谭 +##谯 +##谱 +##谲 +##谴 +##谶 +##谷 +##豁 +##豆 +##豇 +##豈 +##豉 +##豊 +##豌 +##豎 +##豐 +##豔 +##豚 +##象 +##豢 +##豪 +##豫 +##豬 +##豹 +##豺 +##貂 +##貅 +##貌 +##貓 +##貔 +##貘 +##貝 +##貞 +##負 +##財 +##貢 +##貧 +##貨 +##販 +##貪 +##貫 +##責 +##貯 +##貰 +##貳 +##貴 +##貶 +##買 +##貸 +##費 +##貼 +##貽 +##貿 +##賀 +##賁 +##賂 +##賃 +##賄 +##資 +##賈 +##賊 +##賑 +##賓 +##賜 +##賞 +##賠 +##賡 +##賢 +##賣 +##賤 +##賦 +##質 +##賬 +##賭 +##賴 +##賺 +##購 +##賽 +##贅 +##贈 +##贊 +##贍 +##贏 +##贓 +##贖 +##贛 +##贝 +##贞 +##负 +##贡 +##财 +##责 +##贤 +##败 +##账 +##货 +##质 +##贩 +##贪 +##贫 +##贬 +##购 +##贮 +##贯 +##贰 +##贱 +##贲 +##贴 +##贵 +##贷 +##贸 +##费 +##贺 +##贻 +##贼 +##贾 +##贿 +##赁 +##赂 +##赃 +##资 +##赅 +##赈 +##赊 +##赋 +##赌 +##赎 +##赏 +##赐 +##赓 +##赔 +##赖 +##赘 +##赚 +##赛 +##赝 +##赞 +##赠 +##赡 +##赢 +##赣 +##赤 +##赦 +##赧 +##赫 +##赭 +##走 +##赳 +##赴 +##赵 +##赶 +##起 +##趁 +##超 +##越 +##趋 +##趕 +##趙 +##趟 +##趣 +##趨 +##足 +##趴 +##趵 +##趸 +##趺 +##趾 +##跃 +##跄 +##跆 +##跋 +##跌 +##跎 +##跑 +##跖 +##跚 +##跛 +##距 +##跟 +##跡 +##跤 +##跨 +##跩 +##跪 +##路 +##跳 +##践 +##跷 +##跹 +##跺 +##跻 +##踉 +##踊 +##踌 +##踏 +##踐 +##踝 +##踞 +##踟 +##踢 +##踩 +##踪 +##踮 +##踱 +##踴 +##踵 +##踹 +##蹂 +##蹄 +##蹇 +##蹈 +##蹉 +##蹊 +##蹋 +##蹑 +##蹒 +##蹙 +##蹟 +##蹣 +##蹤 +##蹦 +##蹩 +##蹬 +##蹭 +##蹲 +##蹴 +##蹶 +##蹺 +##蹼 +##蹿 +##躁 +##躇 +##躉 +##躊 +##躋 +##躍 +##躏 +##躪 +##身 +##躬 +##躯 +##躲 +##躺 +##軀 +##車 +##軋 +##軌 +##軍 +##軒 +##軟 +##転 +##軸 +##軼 +##軽 +##軾 +##較 +##載 +##輒 +##輓 +##輔 +##輕 +##輛 +##輝 +##輟 +##輩 +##輪 +##輯 +##輸 +##輻 +##輾 +##輿 +##轄 +##轅 +##轆 +##轉 +##轍 +##轎 +##轟 +##车 +##轧 +##轨 +##轩 +##转 +##轭 +##轮 +##软 +##轰 +##轲 +##轴 +##轶 +##轻 +##轼 +##载 +##轿 +##较 +##辄 +##辅 +##辆 +##辇 +##辈 +##辉 +##辊 +##辍 +##辐 +##辑 +##输 +##辕 +##辖 +##辗 +##辘 +##辙 +##辛 +##辜 +##辞 +##辟 +##辣 +##辦 +##辨 +##辩 +##辫 +##辭 +##辮 +##辯 +##辰 +##辱 +##農 +##边 +##辺 +##辻 +##込 +##辽 +##达 +##迁 +##迂 +##迄 +##迅 +##过 +##迈 +##迎 +##运 +##近 +##返 +##还 +##这 +##进 +##远 +##违 +##连 +##迟 +##迢 +##迤 +##迥 +##迦 +##迩 +##迪 +##迫 +##迭 +##述 +##迴 +##迷 +##迸 +##迹 +##迺 +##追 +##退 +##送 
+##适 +##逃 +##逅 +##逆 +##选 +##逊 +##逍 +##透 +##逐 +##递 +##途 +##逕 +##逗 +##這 +##通 +##逛 +##逝 +##逞 +##速 +##造 +##逢 +##連 +##逮 +##週 +##進 +##逵 +##逶 +##逸 +##逻 +##逼 +##逾 +##遁 +##遂 +##遅 +##遇 +##遊 +##運 +##遍 +##過 +##遏 +##遐 +##遑 +##遒 +##道 +##達 +##違 +##遗 +##遙 +##遛 +##遜 +##遞 +##遠 +##遢 +##遣 +##遥 +##遨 +##適 +##遭 +##遮 +##遲 +##遴 +##遵 +##遶 +##遷 +##選 +##遺 +##遼 +##遽 +##避 +##邀 +##邁 +##邂 +##邃 +##還 +##邇 +##邈 +##邊 +##邋 +##邏 +##邑 +##邓 +##邕 +##邛 +##邝 +##邢 +##那 +##邦 +##邨 +##邪 +##邬 +##邮 +##邯 +##邰 +##邱 +##邳 +##邵 +##邸 +##邹 +##邺 +##邻 +##郁 +##郅 +##郊 +##郎 +##郑 +##郜 +##郝 +##郡 +##郢 +##郤 +##郦 +##郧 +##部 +##郫 +##郭 +##郴 +##郵 +##郷 +##郸 +##都 +##鄂 +##鄉 +##鄒 +##鄔 +##鄙 +##鄞 +##鄢 +##鄧 +##鄭 +##鄰 +##鄱 +##鄲 +##鄺 +##酉 +##酊 +##酋 +##酌 +##配 +##酐 +##酒 +##酗 +##酚 +##酝 +##酢 +##酣 +##酥 +##酩 +##酪 +##酬 +##酮 +##酯 +##酰 +##酱 +##酵 +##酶 +##酷 +##酸 +##酿 +##醃 +##醇 +##醉 +##醋 +##醍 +##醐 +##醒 +##醚 +##醛 +##醜 +##醞 +##醣 +##醪 +##醫 +##醬 +##醮 +##醯 +##醴 +##醺 +##釀 +##釁 +##采 +##釉 +##释 +##釋 +##里 +##重 +##野 +##量 +##釐 +##金 +##釗 +##釘 +##釜 +##針 +##釣 +##釦 +##釧 +##釵 +##鈀 +##鈉 +##鈍 +##鈎 +##鈔 +##鈕 +##鈞 +##鈣 +##鈦 +##鈪 +##鈴 +##鈺 +##鈾 +##鉀 +##鉄 +##鉅 +##鉉 +##鉑 +##鉗 +##鉚 +##鉛 +##鉤 +##鉴 +##鉻 +##銀 +##銃 +##銅 +##銑 +##銓 +##銖 +##銘 +##銜 +##銬 +##銭 +##銮 +##銳 +##銷 +##銹 +##鋁 +##鋅 +##鋒 +##鋤 +##鋪 +##鋰 +##鋸 +##鋼 +##錄 +##錐 +##錘 +##錚 +##錠 +##錢 +##錦 +##錨 +##錫 +##錮 +##錯 +##録 +##錳 +##錶 +##鍊 +##鍋 +##鍍 +##鍛 +##鍥 +##鍰 +##鍵 +##鍺 +##鍾 +##鎂 +##鎊 +##鎌 +##鎏 +##鎔 +##鎖 +##鎗 +##鎚 +##鎧 +##鎬 +##鎮 +##鎳 +##鏈 +##鏖 +##鏗 +##鏘 +##鏞 +##鏟 +##鏡 +##鏢 +##鏤 +##鏽 +##鐘 +##鐮 +##鐲 +##鐳 +##鐵 +##鐸 +##鐺 +##鑄 +##鑊 +##鑑 +##鑒 +##鑣 +##鑫 +##鑰 +##鑲 +##鑼 +##鑽 +##鑾 +##鑿 +##针 +##钉 +##钊 +##钎 +##钏 +##钒 +##钓 +##钗 +##钙 +##钛 +##钜 +##钝 +##钞 +##钟 +##钠 +##钡 +##钢 +##钣 +##钤 +##钥 +##钦 +##钧 +##钨 +##钩 +##钮 +##钯 +##钰 +##钱 +##钳 +##钴 +##钵 +##钺 +##钻 +##钼 +##钾 +##钿 +##铀 +##铁 +##铂 +##铃 +##铄 +##铅 +##铆 +##铉 +##铎 +##铐 +##铛 +##铜 +##铝 +##铠 +##铡 +##铢 +##铣 +##铤 +##铨 +##铩 +##铬 +##铭 +##铮 +##铰 +##铲 +##铵 +##银 +##铸 +##铺 +##链 +##铿 +##销 +##锁 +##锂 +##锄 +##锅 +##锆 +##锈 +##锉 +##锋 +##锌 +##锏 +##锐 +##锑 +##错 +##锚 +##锟 +##锡 +##锢 +##锣 +##锤 +##锥 +##锦 +##锭 +##键 +##锯 +##锰 +##锲 +##锵 +##锹 +##锺 +##锻 +##镀 +##镁 +##镂 +##镇 +##镉 +##镌 +##镍 +##镐 +##镑 +##镕 +##镖 +##镗 +##镛 +##镜 +##镣 +##镭 +##镯 +##镰 +##镳 +##镶 +##長 +##长 +##門 +##閃 +##閉 +##開 +##閎 +##閏 +##閑 +##閒 +##間 +##閔 +##閘 +##閡 +##関 +##閣 +##閥 +##閨 +##閩 +##閱 +##閲 +##閹 +##閻 +##閾 +##闆 +##闇 +##闊 +##闌 +##闍 +##闔 +##闕 +##闖 +##闘 +##關 +##闡 +##闢 +##门 +##闪 +##闫 +##闭 +##问 +##闯 +##闰 +##闲 +##间 +##闵 +##闷 +##闸 +##闹 +##闺 +##闻 +##闽 +##闾 +##阀 +##阁 +##阂 +##阅 +##阆 +##阇 +##阈 +##阉 +##阎 +##阐 +##阑 +##阔 +##阕 +##阖 +##阙 +##阚 +##阜 +##队 +##阡 +##阪 +##阮 +##阱 +##防 +##阳 +##阴 +##阵 +##阶 +##阻 +##阿 +##陀 +##陂 +##附 +##际 +##陆 +##陇 +##陈 +##陋 +##陌 +##降 +##限 +##陕 +##陛 +##陝 +##陞 +##陟 +##陡 +##院 +##陣 +##除 +##陨 +##险 +##陪 +##陰 +##陲 +##陳 +##陵 +##陶 +##陷 +##陸 +##険 +##陽 +##隅 +##隆 +##隈 +##隊 +##隋 +##隍 +##階 +##随 +##隐 +##隔 +##隕 +##隘 +##隙 +##際 +##障 +##隠 +##隣 +##隧 +##隨 +##險 +##隱 +##隴 +##隶 +##隸 +##隻 +##隼 +##隽 +##难 +##雀 +##雁 +##雄 +##雅 +##集 +##雇 +##雉 +##雋 +##雌 +##雍 +##雎 +##雏 +##雑 +##雒 +##雕 +##雖 +##雙 +##雛 +##雜 +##雞 +##離 +##難 +##雨 +##雪 +##雯 +##雰 +##雲 +##雳 +##零 +##雷 +##雹 +##電 +##雾 +##需 +##霁 +##霄 +##霆 +##震 +##霈 +##霉 +##霊 +##霍 +##霎 +##霏 +##霑 +##霓 +##霖 +##霜 +##霞 +##霧 +##霭 +##霰 +##露 +##霸 +##霹 +##霽 +##霾 +##靂 +##靄 +##靈 +##青 +##靓 +##靖 +##静 +##靚 +##靛 +##靜 +##非 +##靠 +##靡 +##面 +##靥 +##靦 +##革 +##靳 +##靴 +##靶 +##靼 +##鞅 +##鞋 +##鞍 +##鞏 +##鞑 +##鞘 +##鞠 +##鞣 +##鞦 +##鞭 +##韆 +##韋 +##韌 +##韓 +##韜 +##韦 +##韧 +##韩 +##韬 +##韭 +##音 +##韵 +##韶 +##韻 +##響 +##頁 +##頂 +##頃 +##項 +##順 +##須 +##頌 +##預 +##頑 +##頒 +##頓 +##頗 +##領 +##頜 +##頡 +##頤 +##頫 +##頭 +##頰 +##頷 +##頸 +##頹 +##頻 +##頼 +##顆 +##題 +##額 +##顎 +##顏 +##顔 +##願 +##顛 +##類 +##顧 +##顫 +##顯 +##顱 +##顴 +##页 +##顶 +##顷 
+##项 +##顺 +##须 +##顼 +##顽 +##顾 +##顿 +##颁 +##颂 +##预 +##颅 +##领 +##颇 +##颈 +##颉 +##颊 +##颌 +##颍 +##颐 +##频 +##颓 +##颔 +##颖 +##颗 +##题 +##颚 +##颛 +##颜 +##额 +##颞 +##颠 +##颡 +##颢 +##颤 +##颦 +##颧 +##風 +##颯 +##颱 +##颳 +##颶 +##颼 +##飄 +##飆 +##风 +##飒 +##飓 +##飕 +##飘 +##飙 +##飚 +##飛 +##飞 +##食 +##飢 +##飨 +##飩 +##飪 +##飯 +##飲 +##飼 +##飽 +##飾 +##餃 +##餅 +##餉 +##養 +##餌 +##餐 +##餒 +##餓 +##餘 +##餚 +##餛 +##餞 +##餡 +##館 +##餮 +##餵 +##餾 +##饅 +##饈 +##饋 +##饌 +##饍 +##饑 +##饒 +##饕 +##饗 +##饞 +##饥 +##饨 +##饪 +##饬 +##饭 +##饮 +##饯 +##饰 +##饱 +##饲 +##饴 +##饵 +##饶 +##饷 +##饺 +##饼 +##饽 +##饿 +##馀 +##馁 +##馄 +##馅 +##馆 +##馈 +##馋 +##馍 +##馏 +##馒 +##馔 +##首 +##馗 +##香 +##馥 +##馨 +##馬 +##馭 +##馮 +##馳 +##馴 +##駁 +##駄 +##駅 +##駆 +##駐 +##駒 +##駕 +##駛 +##駝 +##駭 +##駱 +##駿 +##騁 +##騎 +##騏 +##験 +##騙 +##騨 +##騰 +##騷 +##驀 +##驅 +##驊 +##驍 +##驒 +##驕 +##驗 +##驚 +##驛 +##驟 +##驢 +##驥 +##马 +##驭 +##驮 +##驯 +##驰 +##驱 +##驳 +##驴 +##驶 +##驷 +##驸 +##驹 +##驻 +##驼 +##驾 +##驿 +##骁 +##骂 +##骄 +##骅 +##骆 +##骇 +##骈 +##骊 +##骋 +##验 +##骏 +##骐 +##骑 +##骗 +##骚 +##骛 +##骜 +##骞 +##骠 +##骡 +##骤 +##骥 +##骧 +##骨 +##骯 +##骰 +##骶 +##骷 +##骸 +##骼 +##髂 +##髅 +##髋 +##髏 +##髒 +##髓 +##體 +##髖 +##高 +##髦 +##髪 +##髮 +##髯 +##髻 +##鬃 +##鬆 +##鬍 +##鬓 +##鬚 +##鬟 +##鬢 +##鬣 +##鬥 +##鬧 +##鬱 +##鬼 +##魁 +##魂 +##魄 +##魅 +##魇 +##魍 +##魏 +##魔 +##魘 +##魚 +##魯 +##魷 +##鮑 +##鮨 +##鮪 +##鮭 +##鮮 +##鯉 +##鯊 +##鯖 +##鯛 +##鯨 +##鯰 +##鯽 +##鰍 +##鰓 +##鰭 +##鰲 +##鰻 +##鰾 +##鱈 +##鱉 +##鱔 +##鱗 +##鱷 +##鱸 +##鱼 +##鱿 +##鲁 +##鲈 +##鲍 +##鲑 +##鲛 +##鲜 +##鲟 +##鲢 +##鲤 +##鲨 +##鲫 +##鲱 +##鲲 +##鲶 +##鲷 +##鲸 +##鳃 +##鳄 +##鳅 +##鳌 +##鳍 +##鳕 +##鳖 +##鳗 +##鳝 +##鳞 +##鳥 +##鳩 +##鳳 +##鳴 +##鳶 +##鴉 +##鴕 +##鴛 +##鴦 +##鴨 +##鴻 +##鴿 +##鵑 +##鵜 +##鵝 +##鵡 +##鵬 +##鵰 +##鵲 +##鶘 +##鶩 +##鶯 +##鶴 +##鷗 +##鷲 +##鷹 +##鷺 +##鸚 +##鸞 +##鸟 +##鸠 +##鸡 +##鸢 +##鸣 +##鸥 +##鸦 +##鸨 +##鸪 +##鸭 +##鸯 +##鸳 +##鸵 +##鸽 +##鸾 +##鸿 +##鹂 +##鹃 +##鹄 +##鹅 +##鹈 +##鹉 +##鹊 +##鹌 +##鹏 +##鹑 +##鹕 +##鹘 +##鹜 +##鹞 +##鹤 +##鹦 +##鹧 +##鹫 +##鹭 +##鹰 +##鹳 +##鹵 +##鹹 +##鹼 +##鹽 +##鹿 +##麂 +##麋 +##麒 +##麓 +##麗 +##麝 +##麟 +##麥 +##麦 +##麩 +##麴 +##麵 +##麸 +##麺 +##麻 +##麼 +##麽 +##麾 +##黃 +##黄 +##黍 +##黎 +##黏 +##黑 +##黒 +##黔 +##默 +##黛 +##黜 +##黝 +##點 +##黠 +##黨 +##黯 +##黴 +##鼋 +##鼎 +##鼐 +##鼓 +##鼠 +##鼬 +##鼹 +##鼻 +##鼾 +##齁 +##齊 +##齋 +##齐 +##齒 +##齡 +##齢 +##齣 +##齦 +##齿 +##龄 +##龅 +##龈 +##龊 +##龋 +##龌 +##龍 +##龐 +##龔 +##龕 +##龙 +##龚 +##龛 +##龜 +##龟 +##︰ +##︱ +##︶ +##︿ +##﹁ +##﹂ +##﹍ +##﹏ +##﹐ +##﹑ +##﹒ +##﹔ +##﹕ +##﹖ +##﹗ +##﹙ +##﹚ +##﹝ +##﹞ +##﹡ +##﹣ +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##, +##- +##. +##/ +##: +##; +##< +##? +##@ +##[ +##\ +##] +##^ +##_ +##` +##f +##h +##j +##u +##w +##z +##{ +##} +##。 +##「 +##」 +##、 +##・ +##ッ +##ー +##イ +##ク +##シ +##ス +##ト +##ノ +##フ +##ラ +##ル +##ン +##゙ +##゚ +## ̄ +##¥ +##👍 +##🔥 +##😂 +##😎 diff --git a/baselines/models/albert/bert_utils.py b/baselines/models/albert/bert_utils.py new file mode 100755 index 0000000..9403716 --- /dev/null +++ b/baselines/models/albert/bert_utils.py @@ -0,0 +1,143 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import six +import tensorflow as tf + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. 
All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + +# add sequence mask for: +# 1. random shuffle lm modeling---xlnet with random shuffled input +# 2. left2right and right2left language modeling +# 3. 
conditional generation +def generate_seq2seq_mask(attention_mask, mask_sequence, seq_type, **kargs): + if seq_type == 'seq2seq': + if mask_sequence is not None: + seq_shape = get_shape_list(mask_sequence, expected_rank=2) + seq_len = seq_shape[1] + ones = tf.ones((1, seq_len, seq_len)) + a_mask = tf.matrix_band_part(ones, -1, 0) + s_ex12 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 2) + s_ex13 = tf.expand_dims(tf.expand_dims(mask_sequence, 1), 3) + a_mask = (1 - s_ex13) * (1 - s_ex12) + s_ex13 * a_mask + # generate mask of batch x seq_len x seq_len + a_mask = tf.reshape(a_mask, (-1, seq_len, seq_len)) + out_mask = attention_mask * a_mask + else: + ones = tf.ones_like(attention_mask[:1]) + mask = (tf.matrix_band_part(ones, -1, 0)) + out_mask = attention_mask * mask + else: + out_mask = attention_mask + + return out_mask + diff --git a/baselines/models/albert/create_pretrain_data.sh b/baselines/models/albert/create_pretrain_data.sh new file mode 100755 index 0000000..b7185f1 --- /dev/null +++ b/baselines/models/albert/create_pretrain_data.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +BERT_BASE_DIR=./albert_config +python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=data/news_zh_1.txt \ +--output_file=data/tf_news_2016_zh_raw_news2016zh_1.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \ +--max_seq_length=512 --max_predictions_per_seq=51 --masked_lm_prob=0.10 \ No newline at end of file diff --git a/baselines/models/albert/create_pretraining_data.py b/baselines/models/albert/create_pretraining_data.py new file mode 100755 index 0000000..86b6317 --- /dev/null +++ b/baselines/models/albert/create_pretraining_data.py @@ -0,0 +1,708 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf +import jieba +import re +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_bool(
+    "do_whole_word_mask", False,
+    "Whether to use whole word masking rather than per-WordPiece masking.")
+
+flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.")
+
+flags.DEFINE_integer("max_predictions_per_seq", 20,
+                     "Maximum number of masked LM predictions per sequence.")
+
+flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.")
+
+flags.DEFINE_integer(
+    "dupe_factor", 10,
+    "Number of times to duplicate the input data (with different masks).")
+
+flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.")
+
+flags.DEFINE_float(
+    "short_seq_prob", 0.1,
+    "Probability of creating sequences which are shorter than the "
+    "maximum length.")
+
+flags.DEFINE_bool("non_chinese", False,
+                  "Manually set this to True if you are not running a Chinese pre-training task.")
+
+
+class TrainingInstance(object):
+  """A single training instance (sentence pair)."""
+
+  def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels,
+               is_random_next):
+    self.tokens = tokens
+    self.segment_ids = segment_ids
+    self.is_random_next = is_random_next
+    self.masked_lm_positions = masked_lm_positions
+    self.masked_lm_labels = masked_lm_labels
+
+  def __str__(self):
+    s = ""
+    s += "tokens: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.tokens]))
+    s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids]))
+    s += "is_random_next: %s\n" % self.is_random_next
+    s += "masked_lm_positions: %s\n" % (" ".join(
+        [str(x) for x in self.masked_lm_positions]))
+    s += "masked_lm_labels: %s\n" % (" ".join(
+        [tokenization.printable_text(x) for x in self.masked_lm_labels]))
+    s += "\n"
+    return s
+
+  def __repr__(self):
+    return self.__str__()
+
+
+def write_instance_to_example_files(instances, tokenizer, max_seq_length,
+                                    max_predictions_per_seq, output_files):
+  """Create TF example files from `TrainingInstance`s."""
+  writers = []
+  for output_file in output_files:
+    writers.append(tf.python_io.TFRecordWriter(output_file))
+
+  writer_index = 0
+
+  total_written = 0
+  for (inst_index, instance) in enumerate(instances):
+    input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)
+    input_mask = [1] * len(input_ids)
+    segment_ids = list(instance.segment_ids)
+    assert len(input_ids) <= max_seq_length
+
+    while len(input_ids) < max_seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      segment_ids.append(0)
+
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+
+    masked_lm_positions = list(instance.masked_lm_positions)
+    masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)
+    masked_lm_weights = [1.0] * len(masked_lm_ids)
+
+    while len(masked_lm_positions) < max_predictions_per_seq:
+      masked_lm_positions.append(0)
+      masked_lm_ids.append(0)
+      masked_lm_weights.append(0.0)
+
+    next_sentence_label = 1 if instance.is_random_next else 0
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(input_ids)
+    features["input_mask"] = create_int_feature(input_mask)
+    features["segment_ids"] = create_int_feature(segment_ids)
+    features["masked_lm_positions"] = create_int_feature(masked_lm_positions)
+    features["masked_lm_ids"] = create_int_feature(masked_lm_ids)
+    features["masked_lm_weights"] = create_float_feature(masked_lm_weights)
+    features["next_sentence_labels"] = create_int_feature([next_sentence_label])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+
+    writers[writer_index].write(tf_example.SerializeToString())
+    writer_index = (writer_index + 1) % len(writers)
+
+    total_written += 1
+
+    if inst_index < 20:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in instance.tokens]))
+
+      for feature_name in features.keys():
+        feature = features[feature_name]
+        values = []
+        if feature.int64_list.value:
+          values = feature.int64_list.value
+        elif feature.float_list.value:
+          values = feature.float_list.value
+        tf.logging.info(
+            "%s: %s" % (feature_name, " ".join([str(x) for x in values])))
+
+  for writer in writers:
+    writer.close()
+
+  tf.logging.info("Wrote %d total instances", total_written)
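+
+
+# A minimal sketch for sanity-checking the records written above; the helper
+# below is illustrative only (its name is not part of the original pipeline)
+# and reuses the same TF1 tf.python_io API as the writer.
+def _spot_check_tfrecord(tfrecord_path):
+  """Logs the input_ids of the first serialized instance in a TFRecord file."""
+  for record in tf.python_io.tf_record_iterator(tfrecord_path):
+    example = tf.train.Example()
+    example.ParseFromString(record)
+    tf.logging.info("input_ids: %s",
+                    example.features.feature["input_ids"].int64_list.value)
+    break  # only inspect the first instance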
+
+
+def create_int_feature(values):
+  feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+  return feature
+
+
+def create_float_feature(values):
+  feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+  return feature
+
+
+def create_training_instances(input_files, tokenizer, max_seq_length,
+                              dupe_factor, short_seq_prob, masked_lm_prob,
+                              max_predictions_per_seq, rng):
+  """Create `TrainingInstance`s from raw text."""
+  all_documents = [[]]
+
+  # Input file format:
+  # (1) One sentence per line. These should ideally be actual sentences, not
+  # entire paragraphs or arbitrary spans of text. (Because we use the
+  # sentence boundaries for the "next sentence prediction" task).
+  # (2) Blank lines between documents. Document boundaries are needed so
+  # that the "next sentence prediction" task doesn't span between documents.
+  for input_file in input_files:
+    with tf.gfile.GFile(input_file, "r") as reader:
+      while True:
+        strings = reader.readline()
+        strings = strings.replace("  ", " ").replace("  ", " ")  # collapse runs of two or three spaces into a single space
+        line = tokenization.convert_to_unicode(strings)
+        if not line:
+          break
+        line = line.strip()
+
+        # Empty lines are used as document delimiters
+        if not line:
+          all_documents.append([])
+        tokens = tokenizer.tokenize(line)
+        if tokens:
+          all_documents[-1].append(tokens)
+
+  # Remove empty documents
+  all_documents = [x for x in all_documents if x]
+  rng.shuffle(all_documents)
+
+  vocab_words = list(tokenizer.vocab.keys())
+  instances = []
+  for _ in range(dupe_factor):
+    for document_index in range(len(all_documents)):
+      instances.extend(
+          create_instances_from_document_albert(  # changed to ALBERT style for sentence-order prediction (SOP), 2019-08-28, brightmart
+              all_documents, document_index, max_seq_length, short_seq_prob,
+              masked_lm_prob, max_predictions_per_seq, vocab_words, rng))
+
+  rng.shuffle(instances)
+  return instances
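+
+
+# A minimal usage sketch of this pipeline, assuming the paths used in
+# create_pretrain_data.sh; the output file name is a placeholder:
+#
+#   rng = random.Random(FLAGS.random_seed)
+#   tokenizer = tokenization.FullTokenizer(
+#       vocab_file="albert_config/vocab.txt", do_lower_case=True)
+#   instances = create_training_instances(
+#       ["data/news_zh_1.txt"], tokenizer, max_seq_length=128,
+#       dupe_factor=10, short_seq_prob=0.1, masked_lm_prob=0.15,
+#       max_predictions_per_seq=20, rng=rng)
+#   write_instance_to_example_files(instances, tokenizer, 128, 20,
+#                                   ["data/train.tfrecord"])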
+
+def get_new_segment(segment):  # newly added method ####
+  """
+  Takes one tokenized sentence and returns a processed copy: to support Chinese whole word masking, the non-leading characters of each word are re-marked with "##" so that later processing stages can tell which characters belong to the same word.
+  :param segment: one sentence, e.g. ['悬', '灸', '技', '术', '培', '训', '专', '家', '教', '你', '艾', '灸', '降', '血', '糖', ',', '为', '爸', '妈', '收', '好', '了', '!']
+  :return: the processed sentence, e.g. ['悬', '##灸', '技', '术', '培', '训', '专', '##家', '教', '你', '艾', '##灸', '降', '##血', '##糖', ',', '为', '爸', '##妈', '收', '##好', '了', '!']
+  """
+  seq_cws = jieba.lcut("".join(segment))  # segment the sentence into words
+  seq_cws_dict = {x: 1 for x in seq_cws}  # put the segmented words into a lookup dict
+  new_segment = []
+  i = 0
+  while i < len(segment):  # walk the sentence from the first character until the whole sentence has been handled
+    if len(re.findall('[\u4E00-\u9FA5]', segment[i])) == 0:  # if the character is not Chinese, append it unchanged; no special handling needed
+      new_segment.append(segment[i])
+      i += 1
+      continue
+
+    has_add = False
+    for length in range(3, 0, -1):
+      if i + length > len(segment):
+        continue
+      if ''.join(segment[i:i + length]) in seq_cws_dict:
+        new_segment.append(segment[i])
+        for l in range(1, length):
+          new_segment.append('##' + segment[i + l])
+        i += length
+        has_add = True
+        break
+    if not has_add:
+      new_segment.append(segment[i])
+      i += 1
+  # print("get_new_segment.wwm.get_new_segment:",new_segment)
+  return new_segment
+
+def create_instances_from_document_albert(
+    all_documents, document_index, max_seq_length, short_seq_prob,
+    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
+  """Creates `TrainingInstance`s for a single document.
+  This method was changed to create sentence-order prediction (SOP) data, following the idea from the ALBERT paper, 2019-08-28, brightmart
+  """
+  document = all_documents[document_index]  # fetch one document
+
+  # Account for [CLS], [SEP], [SEP]
+  max_num_tokens = max_seq_length - 3
+
+  # We *usually* want to fill up the entire sequence since we are padding
+  # to `max_seq_length` anyways, so short sequences are generally wasted
+  # computation. However, we *sometimes*
+  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+  # sequences to minimize the mismatch between pre-training and fine-tuning.
+  # The `target_seq_length` is just a rough target however, whereas
+  # `max_seq_length` is a hard limit.
+  target_seq_length = max_num_tokens
+  if rng.random() < short_seq_prob:  # with some probability (e.g. 10%), use a shorter sequence length to ease the mismatch between long pre-training sequences and (possibly) short fine-tuning sequences
+    target_seq_length = rng.randint(2, max_num_tokens)
+
+  # We DON'T just concatenate all of the tokens from a document into a long
+  # sequence and choose an arbitrary split point because this would make the
+  # next sentence prediction task too easy. Instead, we split the input into
+  # segments "A" and "B" based on the actual "sentences" provided by the user
+  # input.
+  # Use real sentences rather than arbitrary truncation, so the sentence-coherence task is constructed properly.
+  instances = []
+  current_chunk = []  # the text span currently being processed; it holds several sentences
+  current_length = 0
+  i = 0
+  # print("###document:",document) # a document can be a whole article, a news story, an encyclopedia entry, etc.
document:[['是', '爷', '们', ',', '就', '得', '给', '媳', '妇', '幸', '福'], ['关', '注', '【', '晨', '曦', '教', '育', '】', ...], ...] (long sample output truncated; the document continues in the same style)
+  while i < len(document):  # walk through the document one sentence at a time
+    segment = document[i]  # `segment` is one complete sentence, already split into characters, e.g. segment=['我', '是', '一', '爷', '们', ...]
+    if not FLAGS.non_chinese:  # for Chinese input, rewrite the segment so that whole word masking works
+      segment = get_new_segment(segment)  # whole word mask for Chinese: use word segmentation to prefix the non-initial characters of a word with "##"
+
+    current_chunk.append(segment)  # add this sentence to the current chunk
+    current_length += len(segment)  # total number of tokens accumulated so far
+    if i == len(document) - 1 or current_length >= target_seq_length:
+      # Once the accumulated length reaches the target, or we hit the end of
+      # the document, build the A and B parts of "A[SEP]B".
+      if current_chunk:  # if the current chunk is not empty
+        # `a_end` is how many segments from `current_chunk` go into the `A`
+        # (first) sentence.
+        a_end = 1
+        if len(current_chunk) >= 2:  # if the chunk holds two or more sentences, take a prefix of it as the A part of "A[SEP]B"
+          a_end = rng.randint(1, len(current_chunk) - 1)
+        # Assign the selected first part of the chunk to A, i.e. tokens_a.
+        tokens_a = []
+        for j in range(a_end):
+          tokens_a.extend(current_chunk[j])
+
+        # Build the B part of "A[SEP]B". Here B is always the remainder of the
+        # current chunk; in the original BERT implementation, B is sometimes
+        # drawn at random from another document instead.
+        tokens_b = []
+        for j in range(a_end, len(current_chunk)):
+          tokens_b.extend(current_chunk[j])
+
+        # With 50% probability, swap tokens_a and tokens_b; a swapped
+        # (wrong-order) pair is labeled is_random_next=True.
+        # print("tokens_a length1:", len(tokens_a))
+        # print("tokens_b length1:", len(tokens_b))  # len(tokens_b) = 0
+
+        if len(tokens_a) == 0 or len(tokens_b) == 0: continue
+        if rng.random() < 0.5:  # swap tokens_a and tokens_b
+          is_random_next = True
+          tokens_a, tokens_b = tokens_b, tokens_a
+        else:
+          is_random_next = False
+
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        # Combine tokens_a and tokens_b in BERT style, i.e. as
+        # [CLS]tokens_a[SEP]tokens_b[SEP], to form the final tokens; also build
+        # segment_ids, which are 0 for the first part and 1 for the second part.
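+        # Illustrative sketch of the packed example this produces (hypothetical
+        # 8-token pair, not taken from the sample document above):
+        #   tokens      = ["[CLS]", "我", "是", "[SEP]", "孝", "顺", "。", "[SEP]"]
+        #   segment_ids = [0,       0,    0,    0,       1,    1,    1,    1]
+        # i.e. segment_ids is 0 for "[CLS] A [SEP]" and 1 for "B [SEP]".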
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        # Creates the predictions for the masked LM objective
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(  # build the training-instance object
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []  # empty the current chunk
+      current_length = 0  # reset the length of the current chunk
+    i += 1  # move on to the next sentence in the document
+
+  return instances
+
+
+def create_instances_from_document_original(  # the original BERT style for creating MLM and next-sentence-prediction data
+    all_documents, document_index, max_seq_length, short_seq_prob,
+    masked_lm_prob, max_predictions_per_seq, vocab_words, rng):
+  """Creates `TrainingInstance`s for a single document."""
+  document = all_documents[document_index]  # fetch one document
+
+  # Account for [CLS], [SEP], [SEP]
+  max_num_tokens = max_seq_length - 3
+
+  # We *usually* want to fill up the entire sequence since we are padding
+  # to `max_seq_length` anyways, so short sequences are generally wasted
+  # computation. However, we *sometimes*
+  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+  # sequences to minimize the mismatch between pre-training and fine-tuning.
+  # The `target_seq_length` is just a rough target however, whereas
+  # `max_seq_length` is a hard limit.
+  target_seq_length = max_num_tokens
+  if rng.random() < short_seq_prob:  # some fraction of the time (e.g. 10%), use a shorter sequence length to reduce the mismatch between long pre-training sequences and (possibly) short fine-tuning sequences
+    target_seq_length = rng.randint(2, max_num_tokens)
+
+  # We DON'T just concatenate all of the tokens from a document into a long
+  # sequence and choose an arbitrary split point because this would make the
+  # next sentence prediction task too easy. Instead, we split the input into
+  # segments "A" and "B" based on the actual "sentences" provided by the user
+  # input.
+  # That is, use real sentence boundaries rather than arbitrary truncation,
+  # which gives a better sentence-coherence prediction task.
+  instances = []
+  current_chunk = []  # the chunk of text currently being processed; holds several sentences
+  current_length = 0
+  i = 0
+  # print("###document:", document)  # a document can be a whole article, a news item, an encyclopedia entry, etc.
+  while i < len(document):  # walk through the document one sentence at a time
+    segment = document[i]  # `segment` is one complete sentence, already split into characters, e.g. segment=['我', '是', '一', '爷', '们', ...]
+    # print("###i:", i, ";segment:", segment)
+    current_chunk.append(segment)  # add this sentence to the current chunk
+    current_length += len(segment)  # total number of tokens accumulated so far
+    if i == len(document) - 1 or current_length >= target_seq_length:  # once the accumulated length reaches the target (or the document ends), build the A and B parts of "A[SEP]B"
+      if current_chunk:  # if the current chunk is not empty
+        # `a_end` is how many segments from `current_chunk` go into the `A`
+        # (first) sentence.
+        a_end = 1
+        if len(current_chunk) >= 2:  # if the chunk holds two or more sentences, take a prefix of it as the A part of "A[SEP]B"
+          a_end = rng.randint(1, len(current_chunk) - 1)
+        # Assign the selected first part of the chunk to A, i.e. tokens_a.
+        tokens_a = []
+        for j in range(a_end):
+          tokens_a.extend(current_chunk[j])
+
+        # Build the B part of "A[SEP]B" (in this original version, B is
+        # sometimes drawn at random from another document, and sometimes is
+        # the actual remainder of the current document).
+        tokens_b = []
+        # Random next
+        is_random_next = False
+        if len(current_chunk) == 1 or rng.random() < 0.5:  # 50% of the time, pick a random other document and use a suffix of it as B, i.e. tokens_b
+          is_random_next = True
+          target_b_length = target_seq_length - len(tokens_a)
+
+          # This should rarely go for more than one iteration for large
+          # corpora. However, just to be careful, we try to make sure that
+          # the random document is not the same as the document
+          # we're processing.
+          random_document_index = 0
+          for _ in range(10):  # randomly pick the index of a document different from the current one
+            random_document_index = rng.randint(0, len(all_documents) - 1)
+            if random_document_index != document_index:
+              break
+
+          random_document = all_documents[random_document_index]  # the randomly chosen document
+          random_start = rng.randint(0, len(random_document) - 1)  # pick a starting sentence within it
+          for j in range(random_start, len(random_document)):  # from that starting position onwards, fill B, i.e. tokens_b
+            tokens_b.extend(random_document[j])
+            if len(tokens_b) >= target_b_length:
+              break
+          # We didn't actually use these segments so we "put them back" so
+          # they don't go to waste (a small trick to avoid wasting text).
+          num_unused_segments = len(current_chunk) - a_end  # e.g. 550 - 200 = 350
+          i -= num_unused_segments  # i = i - num_unused_segments, e.g. i = 400, num_unused_segments = 350, so i = 400 - 350 = 50
+        # Actual next
+        else:  # the other 50% of the time, fill tokens_b (the B of "A[SEP]B") with the rest of the current chunk
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        # Combine tokens_a and tokens_b in BERT style, i.e. as
+        # [CLS]tokens_a[SEP]tokens_b[SEP], to form the final tokens; also build
+        # segment_ids, which are 0 for the first part and 1 for the second part.
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        # Creates the predictions for the masked LM objective
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(  # build the training-instance object
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []  # empty the current chunk
+      current_length = 0  # reset the length of the current chunk
+    i += 1  # move on to the next sentence in the document
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  if not FLAGS.non_chinese:  # for Chinese input, strip the "##" markers that were added earlier for whole word masking
+    output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens]  # remove "##"
+  else:  # English and other non-Chinese languages
+    output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
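+    # For example (illustrative numbers): with num_to_predict == 20 and 19
+    # predictions already chosen, a 3-wordpiece candidate word gives
+    # 19 + 3 > 20, so the whole word is skipped rather than partially masked.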
+    if len(masked_lms) + len(index_set) > num_to_predict:
+      continue
+    is_any_index_covered = False
+    for index in index_set:
+      if index in covered_indexes:
+        is_any_index_covered = True
+        break
+    if is_any_index_covered:
+      continue
+    for index in index_set:
+      covered_indexes.add(index)
+
+      masked_token = None
+      # 80% of the time, replace with [MASK]
+      if rng.random() < 0.8:
+        masked_token = "[MASK]"
+      else:
+        # 10% of the time, keep original
+        if rng.random() < 0.5:
+          if not FLAGS.non_chinese:  # for Chinese input, strip the "##" marker that was added earlier for whole word masking
+            masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0 else tokens[index]  # remove "##"
+          else:
+            masked_token = tokens[index]
+        # 10% of the time, replace with random word
+        else:
+          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+
+      output_tokens[index] = masked_token
+
+      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+  assert len(masked_lms) <= num_to_predict
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+
+  masked_lm_positions = []
+  masked_lm_labels = []
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+
+  # tf.logging.info('%s' % (tokens))
+  # tf.logging.info('%s' % (output_tokens))
+  return (output_tokens, masked_lm_positions, masked_lm_labels)
+
+
+def create_masked_lm_predictions_original(tokens, masked_lm_prob,
+                                          max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() \ No newline at end of file diff --git a/baselines/models/albert/modeling.py b/baselines/models/albert/modeling.py new file mode 100644 index 0000000..eee43d5 --- /dev/null +++ b/baselines/models/albert/modeling.py @@ -0,0 +1,1264 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf +import bert_utils + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). 
+ + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids, but use stype of factorized embedding parameterization from albert. add by brightmart, 2019-09-28 + (self.embedding_output, self.embedding_table,self.embedding_table_2) = embedding_lookup_factorized( + input_ids=input_ids, + vocab_size=config.vocab_size, + hidden_size=config.hidden_size, + embedding_size=config.embedding_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. 
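+        # Note: `ln_type` is expected to come from the albert_config json; per
+        # the branches below, "postln" (or a missing entry) selects the
+        # original post-LN blocks, while the xlarge/xxlarge configs select the
+        # pre-LN blocks.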
+        # `sequence_output` shape = [batch_size, seq_length, hidden_size].
+        ln_type = getattr(config, "ln_type", None)  # fall back to post-LN when the config has no `ln_type` entry
+        print("ln_type:", ln_type)
+        if ln_type == 'postln' or ln_type is None:  # currently, albert base and large use the post-LN structure
+          print("old transformer structure: transformer_model, which uses post-LN")
+          self.all_encoder_layers = transformer_model(
+              input_tensor=self.embedding_output,
+              attention_mask=attention_mask,
+              hidden_size=config.hidden_size,
+              num_hidden_layers=config.num_hidden_layers,
+              num_attention_heads=config.num_attention_heads,
+              intermediate_size=config.intermediate_size,
+              intermediate_act_fn=get_activation(config.hidden_act),
+              hidden_dropout_prob=config.hidden_dropout_prob,
+              attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+              initializer_range=config.initializer_range,
+              do_return_all_layers=True)
+        else:  # albert xlarge and xxlarge use the pre-LN structure
+          print("new transformer structure: prelln_transformer_model, which uses pre-LN")
+          self.all_encoder_layers = prelln_transformer_model(  # changed by brightmart, Oct 4th, 2019. Pre-layer normalization converges faster and better; see the paper "On Layer Normalization in the Transformer Architecture".
+              input_tensor=self.embedding_output,
+              attention_mask=attention_mask,
+              hidden_size=config.hidden_size,
+              num_hidden_layers=config.num_hidden_layers,
+              num_attention_heads=config.num_attention_heads,
+              intermediate_size=config.intermediate_size,
+              intermediate_act_fn=get_activation(config.hidden_act),
+              hidden_dropout_prob=config.hidden_dropout_prob,
+              attention_probs_dropout_prob=config.attention_probs_dropout_prob,
+              initializer_range=config.initializer_range,
+              do_return_all_layers=True,
+              shared_type='all')
+
+      self.sequence_output = self.all_encoder_layers[-1]  # [batch_size, seq_length, hidden_size]
+      # The "pooler" converts the encoded sequence tensor of shape
+      # [batch_size, seq_length, hidden_size] to a tensor of shape
+      # [batch_size, hidden_size]. This is necessary for segment-level
+      # (or segment-pair-level) classification tasks where we need a fixed
+      # dimensional representation of the segment.
+      with tf.variable_scope("pooler"):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token. We assume that this has been pre-trained.
+        first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1)
+        self.pooled_output = tf.layers.dense(
+            first_token_tensor,
+            config.hidden_size,
+            activation=tf.tanh,
+            kernel_initializer=create_initializer(config.initializer_range))
+
+  def get_pooled_output(self):
+    return self.pooled_output
+
+  def get_sequence_output(self):
+    """Gets final hidden layer of encoder.
+
+    Returns:
+      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+      to the final hidden layer of the transformer encoder.
+    """
+    return self.sequence_output
+
+  def get_all_encoder_layers(self):
+    return self.all_encoder_layers
+
+  def get_embedding_output(self):
+    """Gets output of the embedding lookup (i.e., input to the transformer).
+
+    Returns:
+      float Tensor of shape [batch_size, seq_length, hidden_size] corresponding
+      to the output of the embedding layer, after summing the word
+      embeddings with the positional embeddings and the token type embeddings,
+      then performing layer normalization. This is the input to the transformer.
+ """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + def get_embedding_table_2(self): + return self.embedding_table_2 + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output = tf.gather(embedding_table, flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + + output = tf.reshape(output,input_shape[0:-1] + [input_shape[-1] * embedding_size]) # output=[batch_size,sequence_length,embedding_size] + return (output, embedding_table) + +def embedding_lookup_factorized(input_ids, # Factorized embedding parameterization provide by albert + vocab_size, + hidden_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor, but in a factorized style followed by albert. it is used to reduce much percentage of parameters previous exists. + Check "Factorized embedding parameterization" session in the paper. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. 
+ embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + + # 1.first project one-hot vectors into a lower dimensional embedding space of size E + print("embedding_lookup_factorized. factorized embedding parameterization is used.") + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) # shape of input_ids is:[ batch_size, seq_length, 1] + + embedding_table = tf.get_variable( # [vocab_size, embedding_size] + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) # one rank. shape as (batch_size * sequence_length,) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids,depth=vocab_size) # one_hot_input_ids=[batch_size * sequence_length,vocab_size] + output_middle = tf.matmul(one_hot_input_ids, embedding_table) # output=[batch_size * sequence_length,embedding_size] + else: + output_middle = tf.gather(embedding_table,flat_input_ids) # [vocab_size, embedding_size]*[batch_size * sequence_length,]--->[batch_size * sequence_length,embedding_size] + + # 2. project vector(output_middle) to the hidden space + project_variable = tf.get_variable( # [embedding_size, hidden_size] + name=word_embedding_name+"_2", + shape=[embedding_size, hidden_size], + initializer=create_initializer(initializer_range)) + output = tf.matmul(output_middle, project_variable) # ([batch_size * sequence_length, embedding_size] * [embedding_size, hidden_size])--->[batch_size * sequence_length, hidden_size] + # reshape back to 3 rank + input_shape = get_shape_list(input_ids) # input_shape=[ batch_size, seq_length, 1] + batch_size, sequene_length, _=input_shape + output = tf.reshape(output, (batch_size,sequene_length,hidden_size)) # output=[batch_size, sequence_length, hidden_size] + return (output, embedding_table, project_variable) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. + initializer_range: float. 
Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
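+
+  Example (illustrative): with batch_size=1, from_seq_length=3 and
+  to_mask=[[1, 1, 0]], the result is
+  [[[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]] -- every query position may
+  attend to the first two key positions only.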
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
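+    # Illustrative effect: an attendable position (mask == 1) gets adder == 0.0
+    # and keeps its score; a masked position (mask == 0) gets adder == -10000.0,
+    # which drives its post-softmax probability to ~0.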
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + share_parameter_across_layers=True): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
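+
+  Note: when `share_parameter_across_layers` is True (the default here), one
+  set of block weights is created under the variable scope "layer_shared" and
+  reused for every layer -- ALBERT-style cross-layer parameter sharing --
+  instead of `num_hidden_layers` separate sets.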
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + if share_parameter_across_layers: + name_variable_scope="layer_shared" + else: + name_variable_scope="layer_%d" % layer_idx + # share all parameters across layers. add by brightmart, 2019-09-28. previous it is like this: "layer_%d" % layer_idx + with tf.variable_scope(name_variable_scope, reuse=True if (share_parameter_across_layers and layer_idx>0) else False): + + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. 
+ with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. + + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. 
+ """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) + +def prelln_transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False, + shared_type='all', # None, + adapter_fn=None): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. + """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + + input_shape = bert_utils.get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. 
+  prev_output = bert_utils.reshape_to_matrix(input_tensor)
+
+  def layer_scope(idx, shared_type):
+    if shared_type == 'all':
+      tmp = {
+          "layer": "layer_shared",
+          'attention': 'attention',
+          'intermediate': 'intermediate',
+          'output': 'output'
+      }
+    elif shared_type == 'attention':
+      tmp = {
+          "layer": "layer_shared",
+          'attention': 'attention',
+          'intermediate': 'intermediate_{}'.format(idx),
+          'output': 'output_{}'.format(idx)
+      }
+    elif shared_type == 'ffn':
+      tmp = {
+          "layer": "layer_shared",
+          'attention': 'attention_{}'.format(idx),
+          'intermediate': 'intermediate',
+          'output': 'output'
+      }
+    else:
+      tmp = {
+          "layer": "layer_{}".format(idx),
+          'attention': 'attention',
+          'intermediate': 'intermediate',
+          'output': 'output'
+      }
+
+    return tmp
+
+  all_layer_outputs = []
+
+  for layer_idx in range(num_hidden_layers):
+
+    idx_scope = layer_scope(layer_idx, shared_type)
+
+    with tf.variable_scope(idx_scope['layer'], reuse=tf.AUTO_REUSE):
+      layer_input = prev_output
+
+      with tf.variable_scope(idx_scope['attention'], reuse=tf.AUTO_REUSE):
+        attention_heads = []
+
+        with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
+          layer_input_pre = layer_norm(layer_input)
+
+        with tf.variable_scope("self"):
+          attention_head = attention_layer(
+              from_tensor=layer_input_pre,
+              to_tensor=layer_input_pre,
+              attention_mask=attention_mask,
+              num_attention_heads=num_attention_heads,
+              size_per_head=attention_head_size,
+              attention_probs_dropout_prob=attention_probs_dropout_prob,
+              initializer_range=initializer_range,
+              do_return_2d_tensor=True,
+              batch_size=batch_size,
+              from_seq_length=seq_length,
+              to_seq_length=seq_length)
+          attention_heads.append(attention_head)
+
+        attention_output = None
+        if len(attention_heads) == 1:
+          attention_output = attention_heads[0]
+        else:
+          # In the case where we have other sequences, we just concatenate
+          # them to the self-attention head before the projection.
+          attention_output = tf.concat(attention_heads, axis=-1)
+
+        # Run a linear projection of `hidden_size` then add a residual
+        # with `layer_input`.
+        with tf.variable_scope("output", reuse=tf.AUTO_REUSE):
+          attention_output = tf.layers.dense(
+              attention_output,
+              hidden_size,
+              kernel_initializer=create_initializer(initializer_range))
+          attention_output = dropout(attention_output, hidden_dropout_prob)
+
+          # Post-LN variant (disabled):
+          # attention_output = layer_norm(attention_output + layer_input)
+          attention_output = attention_output + layer_input
+
+      with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE):
+        attention_output_pre = layer_norm(attention_output)
+
+      # The activation is only applied to the "intermediate" hidden layer.
+      with tf.variable_scope(idx_scope['intermediate'], reuse=tf.AUTO_REUSE):
+        intermediate_output = tf.layers.dense(
+            attention_output_pre,
+            intermediate_size,
+            activation=intermediate_act_fn,
+            kernel_initializer=create_initializer(initializer_range))
+
+      # Down-project back to `hidden_size` then add the residual.
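+      # Pre-LN ordering: layer_norm was already applied to the sub-layer
+      # inputs (layer_input_pre / attention_output_pre above), so the residual
+      # sums stay unnormalized, unlike the Post-LN transformer_model.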
+ with tf.variable_scope(idx_scope['output'], reuse=tf.AUTO_REUSE): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + + # layer_output = layer_norm(layer_output + attention_output) + layer_output = layer_output + attention_output + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = bert_utils.reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = bert_utils.reshape_from_matrix(prev_output, input_shape) + return final_output diff --git a/baselines/models/albert/optimization.py b/baselines/models/albert/optimization.py new file mode 100755 index 0000000..8978f5b --- /dev/null +++ b/baselines/models/albert/optimization.py @@ -0,0 +1,300 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) 
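+  # This pre-training path wires in LAMB; the companion
+  # optimization_finetuning.py builds AdamWeightDecayOptimizer here instead.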
+  optimizer = LAMBOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, neither `LAMBOptimizer` nor `AdamWeightDecayOptimizer` does
+  # this. But if you use a different optimizer, you should probably take
+  # this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
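+      # In equations (note: no bias correction in this implementation):
+      #   update = next_m / (sqrt(next_v) + epsilon) + weight_decay_rate * param
+      #   param  = param - learning_rate * update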
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param
+
+      update_with_lr = self.learning_rate * update
+
+      next_param = param - update_with_lr
+
+      assignments.extend(
+          [param.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
+
+
+class LAMBOptimizer(tf.train.Optimizer):
+  """LAMB optimizer.
+
+  https://github.com/ymcui/LAMB_Optimizer_TF
+
+  # IMPORTANT NOTE
+  - This is NOT an official implementation.
+  - The LAMB optimizer changed from arXiv v1 to v3; this implements the v3
+    version (the latest version as of June 2019).
+  - The implementation is based on `AdamWeightDecayOptimizer` in BERT
+    (provided by Google).
+
+  # References
+  - Large Batch Optimization for Deep Learning: Training BERT in 76 minutes.
+    https://arxiv.org/abs/1904.00962v3
+  - BERT: Pre-training of Deep Bidirectional Transformers for Language
+    Understanding. https://arxiv.org/abs/1810.04805
+
+  # Parameters
+  - There is nothing special; the parameters are the same as
+    `AdamWeightDecayOptimizer`.
+  """
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.01,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="LAMBOptimizer"):
+    """Constructs a LAMBOptimizer."""
+    super(LAMBOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/lamb_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/lamb_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
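+      # LAMB then rescales the step by a layer-wise trust ratio
+      #   r = ||param||_2 / ||update||_2  (guarded to 1.0 on zero norms),
+      # so the effective step becomes param -= learning_rate * r * update.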
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param
+
+      ############## BELOW ARE THE SPECIFIC PARTS FOR LAMB ##############
+
+      # Note: Here are two choices for the scaling function \phi(z):
+      #   minmax:   \phi(z) = min(max(z, \gamma_l), \gamma_u)
+      #   identity: \phi(z) = z
+      # The authors do not mention what \gamma_l and \gamma_u are.
+      # UPDATE: after asking the authors, they provided the code below:
+      # ratio = array_ops.where(math_ops.greater(w_norm, 0), array_ops.where(
+      #     math_ops.greater(g_norm, 0), (w_norm / g_norm), 1.0), 1.0)
+
+      r1 = tf.sqrt(tf.reduce_sum(tf.square(param)))
+      r2 = tf.sqrt(tf.reduce_sum(tf.square(update)))
+
+      r = tf.where(tf.greater(r1, 0.0),
+                   tf.where(tf.greater(r2, 0.0),
+                            r1 / r2,
+                            1.0),
+                   1.0)
+
+      eta = self.learning_rate * r
+
+      update_with_lr = eta * update
+
+      next_param = param - update_with_lr
+
+      assignments.extend(
+          [param.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
\ No newline at end of file
diff --git a/baselines/models/albert/optimization_finetuning.py b/baselines/models/albert/optimization_finetuning.py
new file mode 100755
index 0000000..bfd0ad3
--- /dev/null
+++ b/baselines/models/albert/optimization_finetuning.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
+  """Creates an optimizer training op."""
+  global_step = tf.train.get_or_create_global_step()
+
+  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
+
+  # Implements linear decay of the learning rate.
+  learning_rate = tf.train.polynomial_decay(
+      learning_rate,
+      global_step,
+      num_train_steps,
+      end_learning_rate=0.0,
+      power=1.0,
+      cycle=False)
+
+  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+  # learning rate will be `global_step/num_warmup_steps * init_lr`.
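+  # For example, with init_lr = 2e-5 and num_warmup_steps = 1000, step 100
+  # runs at 0.1 * 2e-5 = 2e-6; the full 2e-5 is reached at step 1000, after
+  # which the polynomial decay above takes over.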
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,  # 0.98 is only used for pre-training; keep 0.999 at fine-tuning.
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param
+
+      update_with_lr = self.learning_rate * update
+
+      next_param = param - update_with_lr
+
+      assignments.extend(
+          [param.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
diff --git a/baselines/models/albert/resources/add_data_removing_dropout.jpg b/baselines/models/albert/resources/add_data_removing_dropout.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..21016290679e079cceeae6d6879c47225691df4c
GIT binary patch
literal 98747
(binary JPEG data omitted)
z!teDv#f2VgW)|Rv)dBbbkmj=9rr>Zi+fC;=Q&={jmNAvK{y1|}IX&5@Gw4P6vv9#c z;R-Z4PWfgj-Dh&s-3U;zB{VSCr=}scH#NW;r`GW;;x_eBlLBHHoZ;>Dr*KD;`@SOG z+ZuIT=5HtaWn~;6zV!@Ac9uVjq@V7R@Fmb`f<2g6({N9Yk-kuLbHvrIGhmKX3rjQ5 zUyV$U1IJ}!Y9w`rx)h%NApT||5rM3JNxVzEsz0+@HqW6UDx+|!`z7>MRoJT&+)XK3 zbkTw?+?i#$@Sv=$!9?+(+g>gr9H&!W}FBXhWatLOg#Oj z)xX~}bNv(LS5LzOhdLi#meydrsbvq^aTO52Y^YwhfQy<)Zj6|5df#Lg0UAkIWBGb6 zEn;jHu%2&i46yq)G-O>`y?%z?*kKdJMySA`vNw}%DYf18%;Y;Pc6IT7Rg+{>t0Pm$ z?DjRt3nL#0)Vg`Oy3Quxo}#pq=tml7jNR+&Lx8jl0+6!@8se0Up#nPSy z^BJ0BNv{=t`W`Bgv1t6mT6cppc zcF{jrNyFMQH1ovT`Nhg!GLz*#PN{|zm8doRC_U)WIhBX1cKQJ_Vgk5DPu&>#qgR?ppBiP9HbaRz%YmuAd$b0_&5G#Pln1uc1tj6$AcaTDu4*g+LP7b8OJ)d8mP+BF))9_IPg{fT=Npn>P?gP|;ON-E7)xcAd zY))2g$)kD_*U#GyE?HMsTUL&pox-Tn%~Rjp2?XI>rJP$hC)Nh%jMOSZ(#e?5ZmDx- zNMwu|>yxoDFGSa0yv6}2E(i*%)AqF!P8>eivPO6&ZmD8}wQK8}YXyHe(=;YCqc9D% zf$HD4LO!(?^PkD3gQbPRPg!H$7e1YyQ(`X=G*DH&-o3tQbO<Vl8&rmKp)uiCWyJD@_I?4*$NIfyUz05q)Xazvpt`q=rClo2>Q*SR zl|(K+0mal9#)pkM2#W9n6>rT&KcjqrN&{CcpkPb%r5$rTDbyA}ofx9&usV^!G5%_r zDbX{^yfWmABT&e!TkD@4x>LuS+leFjR6n*jPpRT#N*2B%=nw|Uze>lH-Y%FQquQ;JSMjCtm1d) znd`#1IRNdowR5rk9*x&^R$EqOGREyT*bEEijV?tVV>oga_icjI9c<|md;gdm!AGwd zO&c4)sY{Ijs00Y-Us{}_{T0r#ew4jZ9(&q3@-;rhmN^Nf3)G_b+Ru;;z&0}VX!L}; zv@`~C9d6FQfBz7dU6JYQ!L|vIH;i-U;L0W3 z{{p23y6cVP6xWVETWA7oc&yy%`_X-U>+1w&s};VtQ8$1NWtFwXB5$?Q*tf)86i9I_ zoZp18&L9&+L|ugMqIH1IwA@V7ujJtWAYP7$3EkB84y0Z?UQxp*xQT%a2XeI7_pG8V z*bBtM=RZtXk&A|9)Fi9#ct1yaxJRL)^rQjBpK1HW>4})Qi~Wj~!=9oVqwA&1@w*D& znVd2&bmvJyx=e;vKS49mMaH~I=$m};sH>eiM%`;^(@Vj@JKWbdbpgLI&)G3$_}chI z$4dNYM)8aUGEbUnrVGoqkSqsCdD}Hw&G~1sDUyrkdqld%$Y4Mw z8|9K%v^ko3E7z?Zi=-W;r`l8-J%#)6q1~TC)~m3UU_e$)NQL%CZBMRLaU^18G&otWlCs zeSRK#MKo{?Tef(1e(n>=pc+SYf`_~ULN|#N!&v6&l(d`gs!!!n>9yf~%#EWbhT z3ep^<3jmB#u_G@_7fo(jb&}};vRyFIKZn;>&wArF{NiPo3zQOlPK%b(G_c<@*o{kM zDnH3+Oan*~sMziKok>Gk+p%=5VJ)jKZ5HlUbrSa^zL1E$Y8V@XsT!@8)gD6r%nVZh z8`k5nio<_2(I3*a=U>h`vtXzbjPUu=dg1Pf3t@2_4bddB{t@&X48v)7Qi z*@opa!p=j$ia`8Tr5xZ!|JP~*Ps|XHB+l+^feY_3H!q-j12a z2G9|NkMxHb2(T%yWjBYFaae`mVIx8^)5p^QV&4p>V;_6hd)O%=jQM(v@YV#>!<-Z+ z<&zZVGhYB8bS>%Vt?%%es+)9gT~3jEv7e+WVlm1uRT#B==PLW3Z)treicXU@v5#lh z0ib{R7a1KKt-mf8{<&tsvd3IpC>+&YX8KD0D}7FWLv0g+R909&x0@YfPF2in36Sf- zOQT%4jY%9AU4IPpUO*q-K(n-U6n!EyRsh|VR8xO-rCnh_Ujntggec1)>HOmf)VEwg zk27<%fgY+0sBQ#ERq9A!Xkl|EY<|x7Zf7oMC^Mj-dL>9@Wh2wt0;0Ff+IV>Q*LM8d zHvP~1rvC>jSswzRR8?#ao)IR^1W>Pks1)vd`jc1d(q2!gg3CBKzKtEVkwy?W1sNX? zsr5FhXnq#!)1y&%hogN0WRPU@XdmK)TbnjZp(gph6^7+JI}Tx>U8nszkz7>?Ph?}U zabjKZOx%>Bme+rg!VbEl>aP_)AxHF*TLN#>V6(u8BW0@h>rKpCZOoGbE;9>kisX17 zw}6G#!>$D3&P_^|BW5zy8MGMmjd^5JB&K9R<#Q!c3*iFHHLj?0i(t`UX)01U?IS-W z^hWVt!4X9h`tuQN*V>D4>Q{ zT=(YQYbGs>1By1uscfP=x6YltN&czT7@fu2ICR--dACYymxMO@LeI2)bk(Q6thA-u zil?W6>QsESHFsq~5NRLk_IjQ><*k&g#^xMCDFpZCWrmeAV3VX6tfJjH!{N9&cOH%d zqMbe!?6+(k4hPi!MMU7YjwiHzM3Wwj>nT&!7%qb+fmcd@v+cHi&1#T0BkmrILkKN? z4_Jax$KC?sPRf6M?sPmxh6V_U>`le1#&{7}X~J*1l>+E$9%{q`A7>qW$J5=i`{A6r z>;$N^R^X0Szuk>g03d09e>tWLtUn_Z0AWfUy+0+!uiO0|nu>ZjkkyCGS~MnZ;FYQv zLJs=^l!&I%S1&I=CIzU&;ex7ct z5zYoYJcjjH>F3$gE&}WCm3B z@4MzSX(N(AZ_fMD4#Srd*+6wzsK;FJP{qZI-PBIJvZt&n_=dfgR`^{lD2@ra*!05M z(K`~ik`HdUYKEYh0}J$zi_Nd$RJZ~?4x$`k8F^3g`@GZ@w;(q*YR=uAZD}Pw9{?hl zfFi=5K2ta;a>5qaLIaWK?FXd>Ui#l{1Kv-VDSa-9|8z&f5v(!d#Ezu4<}}KEp`dlD zj%$c#9_Oji{G-#Iu_Vg_+hXY6sF(Hi_OBR60xq8i=__-RFl$0R$*GvqKx0_9;{Q*3 zZyi--x9$xuIs_2}q(SLYx*O@PMI&8{PC;QIC?%bOba!_nDBX)LkxrFT^7$su-tXRL z?{m)kedpch{pTC!564(C#=X|OXI}HVexYPE9Wi26-0?wylb;FC>1=Ww7`W(M_2OJk zrmaV@#v#^i&kP;<$C`ppsM5{}k!Ceyv^$!JXX2FM?_PoxI>hr7OHQlq>(^7B4u=;j z)amX?3x-R2GN^A$-(n}AvHM+uESk*~calTn$W=-{?&qi#mF9jQ}b!H}VcTtD9OFUXnVwN>G?U~!kt5Ae#6zqY@mU%xxhVEj! 
zXWhTQUxQl8jFp6bD8!pr+n$)dAuL|}JND*l8PCUUhF*+A>R43uwMFk0PQBtd_|OLL zavc&E^KGK+gDG1dQa)y+fm~UBlwN`H zPma5UjpE56nSMDheNe&H+^2SQkew-WG&qu9d&%B$J2?z*>HG$aX(M!@C-M`|*HU}~ ze09RVz2+Q2bRNMqjO+h)W&S(gQ(hHGECDiL4SLQ7Q&izI7kR1Az=Qq@Idxpuq}gZi z^4vkU?5K%3q{*_E0rlmlx*U`havFycl!?%!;~Dfto|2I97?z^e)O9t5!(=W+2vxu- z-4%9k%a3)ERI3kd?W%GHkbV3~89Y)PJSo9KUA(o{FoJ_cn@v+yQOt^O0S|zih_=wX zP+{fW%wO-^ZyA~sQN8-*p{SX@6Af3CU!v@6tRMJow)I4B zRM`U9iI)Al$$l5-Am_&E<&fu_<|5LdRMlmE(Kl49DQfT|pjfwi>Ggr{ed`~PmEoo9 zr8mFT(GhNPEFob*%tu06d!z+X3&cs%?=(_3pJePCw{RYDE>_arUta~71oh@dE*~kG zr|=LF#F@#;%*r&f#2dNB7roQKW2kb`;d+d9L}i&XR&V8T$E%mn724qVVyc9(S~;Kr zd&I%@(BE$dr68wy74HRaFRnB5kld75;>14&8ODF9N&a(?A(@4_naU))n??gpF&yuP zc)tF8S!W4Z*!!rTRx@uK+lt(>sDje4Cx3E%AtsFv8RZiC79ccO$-Z2M+h9k8m(BZ; z{-B{nXldxS)^>l-R|gM|F_YqC%@7pO1Ft0@4Q!m#($Q=(d z&W8|bd_Tx_tmHx#wuGvMR#*j2Xg-nq;39uoXzK`j`f(G&#YN#ej{Z$PuHyF`hiG?y z;VfwuPgv0qj{b4;SRqFE!Op*qH2l|;7q&zcoRk&3(k%qUrJ*a=LwRKrnJezy+7K|G%=>CBAvt#`SL;;Z-^4pZk z!cPsk=w;Mgd&@H8O6bWcFYLKK+1&ET(6D=^SyvWTy=VM^rV>TOJHpK52%zjABB#NZ zQ2JDl&wXI~XQ{zot7p@Ge-Z(CkfL)I42N9ZDw;9pNa(!9g3Es=n9n>AR=8nK#e3v! z(E-SH3g2G*{uOfD0#}5j8q+^zzaH4|ig%L4fVj?S{`TL}KNuYJCUmg8gjRu~j@j+@ zX&Ln$BeAaYyE=xF-ekA8YcUeXNKK{k54EQ6hP-1p#oy~?YuGfI`eXaBIFOq2jD+t$4rBAu96AVk45QCL=;XwreN6@eleh6z; zetYpCJ=@n3Xjo=ff;*WjkD$)8!1mhx@c5qN(1MIfRcbF;zZOIXJ#GJ6?-QUH>!3oe zVDr{1;@zk&Rbt}WjGe;UJ|csZ=MFR~>01hZQDDfP zsx852Z5d+tR+Jt)Im z)oHh^-Tkt~Oe%nCx=wS6@PBe~s7bV$-i4f5B5OXOELTzzB!REL`|a`*Uvq3I>v4b! z#_v=WWIK1!ya>JLbDD6L$w~MICEiu%Obrk1`=OrOiCgP{XEKiNqNkOwo8P*UFh&nu z+vm4^!gW}Xm=Bv3mh8u%Y-c|A#w{@(rWXjO4G4<3$}|YV?4Xs2fFN6-0P5Pa-$A#M z(PuBVb$M2Q{|LF6(7u)g3I9Ls&{LhSUrOKZWZj#7;{@*0vu-I^$B%`jP6y7!>RWnP zB#u~#5_*fvfw%b!Mazp!g+rms?rUkZPXPpfxR;dMf1$)?}x zC(6(A%EnD>uB$~(jVJnlb&C-K>-B1`6a&{9eA(6Q%Zfy zuVY4V)7Yy9ec+OZ5)WXU-BwPuhUS7g!5l7Xvonq58(iHKA*D==KZ)K{NTPM5O5uH| z+BojpgE{>xbxWLd^O=Qjh`Ph7LY9@=#1Z=CUTa~|7cWC9vuPXrWE||-RLfz6oqH!S z!q)Ybhi0t}wp)})G?xH)^eB0=GBNVQIwhx3MWOx7ubg!ii)||XM;W5_KrnJay72YV zTIh>>xop1k<~UWWXD2r05ScvVZf`fsma&Bv++|Rxw$xQKYG`$cRl;3Damepl?`-#o zQ=(w*um;C>R~E*f+P~Tn_(sAvrAn#LDZMtxS`-%&AdMQ!7Sp{iRH9={Z)r>**e7S-CGc%0^@osNIV3Z3IbF^FD%g)Fsw3Y0RbU!)IgRP~xuA1%(Ix8Rhx>tL zotM@P#nbG`jDYfI&iZn_-q4&Y+bh-x_V0d8wd7c?aB#w<+jq(%G5U&+Cg6c?W~DFW zy=>g5-n~W3X2E#xm9cgQU3T22y4mYiC;TaQq;Xv>35xoV>yRV=%r#>bw;AeV>gG%M zJ5Ny6(4ZS{t+U-5ZkG@VPXJK^#B`5(*C4VzW>fnj^0!C%UiW0JySJ;qFY#Qbq#@!z z4AY?Zvu_iN7U%ElsC`%sQCy)ziq{i@IyoIZ85#@Qc z&ts>lI0H%6^BreA7wpdaXSKehP!0m@6zTV|+ees#1H z`+7>$N;m%b4Qt7f0I9$TM9HWD^!8&zz=kG5%(9ok`=L|&fZJJIhHII!5u~PC(_py6 z>fd}Ne%fvVt0={s>okC@>R=;RN_Lg_M)u8@Z6r6{(LO6??w9yac{`g5aXUK$Z{!8z z&iOJ6X+$KL(;w&A&)YWjo+Z7cvmrUZoVA;5*3&)6pfSB>>)8~40ZfJTIzu#TB9^;d zCebz%13-b@?`ucBVE~63>Z@>LiIREc7r+o>0o9t4hbIl4z8Mgi7Q(F))jwzpd!c?y z5;RZo?(LpMP=xzSg347uM{6yjZA9_$$qy1Z=J{>?Y#?TTnW!%(Z<{OAaKu-p?}i%1 z^3!;xXb`dg;6`%a{Y71U5~oNU<^}~gtRB_7J>IS}BOLUyAg@xAMbm@|w zmXG+ZFguXRQS-bm&bpWCIo&>8(%yuPf#!#6kYO3fkSZOJ%g~&O!^);5zx0^)G_Q5K z%*jclc>7A}f5D)BT+Z2il9y>Bm!v zs9ml6W&of%z1-jMwu7th9aY{bQ95sPkaClq>aL~KsNc`29TwJ|7Y6m=Fot_UnICCx z6T{qd_S7X#lsH(j;`sU1fG)$2&TBuV+wC)=Mcp*p^V+3l|INUI9*Q_F?n9I4zh$}p zmXV>w?TR)4WP&kNO*^5Swm~UYOKP3-QmV(TfqQyw!S@L%)>m5XR!G*LiSb3L!a|jN zJliGnWfn07#X4>(!iNfa$(tC@O8I{=+u5J+n+1p}fXun4GoFC9E6sE zelf=hdRp-(7PNb|$fIuxMQB!3IY!0Q{X}5}6|Z42OGA|PAB58b=%OIswT$+Yl4f66 zx%37F%1KJxD>qnJGIVNkB~@$byU@kGjkV`IL5+D8@Tsv)_Gp$P5bh-_m+!&fFs@ce zXv$Y&rB(pESBN8h9NisjQ1KJ42F24IbRVw0b$6!=I(pZAz2ETf1NPV`!c0X2DK5;j zEIht2;Ps^iX~2n|tzfmkJ<*zU(&kyQ=cjc+xqy0>so0g5Y2uhG`(xzWz04&InN5YL zxcxTLldxE`x(L1Wy51Azn*3om=5EKI)EnQ3lJ*Lr+n`wx>HL_Yka3RJ(Uy%|k#@W) 
z3;XhEEjyyqs-@eUt9(k_V0S-YJll?!V^2(bfT6`O-EDGV|Db{-#zlUI2_^Fd% z(~sE~00h$Tw>tWN&LaZS0#Qu>1YuYeV06^rF!Xqfb6~8J=%bY{O_N$fVQO&~Zf5ch ztC}?ZnH`yPXYl@hYJBg^Pue~q9BxZ$Ur6&e=I5!eok<5ai z!!7dbE9=a)U~20GUtWlNzv4crNsh(!{v>PQl;g-u&8XO(u%Ni~XOG#-k@~keq$u3@ z@ll@w*zBP*zxc|M+3@8aM_ZWqLSIR5ed;r#m#&bo-xZPLR)V4Q!o9BlfV^{9=tcz? zj4Cv2bu>!k^98-g7MduTjrHH{`#BXT(qWzT5~^XXj0>%o|ESIMe^9&p7j?Q=>WHfi z+u~#e6C25DfhhtHvxo{#re`oNn?@NYo5F+yavmsj7%#AISTlO-h)S(z4wG3WhU*n3 zKWt^pHT?r3sS8RBLAi!E`4t+={5P9g0rpDmEcd!fS(~)4O7bA$ic{bV{1R?54{l-W zB+Fl1cPF!|ryfrY98>l?mGt6Ymnw!sSf6amHa0b1X|{gZE|!j%X(vRA77MfEP$q^- zr*{Exj*1CPo@(`=26;F78}O%@E@46;rX;{rCD8YRyL(~mY8l!Qd)1`mqE-uT{18Vn zNKi-DS&4;tKFdssQv3e+GnWd%jFnFMModup@I2$KrqXwHMO*8fn?Z1=%VscH*fMIS09V{+CBzY{qy(iBASagIAWtyKw zn$qV*$uTam9a0Gw)G}CjFW)PFFerco zWU0{%KzVDhb&}mF|KxaL;ATVMj<7}bPu&4sRjYrcWvzW-vzj?P4GMBk7G)b6$@Md2 z4$AyBetQ(o+i~din=ra(gsY+axFdPf?0`Iw9W22c!2f)-zmg^WQfz3K(H%@!Q#>zc zVM3wXnJGsKhWcPp`$frL`s$S4mB5cDQf}t9kME@R@9GI?xc@BXQ?dtK5dUo+RU3)@ zuN4s@r-QF;67l!b0tINUw9&sPJRC?}n-{|GqI(s?3iR`%m~w+T38n2TA=rfs>7;2( zU57X!#YUz#lI7S;$l-I@DNau*^F6Oxs{qGb=H{)?TPfx+2t|a>9uqRtn*7=Wq(l^F zu|!!>;e(C$3I=qH!Y^t3xFtVsVncVrJK z5pI5&mkWE_>vu=KsbFz)NgU+KE~-^|8 znqp=d8V?6s8SBELj2&|KBfuW<_>tfH3}u5gg2HnP@btM3JU%*#$;Kh~R&_5)-bz68 z|6VowU)3`tNwCEa`*ATZdM>ujZQBXF=r^yUjeH^zTckq?`IadI(QL1Yy5uA*8*CmM zbj_?pujyPPodV2E#x3zrEnr=dO-a|*rWrjqsySVf6;Gre4Y70gJTGL!5~T6qtq1kh zmgzL(ia+zxwM}pJDWLU0 zmwt3`<%{^!W!H?nGjW4#_v{6uWxwJfWL0@eTt^KFd<7iMUb9$9DbDcW+rH?*EB@45 zRrlKHA%Rg=^M6>vaep6A6lQ?r<;Xl62Fh~NWrZM7@d-Z5eZ3F;GmhZeo^;$yGnO@A zmJ^OSy9}4kmtiaAU93B3M=?eEjRaVl$fg#QrQBCf1vY%xEp1ULj-y++;?qL7B|5N`rSSJ5x!XE$1-=&bgP6B59-;?D3 z|NqQ@?!E$W+5c*)_CI_6zr)(D($&Hw$O`S0rE{{r!g73)lR zraf+QW=UZ)L-hc9s2jZe*oV@Gi*H?mBssPHCcM|J~6 zmgMLZ`;b21_zN%+BKgaDN?90>%-ImVFW8G;Sy4g_eIP4zUsbZtUnk)zs3k-? z#s^g$r6lq2z)PO=jkU!)4Mb|IQC}$3cI93#sa`Y>Qi-89))v}(Q7_Bw6M(_jPI@%4 zDr+nW)e6>{L%7-}#VwFmeT4{O6oJU*d1E5#tY&1#D<|wfITWMqh{G~cg4dgurqIAD zE1b5V{w%bzNI8v+N!zTzXS{i1qz9pJTLEF2mj_d6yC{*yE#mL3XsmngYBhTv!f%?mA~z zx#w~Lv{7;+(a9&JLXJDcRi8JNeo2f@MUMthEqs)-qBxO->fav^iGqL1ZE2n^rzAI5 zj&PAeiM>qipFSFHY}mFa(n{m{n(+#I&$Q)at}^+Fsm;?oY)FkD>8_HFO=~zhrrW45 zOGM^iNnfcU&$kuzAI&>-BqI=e4S9KuI2SaVbVWY2tSAL-PfH4a8`3)i?0847dz*KSHwkSJ!ROL!v^n)n^APjVzp3yWsHi{eh zrQ^f+qolIr^F5ZOKDx(EI9(Q}fJ4YCcJ!-1N!T{P&PsrE2cx(&k|i{vG?}D-anv8U z^m@H8iqedJ2XSOQ@z+MGq@O8ud>N}iHbn6jw;!`_&lkF>8@N6;*1sG5JwgZUKP9vL zE(T0?QCu#NjraqWy3{n6Lw@`Lg6b}76zV9f&p^&s9@CUmRdPdz+4-Bo>(J@^7&jgC zJBeP>%Ysr*^TE;wJ!$BS;#uXr^f8=AtRQ)pn>ipsN@$Cg>q#Oa2{BKIt`^ZHt`&$z?fLPS?2|URuwyW3Q7L|9-LKhGFmLpE zY++oA!KH%1NF#e<$Cj-P~b1n4Z1wyVn5;E;$RF*9}7tL{c|MM#iROIwQ^tiP?M z5G9cNVkTds7QtSb^0}Fgfy*_j-rDY4W5f_Qn#R&NY8}*YFFf{Q1SdzVwIRY{mUL`a z*R&~ki<1q9R4pH(CS$EsIxol3xZFJO1#(*t_#9SFS7J7huQd??fQD5oBZK;L>3OY` zH>(FP>g@Purbbd9JYj9iZvjtrgcrjObGUk_Q|c^F!p-eDQa;y}Ovw9Ud_SdqQUEfK z(F*Wo9wasS&SdpB1qo<9rbq_$qk!rpl6iBvvjfSZI4w&zxm02`pLopnayjbJo;@zr zkNP}~8hM4$7j}z?Uw1;`Tq-7?>Ge>)dkK6xxjZevweMekFGE)?&`)6W~{&l{Z7Z)jiEw)73TrZS4>;PSB-)KUDPw#O|xv}t2V=5DTp;<@MZ$|U<|KX zfyPK<6yKpi{ z6spF8elB{*mQDxSP#Tg_xtR=5M0QN zFl)rz3;h}!Wtj0PK&XWUlko)6(5DQ4ZC;8i#ptEv;nU#q8PX-LDFd<9&^~m|YGsSv z5c1nJ3k1X`LLJ!0#QKQAz0Y!*xdH_XCOxmPGH?MCw>PG?6)ilj^Qj0B=c zrMW#QwkpSw6w;1aS#>+&2IuPJm8keF-d&~!$+fcE5vowUP1`(AZ&&k$SbZt?#J%>i zrpfN#NZX*!_aGXuZ-aUDUcrT?KAP0L3twA82^uoC~$7dWa*%`sV(3qvfM|ZwrAyJ?%IyOpBmP zT_vLzp`ea^NnLmb)ZFKfj8T?YDr6-z)wVzD=e=kVOQ=JFZXV@Io$pn?i6e=D-)AKl zGHC1cio$%v!sRx1n>e9xTiy}e^ali##QG4c0Ip;Jnfs{KR!e?#D6e`D*-}^Tl-(Ap zg48emp@TG*LUstF3i!#+A94oQp!|u211mi+p{F%rrR)7=8n?>+c z2$kTh@6Tzmp9%YH@%4GQMSpb-k{~10)>0#Cv)2E>617z%R#Z%#Ha2mEr9Xl#~#O=y1c! 
z)hCKe;ZxC(eO(_^Pr2IXRlrzsARK4=5=va|v$eVy8I^tCMHlsC&bxee8lSMXhXbXU z$rjmIXwPs6&SyL9fQ9PEa-&|2pg|C&Lb{l{N7P*1am)NIN8XJ4#se9Drm|vBTlW4d zWw+<#*;ebPBYHs^wavngOTrccl(xu+L?4Muo!MvSx~$^IqZQ+hZiVSrxf)i#BgINQ zDk`Zqkz}uG#ckl?PWo0-P;EvR5qg2iPkbmrmH@R%Du=;oEmU2e zBDE0ahc<5+{dnp-D-OP%!BH*?*Me+WNna8zDL|tlW|Y0XM~ZoCdqO4Fe!w#o8;KVB zNmHmea_O2D#0>&P4F+McuzSuH8FH#XxLGk=N-CndiztnkX_a9`kU`^GUf`MwC}1gu z=i3z-npi$}8Jbo@^(S0xSDYLQTg4c6vqiokis!4f!?E9XCnj$IG095Vygl)+9TVi2 zdw3dmrW*hCJK%rORBTh^AMaJVqTd4fq}q>RSyTNm68YP&S=hf8Uh8sy_GEj9DzXlI zWekd7x*B7J0`HV6o^*$3zhNBVm!bh5T>}C9`X8ycX0lWgjk@9DkTdS zm-)Tjc>K-6_GxHtyk65&?nYPAvk}LgW7&n*EF=Vv3jOP&b06@&%6hf^2Sh3hSH)>m z_&sCHn-0zA_v=2`Rcr!ac-$U_EEz(!So2`>eIB)1TE7nMJ|wnbi6 zM>u6Q1U7s&1;De zq;+`zE98u*2)pJ8gJza9@40yW)BLyoJFb1oTLJtPY~@aaAK%{BDzNW|;M|{9{H{@7 zV`UiJ?f-$aG%FY>gk~HWu-uSR{^=%GvHu*G&*;*;=(lBbb<+`5fRNDoX@73K!Cf!Y z+063J>C)%L-qgE)2jV4?by!c)z33Fh) z5fVNZuO7K%IA0Yfd%D2{z~;I7m-#)0dIKyj5?z<6<0WKixb7Bo3Pbp@AQH$d__%YB z``}YVo(=sE)mIrTo##$j3#qH`q?!?(%JKrRpVt|tw22wHFAa|gQE zc0p`W6^Q1~N7q5oSD)&5#x5{=ZVzo@ul>-`=(XV(VS5HFNW3MmzRT7jK(rSn@UP?K$<4Hc%wb~n& zsB{mW(%tf>9oiPNRi_nk#;T*dgZ&kKM?R53KRLrg9aG<$ zGgyl)nI3Obc3w_P8?;3E3HGuWyNSJi-R`IM;!~;pEa|jaLz6&&Y3$tgOa-j*0oMK%OxPwgz{dp6=DdboO^O`EA#R%&h>YyDZwk(Cr z2d`(ALu~ET%lPnFei>yJmr@zk@=`(tbPN~=w004Fpg%dbSr(n`h$Ag zSUG^uRQ4tB;9j0w`C|X|v!StdV_^kybcwDQF5nAnLPo+5@r~YB@_V+x+wb@yLF&Hf zt_Y7LEV}5nZp50IotLDZh*UY^1`H%HO7TH&#ufQU>7>LX-s<-9nv1nufk3{uNK%sv z*WF9~)AWUe0QH= z634~gOi7TekeNHAsM+w^8W_%-3=|DL6&dA#kMXMYFy7z-G`(P)*#414V&c+CY!A92#T zZjC+HP&9W0^~s3EzE;Z9!#r&($PYaWk*H<6g@1Y#?tf@7U_>AYc@w&W{iXH)7Rb(| zAB4_ub5+qW@1B-*;!>XyQv;u-!it=Rd?yl@m*VLj{IPQ9q)!%*-wF_o?OVd_4dEGQ zlO_+?=^3vI0yx`vdgp2~-i&BX6XLopy@@!y^@M@0*aBUl0NKh6kuu{zU!=V=ss9{853Hf|tNQ3G$6AUad0opIby#rgslT9_e|nabxp z>vL8;dtVlNQ|@>ey=}~nm@r&yt92_8q^d*8%oY+$A$hUZFK#qrosOyEwE4ga-Tn88 zre1kqaVl#i-N1m1Ro8isbwk;DU6_2mX>VHfsDjv5z|(0_sjrLr16*f1>-)Ma!G$PB zb3+7f^6Q|$pAkf2h|j#4NB`k+2L8Vm9b-sk&M|IU9&7ejDlQg00+ok6oTbRx8*df@ zgal0=@mV7)h8S7;v6NRgPOTX-%?<9)%c_B7=g%Tf@*b)L>v~gFNdIA~Q#5a$fm_EJ zM%+e*`*#UK4+ci{7E}wmG+%dxVtBv*;w{ScZYteGq~hj>cerufeZIyzZ+(~RDnDJf z0UzZVq3=>DUu0RIjIugD;db64rHpbxSKDA&KAcT-Wc?)RA?3QOPlgK67S=xbu3Jg; zLsYH{0j%?_)DQX<6PE5v?oTbWEUX_EhP6o@2b0SSFm@8n29j8*N@iiFhFrfKxm^Qu z03Im#Ud8OC+c%;*+myhXYPyF}xDXl8R^Pj}?rRwa+8xNhwJgSs zHiFXM^m4(}70ZvAHCh|dTV8#vZ^Un=g;i;S6i%X9xuNkF_6oSyhNIvP%#9?8BF@Y&ZZb2^-)T(B z-!3#?!_!TE0?VK<_%)_F+98X;IcPzuZ0c{=NDkY8Pu2&^PpZD}c+>L!bAfhii-r5Ui$O()cL&q-(J*LRYvXhWF3a(PEf` zR`(Vxj&Tg(Y*q6q2KQ^!*bD7ck>oFkAwt_*3j)0cEb-VP50dLcp_lMB*NgDz=LmxS z_IGRZ!Cx~{j+l@?AEg&9FVjz!$PPqBW@NteSQjAO*FUj`?uULgV~hM$l6LDq?Z=OX zG-^n(J`+Hx-%>taS8IcTv&wR#Vh4Tnp>d%pf+D|aY&!b4IufjC7kWP6L{Fj-)V=ZR zP>;+L0rjnAQ~6*iFjl>Lq3d1mhvv3IUr%LVT!^NMcM9m7O`Z7Q>U}D%wr@<8`0^Rs zr1Eu7@MZYcwQSGfPdo|nbcomcUviU4A0qMrpvK`H|v~*~ClU4OcB^ zFU(DBu%Fh00_`jHr}~PDul1n=+(>V_j$rZQG;y%)wl5ECPDG1Pa%MU=$e&GC^9Vny zGkA28AH7A3IY&L%P+x=o@u?q6a&YL?J&-&g%|hQJ6aIh`CQ?JV4ND6O--$nZHm=DN z8m>PF5l$Vg?YMG8bSMmS25|3v2@nj36YJQ2c;TS8R-NX$3;YU>g}Y(JgI^vlb2L*Y zvbR3FrhC2}x#6EUwFkw5J6VVbY^r5vVL6mLI+`IPAmyXJfZCGSw{!V^LD^$*H38)v zWA*8!u9*ZyVYv9T^cnBC9=tC>{0MuyO#3i+YI=G8CyeIHShmgWSL! zK}Lf(aI$>RO=B&Jj;ltWUpCUA7rTfEP@(hkifQQ(a?C zd%Q-Hkxv>x@@3c;ymuI3TX-LVrLcfk;5)4%w)vDd%;abAW`6{~WY9tYdAX`(YeTF? 
z0!p`8#Idkvd889oVf7RIxI8{lSG|R^UM^$93kPzuji{hlW6IA5E1#?||J73g%`N5fE)H3)wta$h8cA6jRf>;AU2usZ8+d5ic1SRmrNpYeI$g zroMIhwleZIj=%7+9B`{7%PwBxEl-Glwt|APXMdxPI9%8pl-K%>-v=BN$=DQAeFZl2 zhB(gziy5Y9QqP`+J)WxC0q??4imHASb=X!D68Mtjw7j+mpi0nku_(@HV{#F(5+se2 zH@QZ^(-+Ta)>m5MkVr&=jFM^zlTsrL!XpY~{c~FCV(B6j-xAT%jAPIj##@vRScm*_ zQlk!*5jK`_R9xct`U5VMG5FA@Ov%4Yid>GK-yE|B;FYYb+gD4`b6VVAQ*&HpS4_`Su0NEV|fp=q9V7SNb z0SS@hn>FPbdgfAt6!Ga|jKxZxKOpxqmz2d9h@AD3Muxq-1<)>*;BSGTdb%}Fyh^h7 zkI0@q=F}&#g|bHOqhcvVn(U(MdX}vx7L-KkuD{n&XBHe5MA{q=qY{t?8LuPycHpOE z)3co$q>O9V`&9bGi@pAqefEv+N1O&ihr;+7ePKhN{sL6dHzqv z%YEnKTl3t842GZg21y)gzGhr&XVB5qw*@5*fCzbXdVuNPHxqR68yy7OgxGi)mRXEw zX)-GOlgcGfGji_r#-*&0D5SD5ydkof>xDqJNirG>DZqvB|S1b4CBn7S@#>l^L^ls=7dXp{1iAi z({-M)DB1mHF}6sEyDH&-;UE0JsA@2hEMJwJPdCH zc(vB4!)LDNZt{T+-nt?P;7X09lG5Z zyv0qpN2z9^1%x#;Y+D@Ew%GJ(3lwON%hfr%K+cFo&^*eLb1*yM3>_erxs#O|bmq!w z)Q&;~N@IEmF=^jP6&I8?KYeE+sHUH>oI zGu%!*kz$*}aeXILc?8^0$FjrnM^-JUls9l8fZGILU;gy2W!2VqEz7EDY#=v9lPlPw zJ!U8eP|_MZve#o9!npL-12%(&`JF%@d4nu+`5W&8r9No#k(jQZuLud$z%fsxMB_9;BFLS5<549cT8mc18f8vJ{~q zf#ad@;LAEtup!HJ&YLqB*y^Douck}$yMs%Q#LG`@!nyQ@_=~L>FQ#{L^peeOz?f{O z#sP$CrhHX%^5|VpchmCHEm$S|WP3(}oeo(i0{%!j9*f+Vjq{;>KZ0re1w?_jDRqZsj~kr1+37M@-Qz*|C+l7v*j26n?VMA( zi7!0RyUwn^FY)WwSveUK@I^B zR1TxGE1O+SFsD`m{xYYC+<@p7T7G+-i82$Bn#TTFiCxZa@w3z?r2Byi)QNYZ+b6qT z{1348)>oe(SyKT1KYW9zJH%CmX(MEa*DD$ znaP`9-)qjSPG};rBd!Q`4~^T=2G;7vgxg(f+KcI=V7}Dl z3M)edx6K7&mYBHl<&{)&pJa3Og>{7nfLzx!26E#_CaO7(T)OwI>8wwB{LVYEh$!_9 zb^v?LB#fS+deI4bpZz z{Yb7aGH21_MHe5P51{FPZg(|4HM08cN__Biy|_^<^9S-IS%=M?CO)Ow>S*MSLZcIn zi`i0sq}3NTgvkb0iR;c89EJ!NxFQhc(yFhjs|BU9m!INSi?2tFz}aDu)BlcKKud zX7heJ*st;_U<9Z}Uwz}$w(L&T(e6cZec4mvd)dRUyb--&Kg=PA?BlxeteNp5H5Kd< z|NF!LF7D82uBmT@7|}R*i1w|kKTWp9Tq5n-Aq}IB6#y#)2?8;NI$X-U8}9I^zV55Q zA_a0>$Z!A6jzYK&_A=tRFCz7VC@U2pDuCCPhaWg@inr*8-gqnsrE| zKaNs;(W#}?rplp%d-Ckdsq9%XK0WZuV%kZDUD2fqQKY^z-f|pHi9ms5`*295cj1D` z`XN?RR7iF~L5js+La*5hwe`EBohWW}L10*eM#A^-LTyX1q&Ro?XXl65L}&F8;N%H# z{9aF&Lt-P`N30p%7vy!=hT2qM(fR9?f_ECrpIcI9OcJU3wH|Gs`fYS!?=en4c2rP- zSR(k@eCNUigD)0u_=5K&1+Om=;`3zil&uRJfA>~*8<4b1CxlF^mk@eTwSf&HXTd5v zX7Ej}^?r5u_VFDP2a`R!L9?o2=~zwEKM$eHN8D&hp7<SZN93Rgc%O5e{DEe;KHrsu<@KVZ^sSy@VfA#><6Yu0nCWMiHH1+YXKNW z|GUo^fBb7&8vk8L(*OCM|9Zvu|7)M!u_M8CfH+){3CZ?(!2=<|-CY6%cXxM(APMe}Kthn<5Ii`6;1+@e5AGTW4#CcB za&PXv-7{~x=S{!)=Joe4zB={WwO3WGRk~{JDqz#FCE%XCw45{mfdD`P_y@q2flMiH z8%qFCR0Nm+06+#1AUFUNL=f-~fRF$Pzc2vELrDL^su0FMY2W}L+y;RElSUT|?;M!( zUFttWXi^w}1U}(_{>3}E-_%IL@1Vai96$ID5L1(olLJFFGgk`>M>lIHci0}l`xFeF zzz1h4jz_Ejueh8d{9SpV`-SO#!F%K6G2h~W5AfSTD6hX?h(Cj^U>VCvNEkg;RhE`} zBK1pTtfwZ<&h~Kk0Kmb~-BndaoJw0qmkMbW4+?(+I8`acJ-%`M!_z^Zu)lAD^lnK^*C1_0o>&757`0RZ7U7*FHn?hIl? 
zkV^p16=V>^w0GF*56pXqP5;1meB!yPt4e@0#$e;2o0zy-0RZM5UsRrER$xAuF(77j zG`DdCaWja8&8$t#L3|D3hYk)-AjV<@F{9}}^0E2{HZd{%D^C*>tH1CcvVbYUf+cO- zob63~em(qG{y5lsfc5pu(SX0`)~>RuV2F2DZ+1=!cNiVSeU7F|${+@N7-ZZAl;}^q zVeKKS0pdIQ3o|!~yZ%W40C4E$CXxyuCId0GqrK8yJ>JC&TiQt9bwIH0;nZ#1<iUoHz_qR9xMya+r$+_f6zpE*lCD?_%6>(YYVyGc{cf4t80PrAfIp} zo;I3FAjSmooSU869S?Wq?)X|u+{OQ*arIEW;~8voC}8O-eJ3MG1EqCv1JSQ?q5Rf1 za(DC~56~y>*6Ok#2Gc?f%}jprMg(G83pedM9)9s@ZXqcJVo)wAEi>C(hkejF4@3aT*U}pp4!8(BloZX+^)ep!UJf5wI{9T_0 zd4OjEo&XYnBtQjtfZh~v0&D@Bq3`wI!KXjoNCPH-D_{j!04#sT{7#|u>y0D$s|NT2 zdw?TI;Ym6FwO7MG=ViR z#^$f*-=&*@yxN2Q?%n?>jtc8{9HJDWB%&;$G~xqz5_n;F0r)52uPEr*;d$Zte)IJ= zJ;DaUSA-RW34|qtZ$>sDuD>aM=LaYPOCXPb)QSaIWB-s2o*vAV3tk*v2+SYM8=e53 z1Ym*Z1WOWw7Xi!W0%QNltH)mw|6P}V);kI-3LFX{iYN->zvlpx36txuT<&U$3i!J& z{@}~>uiBFMtCoJZ=kN0Weq#fegH#Ito)Q`Xt%mkO+oAo?W@tS?1#N{kK)*m+?(m=G zOaE#ctG`N9{nc`Ipj0-0(flsY1dO>W^R9K>SW8&9S+D=h1#q|Ub_e?-AmQZf<7#7N z?M@{I&ej%Ga*k$?9#XNhvhe}H-Q0gi0|3VazvgrZ!RNo@LUI8>NEil#HUAx_unqv# z2LOON|92cCI1VvK0if@tnTM7(5&VVZj1cC&?fZ#%iA(Rk02s4Bo!V3|GNI{-JR3SPLV~7>R5#j;yhdhTwLy{nA zkZedXqzcjqX@~SfzCyl1mLc1aBgiEj6b>B@4~`s;9_}$5FPs>h0-QRW0h|?_3!E?9 z3%EGA*Kj#-rEqm{?QjEd6L3p#yKtva0Ez}BfZm6)KzX4OP-Unt)Dr3neFlw!ra*I` z<e~W;D zK!(7Kz>gq@pp9UK;Dr!|kb+QvP>awFj-7RcGeks00z?Kx9&qgGAlf4OA;utPAXXr@ zA$~<%MLb19LLx?DMiN3&MlwNiM+!rFgH(#tiZqI}j&y;HhD?dfjx3F=i|mLTf}Dc< z9{CgUDDo!q70NvnIuw2sWfXH1AC!2MJd`GsVU%^0D^zS$22>$bbyQo_VAR*B<*41L zbEwB?XlT@Ed}u0YHfTX;uhA;e`p}lqexl=`GoeeM>!W+1$DtRZx1oPSKgPhopvMr! z(8X}ah{Gtx=)_pSxWL54e1s{7X^t6)`4+Pda};wQ3k8b~OAN~p%NHvJs~T$<>-#;F zd-V4t?wQY^C=?obh855Z*nIBm$*%z`?atd-8a%b{4rY!kJ41&;$4h5TmrB=5cTUehuSFkDUr)cqK*S)& z;LA|LFvp0&D8lH%n8P^!0Pz9u1G@(q4@Q{an7Ej%ncgxDGs7`+Gutp{Fn@iB@R0AJ z^q^qf0 zpnIw(rT0p2U7t%oOn=gV!NA*~-;l)6-mt|8%gDs2+8DuD%lN(VwTZGxj>)O1oN2o0 zftiF^s@bl&sQF9tZ3`ibB#TW;Al!)MMFG*z>LDg_ovRwKul6qxTmdCZ9;3Od_Q5m+0P)) zOrLcG(geN;Tn~~7DhfsowhJB%VGT(NIeD)2yy*q`i=Y>)p)#Q*VVGgAVN>A(;aL&z z5jGJ+ksOh4B5$M2qdrHoMyEty$C$-@j%AB|9Se)Ij2nvQj?YX$OmIw?OcY5hO2SU^ zOOs;!L^)l+^$t$B*{VAL&S*fV09;r*OpS*5(L;EJ-&2^e}+QeJ&w-xDR>5=K@ z8Ri+InWCBHS>##KS(oo@-c4uAWY^`;=cME!=6dFCf3tS z=G^wZ-Ku@H!>D7nQ@eA#OQmb5TcNwJN4lr8SFE?SPpGfCpTEEHGwznJWuz4{)uhrO4+&$wUnL-fa&1MP$LL)SytQOq&%ap4K)Nyq8a z(}gpKv#ax{3*w8SpS(YNFSRZ=ue`32u3z6gxM{joxSfMJz+lcMt|q^90VwzjX9@1F zj|u?*Lk9qGM?n2S?~lFNFN$CL>^~8>P5FiX2>*fq*uVXH#|;4GAh=V*CE5U>67=+- z#{m7E3a%~+z;ST=<-wRc=Vr;l{tH0RT|w340tP!K2LPld0Jw35!LBl3u$w$k*Z&3p zZ7zT83Gb-y_Mq=L@6afLchFty^Fx&TU$cLY!|0o`w6 z2pkk10TBrq1r-e>sJ#ckL7-4LcqjtG-9{b~0KNy{u@P|Yvxy<%s+b^AyWp`0CuSql zh*!1atA0J8w&tHUwg-1k2B_+Rnm6H1UO zJ-vPXpGU{WCnl$+zs)SKtgfwZY;JAu93CB?oSvOu{JgxA3j#p@5bH0={+(Rdpj>e9 z@KAW9JGmfmUZ96!!z0{hL&OnNK{9c{rDhLC#uHD>u4+f2;ZQxmH+B7rNc<1 z>R%29W&!~f9(-JUT3lROO@3B>&Hu*^HV^J7-C;8TIuta)=k*ebSIrh8a%9W~Xuk9C8rmC!c4RDnM`oP1`J zh{DjU3dy=!g8@tgXGm2iLe9634wJl1mP2?Yk3OT}v(($hLAHFe)(wi?Q8sY6SNZm$ z4n*!{Pn>*Q#DBt0-ze))Jww~q2LrrefUbm0iMNx^gvYGvdKxZ8UBxrRzeK5FB!zvRl#lQNQT6^^FBXDECe ztFNP5rm``lbe^Zj)IonscQ99b9`^;p^AdA&j5f@crsun9Ky>#4(?qiXk+*$fuIF^J zz?x%^kJ$6ev+~JN&CJOMS)>{Gas1{nQ|bO1Fd%`h^~N9T22%2s^T$u5Sr{z@MNh`T=UAEXT49uCi(VKtxe6DmXhP%y2PxuF6-zTMF}dD+Q5JrO%i=4 zVp_mzrkC91RT(9v+p@xNPT-cE#w~eZU$N8eN!BHCwg>5!O#a2u=0cXD#hu-RLHf=1D_Q;KRq0ZCF!b!7OxZmG2!N@lRbs5ZL^}zci%c3EAoNw zbJ*%;e3pE@KspfkN;t$yN6IB~rLoMT;Y87KTu*Sxhn+UllugiVznJXsv{+DP70n1U z_JiIqb0#6OCc8@lPvY|n%LMWO{0)w48QFzz@2Zkyt!B_0nLftxFj+ivXzX>g%bW05 zAx|kL2scA%YDrCa8^RQ0^{`hcm=^$hG_X@t&L*hERbEdKUN#eLm5#+EnfyhXAtqpg zCR4bUFw)G z`Jv1w1ZAw)<#7j<8UZ<{M><}d%%RxF@fpKV-qpzo@NcSbM5{-gutZ36u_pXBkiq_)PaU|QvPncY> zR#ml7mfp_GokR=h*ry4&5aRxf5bITbN#4ojYlNdeH>cpRyi4QnF!YV~5lsy1?Y&-s 
z_;TL!?BL0-`j)|~$XwCiI7|9ShRz;t&XTv06bQe0uN&V*E}}e6Q|agPl7MvW zl`d;hV8jE`_%-1g$QGU!Ql9s>Q$?>U5(N~8=MOw=Fi8P(0&zqL#YVZFchZ%H&!YGT zy$CAPO#7stCet3s+;uXI-GC>aCPipztnG}>7Z#`#v-s8;4zWx6>aN~PKt&4gm)A5b z5iWFav>c;~^L(nn+$b(e*?E+pJcVDZqIg>j1Lj3OC&T-+hF?3BdM7RKeSb)8#Nx#v z_W+e+EC)CFq~Zy2FV{*IO5o}W47izw0dKYHGyoNi#}y7IyK;}qXYExUZ3s{FNv)LW z9zO|iQW}#?DQPJG8PfN;83&IU5<|g}$(x7dA!TgCf5N|!d{bC(-;&D_LhkPcEuEC+XqOO!FWIR8dE^e&TJJ1oCzKA@68|RMF5!&xSX1E8|fw0`^7_ z+J~Oi`tE9kD~@62mzd5mh3sZbGh9gXrvVc=4a`3X>@>MsXI!oKS$OHvG;k1vz5%?K zR=ZKN&%0(kCgYkWJ`6lm?CD#5v>g6ae3R%@(C=8393Ov!&0oI0Daxz*Lz-4RBJWg+ z#im;w1hwRp>)BMV^PKg;4y8q4jlR5b*t|B*ceXxLM8d@6I7Vn|PUC}SvEpe1J-wYe zIIgBdQ#qPc-!QYUAKr_d?56En136KQzddy5T5jdWsFVHO#}84m>nv5HYS+T=-nDNu zWniAGkF#hV-6Ai^sTi`x8C3fG{PYxw;>A?fwKfch7kGTW_!ITS;a+gA&M^#lNCN{# zq|l55FtxgFnJ2C}2WTf%T(3?FZF(5YZ4>C#`cjWtIp9fckp8N@Q9iqbE&>v z6Q}R1(f5Ket0?IJQSf+{{Q9+W>8{-JRdDh4rzdJ%TK=w)==O8KDq3#?ea$m+GH+G5 zz!rwvFl*6E^8Rf@e}BFA@p!WrzAJ+)li~_Ewkt+)V0?GOli?L4bc15{|m}!%hgKe*9ae_|)%#5o`s3^sADL3EE zNQW8)x>jq8hS;|IIY1Fowj~2VvT1VoeQFZTE#IcvKptYF!SO+m;_cb>U1}AZ37JePetmN$5%bAP=%ZEm7#4(k!EbT40Q}d zt(O9P9J5wioYck_WoyUOhT}aC#*80zJ|9{%WX=tc>0t~_RS50-{)kT`E!fa!8HFak zcWOf#6(!x(1qQGn`0i(y+FW|1r&fOnHdB3euof5Om&T8zCeW$7KG}%TX{dLTqTuu7 z$(#8;AhUC_$+lyRzW(;h?MxJ0g?FS2HNAccBR|$VXp9yMVV52s7a!+Yb%S`6XZhUc zQRq3`BR})CdRxJu6Iu>^hIN#9aKf*3Q9ADPFavPY$wl3hyj-NpcvEVdt{dv|n`lBo$ZwdzBPU(ie#& zUw1gLjn6+bfBAk({cb!gqx_rDb2g}3olEB1U=1q2m)|AJyyb^>hb0>@{G=zcw~Q8ZFbk z$Won1QYb@H;+VaE-#kSHJ0b2PM6I`xx?vb~A)s{yV{auQ-cc8;Y7f>^1&PdZX{ zhVz#|clZ)hJ`CtW%Me=4T$RswxMu(G19@rS67TLDoU{#OTU|rjiDk1!VQbFkQHW@7 za81nSQMoiFfi1~PHA8EDm&Upm3FK=>UtV}jf0Pr66A@;FHsmz0FsvYEmB?0#*|?k2 zX@tD#HMMocWo^j$Tw|>KlynOt=UY}5gVza?P*uFVxgpa;1!wQ%$sVVi?Z>DeuOG&g zjEADOnIXk&g3Dl2P9C<+IwHk4$Tv#ulBJCBfsDT5SKE5X65^2rfNJ3ZA)r zSkgJo>(_oPafRe59}8PzTyMS+cRG9b17;Krg6e7Eu@%Pk{Bc?N142nKV8F}iA@5ALz1V>)9r6DT$>-^6kP5f1w{(E)|+qyCle&ez<~o_ zdA<@PARn6-YhY#x$r2C~O!sIzE7$A%#8PvJ&R(j*sl|%kj4mCxM7u=cy-6fw7hFFj zDCV-7Vi(PREJjK_QyRhj0N8X9klRuQcW3cbLgi6?S(?$xqpdZk9#@PS1KIZ@7;-|I zaRN%-d^)tOHM6@Zq>oz}3(?6}p;DVqfW{X*Yj}rw7PV=7$erPlTV8AbVF1_V1MfhC z$@sm-9wB9l>!}A=T_t`+~GKe#-Pijb?A}3g*DTHUDojfJz z5>=bCo3xWW_ib*LtMn`sKH>{$3VaOqxUsX}2qNN}l|iMeZkBfjy7`r{+70t99{?J3 zR#vfL`4iN`2ObIrG%G^u>ysw1>mj+OdWf9cmW7-{v=(sqqV}QD}Z(kY8EI$E%OKeKA>(?Bd~7Fgv#>vo=U; zlEErXbh>mT9G>sO<}X!y+e=c`{c=YEuLG_>bFrD~v2aE}N%@$EQ~lOg+YNdOQ>UY` zO{aI;X~4If<^@nmP%|2^IOCg0xEEQ^-W;@4(4N3Ho;1Od6}NW#O_5Zm(vR?EP|V=R zl&i)iXJ2!xG~k4vnL~4;^hA$v{PF9EvC6e1UqDFFHP!I(guOkk_Q%nvZ;eRKFNyK> z_EQv_t#(h}VO~b9pNQb=?Cui`Ev%~-JF&|iq~Bv}z~LD5_(P-rt=azvAp|!5UuYvC zfdPGG0ibcA_x1&I>fxv8o7`o)0P*{8>{m4}v!Pz=?bpm&y@cmj`O10P-{$5NS)uMO z0@N`fN=vNXCqE|ZIMIIW39+!0?^)Kn$Er$Za*KMmx6{B=!T^K&nJ!i z0c&Xa#1N~I>v0#~4=&RCc{Y@3lg*^jTw{1| zpVA9j=89E#5#4rqQtzu#ZJ;xx{5hzSIX%cFiP(>1d-S;L+2dIYx@4O=S{RVW`Qk^S z_vc6A={({tW38qm?37WhNAPtppduIsyyNnv$bbPucfavvu!Y66?~if z@>Q1n&3)G(1S(*2Dt0*t{E$5#)@o(Mue-ygh z3c-LxrQ@xvOOh#?rD4qeq6^mtJ-I%PK`5xtCx7l>B^2=u3LyuN0 z?+phN`}~h#&mgf17TTCEpdk%gegSSWDrhTa?^~ldb-2WE2;O-1VzbovM0c|Fte!wG zCx3KPVTG?S)ONstEf)RrDAKM}--3*Cg+hKQw0+rRcDbWUr!4D^l<&f|em(WRnT@&a zR+w)+tKaxfs*tj30HFhrm;^B@E3)T(x$h=7bGKxpb99)B05M|d*gF0f+aqs3r9(2U zsIwf*j!ve^OsIF@q4ni$7>M5^=QlUb`ar^@$qb!8CWcqT~?4SiCpeCloV`Vk6E z5|Yx(l)eCEwFwnc$;O!Ymu;&Qvx?3z0JZJ5J$*H+hUYe@cGQ-?GF3E1+rjb8dTg$C z^BbOh<~g(AVZ&Dg*L-#v>>UmQPiq=OXQW>#bS%rx@DA}Gj%8OrIpaODj=#?*7^~V@$jQ%65l#vnf2rouEGlM zXUl?wj}k4Ke1dS;1Z(1(1v0OblaQ7raA-%%X=@MQ^gA;SzkPfpB+6Uv*Oum4==Sbm z9sgUs;>2_5D(PvyeQn$%R@?JXCoa-GYDz49o1<l~!um(rn}xRPCyN*I 
z%}4=UIXW$SrQtNjre$$4$VfK5Pi0OxFDth^&lXTvcZ*kue`M-R)N`w-^9dq>+`x7wssFxcc&IqXv7=egsT}+wQ!RNze#fE64gBfj&!fr~J^ z-9^H0JCjWTtFtrT(HA5_X0@xRuzxJ#d5C?@98&k_06(Qns+LB$^Ki5-h5B_$oZf}OE+!?Ts~95?gFvGb~o~}B$LdA z6q1iSnc!sm_NIsg4-KOwew%rKq5>G83Iq1D)>S8^tBPXz#~VWY!i>f9u&a|?7NO9? zyfb(otq!A$i&hxmDY7xK_iAYYB~|0#%Q==`zL9J-Q}B96S4iF&mG9+Lph(Pq3cEv% zfz)E9w|x8;$6NJja{6kDRJTH9hZ6P(ZpUPnm`C2iEi(9hHd9H%xm<;zEHAjRnL`^I z88&wGOtD@c&m|xg5|z6KJnrJ(46kKTIzquYT2b$!lh0y$B>q0rFL_6{j2x7sS}|DLztk3-HM-vu=?QD zh(H2s=Pi7T(2?z8XEB$9kdaJ7F?Zd^e%|pbrSaV22y=h-z zY+IbxZd>nY@(8x&Zs*M4UQJ3We^)%-dzGpF2y+T7sX^_7sj!uUiM+|gCu0v|TQ>mX zRq_@WeWm1;ANVt~1sPs>$}a3cG(9m#9OCz4O0~oryZ4m_5^}X!dQ&)ZKgui7HMd6X zHIVFW5xt4|gxGyE^Ni!PmQ|??q=Ja&iH7S3yWNF(N4PLlXJ(FNcpE;rZsh{$OKW~1 z5d6VwL7pbbdhVH5uf|1e%o#TbXA=`cTzU-)qrNwbc@=5f&IGJyG}p5+eW@;$*A5kM ze%43~ZRK1PEBn^FEzBEoDSHw~D7H3879shX37U=IRe_)+dUgtERZP9SXqpru;HoW8 zwt3Ki$umr=jFT78Nr;x|16XBu`=;Bh_Y|k^0ngNt1MpO2;i4Gd-FYn9iz*mE;c4`v z^$5Qr{{pLHir=EPYqC32R^|22ICl9EWsIoJPxn7%mo*dkI)u(eL;1XpjBFSHgw@Op zUtg_v7p6Dm?tKpVb(u5Z@?i06d4P|h$XXSjrj!`CrSdkD8mYPRIqCB+^pUTc8%zud zj7U4p!T?pw7V!I(;n%<|>gMwm+SH~l7n>Q6*tr5?G|mvhZ(~GAWH-KAYRmJ+?b%`o z^BHrhdXkABUenUm1+_kc3>x}qz{!9S{v=)|+2#gAf5Km`WY-t( zOfyaDv;5(T8uhZe*K9h0@N?oqZ!4LJ{pQDy!)x`Ft2?&3={!m6+2&e5_DEw1W_3h6 z3irT(&Wn}!j+BDOD+^x&&J%Cg^m{-PJ)QW#ZDafm!cJB=7kOa&h9af<*giuLo4K`m z9m#`=vwVh-DY{5gy^hRsm`%Ndg!@F{g))7gIIXeu^erx+sj`<;A5c_D3B@q{3Q*I zgZ#`9M2*PatBopu1vC1rW!!f&QbKdbfPxIE@a6*fw+JL+zNPeuVPqqr_^{Qw=Gvwx zJq>|`PI|qEF7@l16=GwpWMli}ae>Plh#yNq9a0GCHtclw)#@iCo->m5@2QhKPxE&z zh_n5n4GBc6aV@@DRjOO$+JA4AR6n&Spt(684m{?aMdgeMM7RxLgmww?6-@7B^0wBE zQ|si;3xIk7jz@*;xFE{%z+1li^)=dgOrHEn$o9^@$0uByBH#Tm&J_>wA4lN$pI{RUkud6+Gk1h&N8aUSgRwtWh zJ4GoYX0mB=q`>0GzYp&!y0TK9H{5uubG*cyU!h;Ny2T;R+ zLvkk%>G}_2rTdnw92`wu>poC`%=i|6%66u3#-E2RY;vn#P#LFtP57YOrLPy7*&@W` z%XTU)(~0$yko5Lr>yd8fh5vhh#S%6sRUOvEpuIoem111lrNL=g7tYU^rGDJ^38J^u ztXSmCloIhz4=N%m3dT#zDM$+(k~+eqDA}koKE1mMc;A#fGICL9g5LCmKMd(DUN(={ zOxxNsV@-s$bFa&(a0f#-Xg~^&jx0vFvrnh9TpRw_E~}CEj5p_namuW6=ykv48+ncd zwijx{;yyH#schc69aE{14#it`SG!zjRlE-GT|WklpXJpsejE2^F5a1pH7l|ml|x%) zUhCo~ixWjEbE9Cc3`D=6vLQI;EIJN-=q;f4I8LZr`@>;;wBvmOGwHR1FZQY8k$8W z6p_dZ;je*f1-6u@gRN)sPGQ#)iynUb2(d0xM1zay7CfD-V=5V17ty=POGJ7kVwXVTIba!+;-7mqM9AP+-beUaIijaDu1=y zRX<@*rxnIVrhsYb)h48Yh+D6wZG_Rik&--Z%k9NlGek_B?VQ0D;u3r@;bDG4eP~(& zo?MXA1Y)GXjLTyU2)lwrmI+dCn?Pe}wierpW=rE?Y-XXInHMcVDE_qg^`5KJ0w38e z*T>b5k+tN=+S9sogS6{p2w}>v#BLw)3cByAjg%L>%BY$81Q$0tZ7FB(S#(OrOBrkI zFF0wx$Jyu}AT~MUXJyI!VA?ZNZj&6-ZCmdWVSeZ0sRVv=!u-ta9tuBEy9JUHOORM^ z1^J<&Q@(eI_xBf5GuY$iucB)Qis^>D*K#rW(aF#?JWRJdgrT)ITxD8X^{Njy5>x26 zC^s2srUP|P<&OsmdsC+Bt8BdLT($cj4-h%3XLjZ!?sfZ;Q|wG%QKHD)?X^F}% z{=ia1lwEmQl!B&Y=Ivr+mIY2f#I?0KV$UTacxlRwC#clPLWfIlV4$t}MT&Cj36wI7 zIXF$xpxItfI?Z7_NS2qiZZF%Dg6(3eL03P24b3@fi?k-9uan|^8wo+DlN!^x!dI3P zKF-~jvi2In#+V}j-;nR7;AOJ@rwpHmmmIp@s?!oCI1PHyNzWYf>!^Ol3-trOE>V+* zLm7U>7oK*02#=|lt_VvI_&_xiRL*1nQ~*8i6u|80p|@;ZghcRb=2<4E`7NDQcFRWB zK@8yDJq7GC1l59MkuU&dq1>6hK;O?&?8yQ7Mq$p(VB!4slz`wEO&|6rl^S$}9_=+q z@bu=pB7Y99)A%SS4~2Q*BVNbzJzUcp}Mn8Gg!I9bInSy8Q$$|?D|(dwiz zP48UmcI3(~@ynP5x%Si2K$Puj18-2v<$qP%U6ahgUX#dQC@MB5N`^fqIO!N(>|N2` zeeyPgU#~#|=OfhEs#p^ej8rZZ!1TI_yeD$0>uYWJr10di9HaB{OGeX8hQd~^!EG3D zq`nX8{aLqTy4&s%wS!Ji<8>yNuW0YVIqX%ZEOa6)7E<8=R32LbM`vm5yB%gG0$g63b zr1j2Mjr;UOg6hEcksVF9E4; zU%_)+D&&f=C#%69nPkS-0(BKC$?L=QJ3IaI6xAxc=v>jo8} z%@Vm6VLBBoPdmN)Lae9P_VkzHy5PF3fXlP@Gn!Ryg~NVUCAxDF&&D_;7$etILnGT~ zP+uyry?M~MX$0#+DVGDrKewG$Ic*m&o}J)l$xY;&eB(6db=U;y5JG$3Cf?MKPt zgYUIcL^)kI7$?<63o8{Lk9h5FQGPo6gKU2`Kb1xF=k21`ED~A8`*x# zeBhTdmXtB2xJBf#By;b!wwZSaZ1a)Bg}(VKvj#Hy2es)bnqy&g#&n^p7S5il_hknl 
zJ3`MI(gL))(td=~jqRP(hrXb*@L_+qM^5v^JY{cNVh1hzt2Y2$CVd*XU3eQdG2qW6 ziB`l4704w%``J(;j5TqJ!6Bi?+`_%<&=8nmW~%e1p6R`r(2%lN@UHqT1?fDs-dhLx z&pVVgAMo73FPeSrysH&z-CCODn7Jz?MF$yu06J&jB~h6&61Xd=0&PT^E6_rmmFHJCE-`I2zhEl2 z?-cg>Hd2`rI|qn(Klo*ed@Ch#GIDoXteN$^?iowVxZ`)n)sXL>dRRqB(^=<(dnsRu z^ppkoReV!6G4o*8;WkJioRKwlE+jkI1qV~5(6^6=k2?0x#bj#5S5PG52$KkHh#Nw359SrNgW*6sK4%7u@j zn;SfP$qSu+z{Tmi7)2Lr5ig0L4Tm+8oPEs*t;%Vn{AafYZD@ZFhLTU0Ldk71?q&*1W5{~kwL};A}d6Wn}vn}0#kPXZ1#Wl7Rb05yygg9GmWiu zwmN1TI%?X#y5k?~%l~FQ;$O4CpIXKLF7qGUWAbl1^$V0q7_=PROXgBnU!L~3X zdE)rJT{qMv%CL&nOzUet{4r{v!} zx3n`m-~AV8JWOTj^&EP%_0B(WDu3qAQQClaznpC+{yORXw_Wj%li&Y*4unnq=XOs2 zFRuRaW|8d7EJPN*yXWY9jUQ*g^YzPjQ!uiV9c;U}S;FdJf#X8c816@vO+hAMIVasg<4We7VXobSJ@eZFDBz`p80(43%K_fTZD2kUa9wNFzdx^+zlJPAAOrxIygRC*<2FPmxl0pRrD{c-RT?3rJXzcSRl#)MmV_ zL;`RKM>ZL@4YLj9R)72?b~n1*5;@_j(95>Ra*I45t`jx%k|v#>(H~+r6yiIS6Igz8z2J0!!n8Pjct& z{m{=6$;3~2(2wI}5tN_0UAhNIdii3u7R#0Cdpqf=^W1xMlEN11zFH)NMRjo2q@Zw<4mwT*4G? zY+sUF72xHczT#vL2^{#TSbT=~dFHK3&~5L|VyU zxWYdvA=nTj_=#0gz20~uf_nK-Z41lAIwwl>pR>Wg$F7;QQ$J4pbP6gz_*}n2A7w)$ z**u(^{tw zKF3dyNE?a6`GHmAsT@NPu%qa6c~PM@^&_pb!?`$$Bl6M4Q2XspjQOQi#XGV|r)~^G zJv}1#A{3*Vq*3gfiC)S`k$eu>$%2ESVC4F*`%h3mZoH~|QFS#~pByjl(!(g~OaIm) zz&dZyP-wQ2c&;c4>*~HJ%^`S0n5)dJ%eQPIE}B|rR;R8v|HZqFF+gBsr zfnB~{5qpY*k@pjpR~?DvwzJHpr(U)6Y>%K^m_EuAXPu|rO4eu=HDU5 z>t|F+!X+jPqe%w$s6`x1 z4?KAR^R(kY=z|F^425nStc#1(&k>1zQsB1=2g*=n;w>I8&ok~HUY;wwxx|)2KsnA> z7yo$_gT=v#osH#-;-)uVY$D(3MTe(ER7M-)BQHzG3}?4k_osoxjw7Ad%hCd?lx0rw zcsyY>7p5daUo=&MN@z@!g}(W;Gi1tcyg>iKLY3AtAG`h~mDay|4V4x6jx9r(;WDFC zTz*kA;8Qr%6!6kf1zC)?T*mF=Iy7;1cA+UiM0M<1E}t}-dmcQ36ukU0v5%&TxPtoY zsUHjopLy9NTtzBpL*s~^d$7>BgXJ>&Gs6Qj+{2=Lm&7X~xzl%Bu4}sqEn>=qxBq7< z?{{DKYwsU+UruBn`D;`#L~qs( zWs~@PUR^qSWf#MBs8{`pGi6F?;1iv(>w6_kWu)UWzd-y82Zs^T3q-uYdnX0fz7rJ@ z<&O#?XcKfl=O8f;$@>ouE9GIPYv7K6w~nEThunoL3RiCfz<*0f(10lST!4?S2VXvj zS4eJ8)1Pm}ZcswCX{6RWN#LV?(pM`pM&7<5*&X4=Jy}bJC=YNsup!CFf9H)uTFhF& zMR!01hxtA3g**NZ*IFPp3|M;%Dza13pe;D*<1o(d*9)&FJVTjg)yS()mI#CkE8FPf zg2%TQ%PCdIo3(O5ePzar*@mBy!dALGpCdRyfxOLYNlUEEPBV=>xV32jJyg=D`MWPf zl=SZAvu{Q0$ra+n%)*G|bHDaf8+^a@4|s`-lx6PE$fwG#r3%Wk;a>7FZ7IoD?r_X` zAqMdj{dxK5_a&tN*$_7Mzg{^MiKIMYR=B+myu`P=5V@6#ro3ExAgGrMUge~13dm*Z z>(=o)z#L0Ax2!x-sKD7KdE%=-BhgQbRz*w~Z{~|5w|biGf#?=(51t3d7p+?oHVE2l zqE>oVnAOuHvL~R^EaqahY{#2+46i@W+rc0h$7hz=Ni3qoDJZl&`&MB$)=$SH>2@~+ zcVD`7o_Ke}ZSNwJNBL{>|H0l{hQ+yN+oDAZ2_D=nK(Ij2;1m)(1b0nvcb7r}1cC$z zPVnFw+}(n^ySqa{fZT8O-rc>{>eXxS?q@&uoOAE_!Bh6tm&z;iopa1N#(cY4?6YUh zK53I)YuL}{D?*b`^GSqgJ5t|JbjR=V-3@`-PZ0LO*qNg})raUGq8N%WkPBITnvx~7 z+lQ!evgGL`r>2X(R_VfGWo#c)z|6Lwn^2<~-qFW(9xr*@H8@;X2LzJ$Ml|N1RB{I# zeK8!5bzh&t;-+%?k38pq%;5VhQA$1%lo%H-L_7k+GG8OMAi*)4Q{njDOE_9}M8!0AqX~OA?Sm)ef z_(pZOsjboc>?Xq&Z!eXw=)xoZ)U86;cspPC{e{Ps|4e-(Nj#M_)9B|^c0R|sNYOPC0eG`^fzh zblFw-uH!+#0B~k4?*a{B4G-eUKS8%^ljU!LqC!^aG}nWuFZ1YslGFdAE{iJ!MT&^J z>{ysw?l(SbDK$WqS_`+-tejwWn$kGvH7I>9k`RRE7)r zx`1m!c)J!@COAxYvH&%JMMACah~U3EE;}Lt{;Ew2Yk*i&!cPzyHaT>WYtSvGqvp79 z9+oDCGOY%~#35^b7Z!nO=r1YQoYGao{Xsz1gwvCs;rZA*fty@oI2Ff0l+F~Oq_B~s z5VLs5v5g>D?8Nz@vOk)J7H)+N(Brn@`uf#PX}OD!ck=Suo#j}_IqHo-6?~v78QVgb z>*ZDh1x)gJPq2J2HSN2zYMbNKhUU7hx?xQ24~<;r_mO!Tf)k238~rxD4#-1~O_V2q zk<5F?yNn`R;HKTV-5y`ujQmaEYDdcWVu)3;bgbb84z$-6@K8#F0n=qslALLg6d3|t zrJpjyZ>Kq139vH(N5xSCydmHaPdsM2$W_!W4Y8oeMr0G7}yG zuesg@TS0*;f8n%Q56&9L>EmJc?HP(%8X-J18>_G(%PFV%#JgO z%9Sj>mNk@^o@qp2W(SFX7ftIB9iI zl&PZU(J4)rPY0#B=7if zlj-g)z*y(}2`XRSbr^JgGE)+ks=b~;Q$v17(eV}45`5%9{y0q2ZCG$C!zW`Qu=ZPOa2rYH3rp zIWbo<8E%`D0KKCyGH|IVJJXQ!7jBKy@UEt!sbb@|s3A1wGB*((Z&hUtBbAru@pe`a z;3P#Ph4+~M>IMDDBWZAiGvr_xLx(*@prcQ|@bV|9$5Bg&XdL>DlSvz!^}BD&kHdbS 
zoH8%j>^-fihMd-?&$<^IXZ4x<7lSxbXLgqwzCYpUuKKFcOsCm6GxwlLw4Q=Tkyoly zUmbKdC#xN}5Y3>5G!nYu^>#JnvdaI5oITlL{vweep()SsyT`qUH(LQkuvBaQgZ{5l z@dFL0aq=|e$rTMku%2pjr?T6Hbs?g~`_OsSXr{_TLwV^l@5HZpP+z^- zl_jdsEk?g;l8aub$NZhEBuN=NzY*z|I*<{@ebfFZgA+!EIIaTRzxt=gbZU9;l6Db) zQ|zhgb{y^m#epU5;aKK!+Rq1pD$9*)467gbl!$h#Z%vY>c9YE_RG^P;y&0=xGPQSV z2%@zSB1nSORTO;_bSSKF2BkA~EF86-dwk8(>L<>F@bQ%&NsQUUBJTI!Y*AEd_aWME zG3Z3h66SP9fMq7ACtk2vHxgN*H;M$Ox$U3DZNC=@U&^9^+a2b4APm(IqOxFJx=Gv^n@kPqr* zx==%zXe@K4MAwYA`7;N8e>k4vpP(l>6>EY_Kb{J*twt+-%}!jOCC89P&JjZ)r_e#Yn$@kO;;n7k7ZC^{B^WXXT+ zpt*hDV%Ah=aNa-ji51Vot8o6-DF;dHcize$8PosHd*FXn@{#dn5)HrZ1cj(L_u&)h zu=rgUNKTxRDoCU^+eC#V#aB=c#1|TT2%K8yqfj=75B%~wf>b3>Cxlf4Pq@2ecqher$bi-Z{#jUsc5t;#9Pu4@Xmh~Wfe@T z$Hgh$)Ko~KI4x2mP$YKoE$z)6v2hiJFp`Ib5&FFIrOONz6($mZ6Gjymv39n0e^Of+ z`z-9j$>6*F5l{ZkpDd?P%?v-Wp}kNk1r1;JH=yl0LHb8p^m&^}^wjJFdmoxKBS3d@ zAZc->$_b|*cgZ9la_6V}oxQy6pDO+}h`eyjvUE9cfd`@{cwef{zJ%iw+j4tN^o z8461J`^uN8Xz#J8kM?}@%P;~OyNp#`n_tt>of%&d`l6QU#EhU#CjCv-`Q?r(@?h@x=2?&|A=(x^k+rH}wiPnm5X5W`z|#?D}HREvUiMJ%7JUY~47z zdy!q$P`3mAc4J#`#&lQj+QNFU(qtq**x>B@*~|4X&vy~N*8({0^(=|*>r732dssA2 z6050kB4e*2OAtIJQc+JTKPVO%g06{>&XKq_$tmKMg1fJezlz56H`^)B8_c(qKpBs! z7UjG9vWNO|y9j6IztApyuiXe-|0L8R0|*SMcj%EuO8Nf;$w#wZRlLz+g&B;->OJ1h z=-nP`tp6b~uDB;il$0>@-LxKl?rryi^>>60yv(RUW zvFlVgB#}owuU8bW{H%uNbxz1bm{a2Ute{49;j+}-ooIQz@J@GR!EZ z_m9MV|NRkU&FcLKx+~LFzN8R9nMtT7b zOS+Ub?T`~ArfXp`(6C-^Du*bd%Gd*io5xre_8F#-ch-Qb$&bYoI1t)JsQ}C~%RH&- zenBhjsc|@*g7nJv%=8FSMkzK-6R?-E$JYcuL86?1e$RLs(wx~6?Rb?SF}}t}KAH45 z)X!Eg-$;?w@$BkF{w8AnjSvJ}pIiUy^5&6}5|*r`6VYR$J`xSIfS$1mpI<~9O42Sz ze}cX^3DE;~*--(A7~LTkI6BuI`Zsvv5A~390E2Pu^y}lvDt3#0mV8ejmpo90GWH5=qu|iublx){*#sKax*d&5Q4fY? z&Z`7;CYx#^5ApwV&_)yu;8&6HzyWbTe3SnJ{Yr z1NMn8MNvT$tZEX3 zQIwUHNYajn2T+M4-;_f`UYSmGi_s0dgn98g3VfQANIoK_?3Tu0$Dv1s8|S$!k?my2 zoE0FB^uL4ID3*N>p+~Tv-X`2*mGnBJ2QSkX4U%8UNy^qV#kvTT2N~}M^+1Zv0qga5 zpA?CA?f3@Y7J!yNs&xh5xd99po#`8to=K-#}aQpmY5%WREWRd%y+9${@EA@bZj z?^q5)bLIL)wQDHjWV@qBF3LbX!ddjy!GW{^M;!d&b~7j&=<(%hpU$Vl!QRCn)$gJM z_B%kQ3pYOXnsU@OkIS%@qFN;73`;vY_4V|NQMwbDKCBYt_`yY`K~o8`maqQ~{g~>Q z+I)8nOt86y5Jh|^AArPI5okYVeHy6!NR66yo^-_&vUs3i1}DCR zN7Fd!7SY-Gfi>u<+doW-aaA5-1l|;#7lG)I`lzlxP)!QR=I&3;wS0I!nh_WoMeay= zjRuKqmd~*lT`;Ye(9BY^_ZRNHF46Ct`??@g|1v_Pxn8+4HL6?aB1}?VMWgO4!)6rvjWXhI#b8&Yo(z;H_(3}mj47@sTL3a1tURv@S*q# z63G+}I}Vzgz6y$P+Kn)-;Fs|1_Br0{y(=80aS9=p#+U?hIc0b&AJf3FZKTbr&+GHw zKm7z90{lt%?Q~3`sKh0QZ8OPRk+O_9p&AenGE z-BrO87%l648*kFwSM&+AI*geFo5mUMmOI1dWy(m6Oev?E)Z_YXywulKlZ31)ix7PV z>3LfrI;Jz8p?tw&6NN*KwgroWmiC$E@U~J~mbt^oPl)Ezjv=|Z(_9n#x2>ZGjlI)@ zwXyFNUhy@Ck9Z>~>)hRnnF~Fc@EKdOo;ZErmGksKecn zc7q4_wo5TTL1J7Blk5t>e>3PGeGqXFyYN96HNQrHzl5E){ZgZHS2Y2?-m|eJ&$^PZ zZy7>1tl4~#pMiFbyP;QmXYv%#-u_Ggz6YwbGm6O+oCIweLy3^W8cms-mh?K_^l3yc znkd)Mxb^|{2_DnMQZiXs6DX&{fkf_?MEb`*$B+;U{MC_})1jm!@kTw`QB~^_W*tN} z-;Ai*4!G$Q%#7msVwbJ+jwj2z!rnO-G>UM&nxv}@P$arEnfRW^-`vzLxQ&%>Sh>

HGMRRA%>@}pc64)(NnX( z43F}AEcbAQb8(xS=q}F9VvI>p! zqejP72|aHK^`Ya%?NeqTF&ko#y192ntVINR}u386jwoz zSaq0j-g2X>hv4dSW|V_rS_dhn3wV3I;3A7pW^-0^;cYkXbo}nuXE5_ida3?>``2)33x+p_R>+Q4q05g7>l@lOBMn!)_e z>h;j*&4`I8pTDrM7h`eyq2zTh8IpKA8eHEx;jxOmSOX1)bQgupIT@;0)5MG8Dhgs@ z(ns)5P5#rrAmi=1Ebu|8m$k)R-l=z5GD*eD{(!iUzUi~O9hng6A!Rjux|L{XSh5?B zUw+`e@k}h*^+|ug@Wgc0bkrRHh?LJT}8x5oy6-4xPz*EKq z%=3UboiK78KY1ukCkTbC-kS>lkbgg^wnmG9W@;HTk)7i2>U7gwRk|MT<4y1;lu7+; zyWZRH^h>DFIov|nNbBJ>+Pc8lbf_i14|=@prUQ~3!- zex3y%zhr?ehj#NI0OBjLBM&RjbHXkD-lqSXui<|FYeM$CEd-yMiCSpZubFjf+FNcw z0LND!u)^ug&RW~SPp3b3IJ-KM(h-F|Q&o=GW|Y1=)rf1sEV5GQ0hrTIm`I~Y+eCz5 zilTdHeXxANgt6kog#bI_g@9Ux$A%ZX58_U#;E1&sD09q1B2C~>1dFh(8Lj^s(o)>f;y%}};y|2&J)7sd+2k|=V%6+~>^oqPk z^yRdw@>DJYb`Eb&cMg-3NYv+(_fPPUn)z6_bnn^c>etvzR%atkeNA>26RxdWBx%fsAr%+((q%QrR#3A5Gd@*k}#A8nbq8sdjh7OT5*8=d{2ab2KA5m@X5Z)VM6Lyq6aCZiZb7k1+NAQ7=* z|9~6wfFph6%atbxhi> z?8WmRuGMcqH=+^Q7-?F%;QVbNKsdts^5c99(jbne&DksqGF*b=*&+O@V+VQjy=BE{ zBR6wNzEDenOoiS8dJ82LmK`E4Y0QX&W#Fg}xw!}6BNSB#NUS8{^L#K8rwi~e_Jd_Em(kJumPx+bQ#V<2mZb@y|nh=uCW+yt&axT?Wm^ zfbLgcf3p}c9J`j!o&(!r8o&4>ChRoep|11{FY*uL8c@xzod88yPFB!3UpgJA9Gv#X zLaX%@Gq|Wx+N?UhXa}pNkO~wK<_fQXcx`d^6NI2e4U5s3090nzrw7GFvydJXt&wMPQ|-BSKwJS3%E>v+xf$)^;uK3<>Gpy!ILQQeY~)q**Nl6* z?u?InZ$md4y{59-uSNGs53#mOfM#=d7kONeM}VSyp(X3Wu(e$$wjs4YB0Ep-`chT# z9k)ddvTI&H^TjB%(7cy1cv1zF{$nN01GEKQebPMP#rst=Ij1si#66c$BS;`(9l?TE zfMiXP1pgF%!(iiW3#|TP&XV4l(KAe;jT~nMoj`el(sEu-R8}LiR`;aD4%Wuofsqvv zyCXzrOuQ9@47vy0-3u&ja*C6~_@Kkov91m0sOuOi_-&92c&z*q6Rf!xFYD>)?Qxdz z{mITt*BiSOA6!-rAzHH*+c5&=Ku6|`zDu?m{oa)aXUSGn&*C>+i&97U_NY>PSDOjj z0_$MGbG=yf(63SDbx}tu`D*5Nntlf^fn+?pisnhGB~iK05{>H&!=50wS*1$T5e|9h zX~Atka5Tt#`j8X}A<0SCa60IKw9FufN*%ZBb$pDB1UE;n$c$G4cvsgY*9FVkI82xn&`;X5<%?8Q^L=sUMNaBytu8Dek(=i~LQ^Z%Nje=i? z%9X6*C$?CL@Tf`(Pf3wUQKgVl7i1I?rO^iJkaBS{c+ z))IWnl5-VT|7{PO`CCJb<2>*ZYV7`HOy)f)w-jX!C0h3LbArduV7~Jqz zGT|epUN0iTQLnf8X3@S4-8=W~DJ75<=Ct-EneS8DQeh4YvW~;(ECZ3YQRS(<2S<&GyKizrB-ejF|6WZ|~WvbrPo#C4BI&x}!6gg9IHG zANfeAu-vShzth;>O$za1fWJQ7gI$`S^K$_e0=2Wm*_{FEzyN9$%3GH%5P!$5>O+hz4PW5?GJo^RaY|k*A(6?&pzY{C z*3JJq>csnXXi}<{^fK;QI9cX%Rxtca!miosyW}Wy|g6K zb^2%?u%KJWc=L))r1I|2pA~63W~}Z7OjQq9+^N_4*JIFAyrxyaxbF&))_FDny24~G zCQ7+rUY_=dO1Qx}7^?0!KkK5)zIWk{fv*e@-^j!5e`*9QJ}{XjUfhPxB^_%->Llt-rTX_><(yZwn?$@nBUCHGtpiqJJOPR8Xs{+^R-A@DpS{QTl@RZEJrH z=mKnlK*`UfgkEU`b%HX6ir0M8{ml!*Xwvy6dk%KOgX8%Hu&5_@6_94A?;Em)en$xdG3xuL^akJ) zb2e%9tC!3E;005orMfFGaA)A@%TBFM9+fEdxz5jjN_B%UwWy));Topil<*4S?=Z2P zbG5xbOOA*4z$V}+75P?PIbN*vy_*eJF^nPf_)hjU+x#Zi_UX+kET2owY5I^~r#QV$uRqlN&H802%@)hK&s&v~uuT&Y`eG2ZP z?_el&Qb0=TIypXgEfHU!J1@~gt6F}WufM7LEPmA*BA24~{cK66^pRx{OMlwYw%^_T ze$F2^n*6u9CwO{Wdvo9MYa628g?m(d0YKIbo-Vj5w#`P5? 
z#$yCYS==!R^Mkhj2c0%kPPGG!StAC2k!}#0_Gvd$)^FVMU&mP%k_^R|uRadUb#CUpzpb?zUJn*zXy!b>Z#aH$u6UmrNzX^-|JW(L#2Egc<&3AH0> z`syK;Jb%+&=Mm<@_OCgj9 zZ~RTdJll=2$(;O%LBLOtw*peZsKzZz&D+vh)I-4#oz>V{W1;t|WTu@{^{N9}R%}g^ z8$Uq~k5AJIaat)Jnm@yhE9yAFN@V$WiUh>WtX|)V(ym>Tp>Oa$l6zlb#Tq&gyYpapb^vpS$a<;t1h!T6^>W9vL_%v+mHeQ!PoLTnpFp zWByOjq{x$g?z&-+XWEV><9?TGy_9s-A=hEL15z^SM|ve_BM9Dy=xPw;GNEEGOM3eJ zq|C{VNW#MsP=nx zdIVBp-g)>6YK3OyTD(juPI0|@(aeA-@?a3rsU}phW~s}##r%?RY3~eL2k$mK8M1;2 zU@|hm>Mr5+Qm6fLp+-(s`SO|1)0hEliJp-%FjAV#ff*>XqDoF?TBTw7HMI^x6$hvaoy=I-FOalz$Lq_Gs<+)7v4udmlJOG-@z6Y#qB(LfF3F(2+dy&?TM^DL#WV@?8LBO&gmFu zp&Qhvf(Louo<(`6`cl6qr6^Q!Sz?Q@RwEfV~u>_ahDKmyaXKmtK7PQO%jsG}CS zEpislT$~#V-RLk@CcL8Np&^&<8EYu`^U3+I>N)=4^Z!A0{C{Td*)GFP$nYtubUK(i z;hvx78MGWbVD5$vjo)d$Ezp$D)9B#uLR#WiF}emb&yc2B>>7viW|#Y&`J@ z&=*{quLYqbOF~mz9-zrUR(R=V)Y)EyGaFRszR_l?s&oLQWFEHhpoh=K9tc%}J$^(7(XWxz#7Th6P>@VJB zHQy~5vOYQi#=ZhWb)?Ng4;^;6nJbzt~Wjjb^Apbpk8%!GGrvyV5gNaUR zfAVUJgpsIIom3A^=^r8+hwH{$8I%p-)fhR>T<1&`qkbLQoA1`5xn-L+atV5}CBSo2 z6VZyck=h6iS55B0$CtyJWVR_5IV;_Q`#J zm~8)RDObgbPX$59bZ0!l6GY}jY+7S@ygB=n%l*E6z1Y7QbJFcU;F2;?`~mWV3up76 ziod^{I**s=H+vE?s=stl^9W@vH71Lx>W3uxDn>>{lK)_uw(R57zhS5kzIU{}j|Naj zLA90FBYg9pXc%4acSuN0=ed$I%udXboLm$QYKdZhG^925w9s7?hZefKTj$R8xw`=z zERw{?o4Y`eJLboWAD&R+-RPM-P1ER?&U(BULbg3EV@+}wp3+sSliP^WhLQbY+>9yF zGLg#TLjHBuVA;BfOLqbNLaOi->#D%|9tsno4JZf9bpmKomQ;11rI%=*$3qyzK66QF z2f^vGUqTD4j707)T<|wak#@S-6DD=K1Vh5P*GSKFv}a)+3lC=x#>x+SQTI}iK&RWm zq}(8zBIY1GYuzW9!tdc^vkdp5I=ooJ7HA*Jc^5gy0-t*TmW$*}j_p&7Q z1zx}V%4ZE^%Nq@iykXk;@a5}a-pgFrwA@Fj69Hl?K`OU4q1H1drf_Mto0y4JileIh{pB;g-rEmXK4nfOgD_)#gB)dHY)v<{RmF5GVk-ax*(L;u;Ndz_uk%U4C8op#y-xD>_t4a(XDI?J~5TH^SNO zc@6C%`zymE$#aO`)auBw<@d`2SPsCKQ2+oVQw?#tD1YHNW99g3)g3`7n^yD7>35Z9 zoG+6P+RufYX6TH>jtxO~gCnYB%3b7Gksa#9ZxS6^PLD=k`RUiMpvB(HQ6;tfCS9$k zdvdKmW;n)Nsh&(=bp-FJUB^64xO=RZrSeZl0xqy<=O<{MeXh$+WnPRbPGk7=laR=6 zSW3#&m&vbHn-!XiPaEI7jHM1!E!gyuE09N{mK+PZ7d^lWmpoxu>0R=5xz6skhoHSZ zd*LkpJ}yubPrJO$%IEVkSoXeRZBSM_;fV4r#|&HRU97!cV7eKr8L{YY8QFjcyO7X% zYVIwj#d&F<%-hI{YWP)=<14s5xHwznL2LoG&Q_N`1sNa5g-}G@`!ub7J4L= zwaK*Y2{SR#F4%Q<_KL98P9MA`=0z1BD+D_}z3N*`KkN5zsFTjrr8W#)y1Cz`tPApc zUS$KlF8CG|(s1|K%}!(P(@Y1)A4w)sZoY$e(JQ`ZV)}cx*D6=JQ*QqY@8{-MHR*xM zarB#CB55Q0Cz7_BNiA@g3p6AD?yXl+Y^M)KWe9Cu zT?a?)_s+&I%g1ohklV|)CA0eFjzJM;PXe6zw5VjlUP$|TiYY0;`}AjCd_U98_-Y z?OL@`CBfG1e!q`(PQC!w(*)^!`O+gO(jMUi0p4mp^5hdcp$C?>a|bMEYg@$@ChipQ z(Cch~oiu1{kxG9O=0jXN=F}exlnT3q(xBW=gN#b|%+oNP@1F@i zofmg6cN$uetZIl}w2w%=Ku$Qf2w#<_&{^F3;PtWG?-?IMH=&2G#N&Of)R)aQr^MQB zu3}8C0wuK1-?L|eBxEs=^!xW1ZLtOPm?wMQAB;NM{^&c>FzQBR$643_1Jhk1%5zy7 zG?y1GOjnWOJCFs}3j8t+Q=SJW=GdMc&k5MpVlWC5P+vzFWL1d}#_2 z;_|}&zd9)YFAMeeR~Mmeeuw*H~BY~!cUNKHpeV;%lZ=JTJo$&9~Qq) zoJeTPE&KUx5Ub{R)WSMaYD(z$fVO&1;ow8=07f31YCh#gx^zhJW=kbW5`L@}otLqn zV(OQ+wfyL67D%Yo(v=)UF358C?N5;Q2EttMNCkV?547w23wdnw+v8-iGPkYwYf{D} zy6!#HbT{{w@ne~zcT7-c!>^okftzgz8L!Fw9K#3OOZB87fkrUh5{fna%xT3UCBlSn zOg5lR{uR@+4D<+CBv7j8P4Q6A{qR0^w?5*lY4&W><&O_*Z*oiZeBmaN&R={0d`<$O z(r`Zux+AR%iR4UP)E7OwIIY(((iDv-0#W)M51Bh*WS_z6Q8f(fn!h#F1nJivee{RA|diF72 z3V-cI+%wztHyuJ*xwUteVC`wAd@pa~JaN9pC+{Vr+!Y)afA~f9mO5sBx%Jyj!`_HH z@3^{FL*JSw0`t=`N??ph9i~m=Y3+BZ-L?eMmW>{4nreH01Eou1=WTBt_XzU}P3{{# zVl~H8mB0lFB2WrswFxnf6*UUPxrUA2r|`NzzG2shRq(8MWQQa`JODE+*u5HCoYVPU zF3q1CN4V~)=E`#ZirJ|GqRPB|Ep!SM3VI#x*!pazpFj z5nAwcsP=gV^R%R~CI7<1c9z`gth@)75(X@wIqajRHAlDIU`glkTWphMFe9m7e2)a> zhKHZO;PrfWDq8Y~;WTPHY7N2)3X#}uD6y2)V%3DxFrGOFDv_%sW2JmIkt>=5=2T;h z$n>!nk~7fQvGox3_lC(v{FK5dO~Z za7_Ip;zbek5Ad^b?v8F1Qi(Kmt7i!(7=kyU5mWD&WZtSJ0|? 
zS3D={>A?e5AEtdMhh48T#;MPy$K87&`L^Gi*gmtzzWpSD4yWo^37010O!gcp+RY0n z{lF>FHP^~ND>_PcZXCSk8G;jpYugq|UHh`;E*Z|nq>ChO%lBA+p!N%|_%IlKB*k=3 zeyT}xU7Elsl#smBNFk~R`ygC2QkazjRy(^o*fO%Eq-yz^Q~w?tX~U7q|YhxhnTkk>gZ zbQG3cY<`>*{**}dNMMpBAUYj9J#xlVs+k!?ZVCtNG4=5hCnoQg4C;qZ3T?wqmfcd; zYd0L2S~5eemlc3*{Kq=F;+&)N?dJn+N?gx7FF+1*vf>~&wT$&eh{6>7beS|)Z217z zBakUY_sZoJS)qk|=qH&VCK3i>*Be8$4w6VP&-O{FcBU`uxOinlweP-Nm#bY=pgD~= zO=>Qw9fwBq)02ll+w^65gz$Pk;e(>`Jjy^G%C!aydhn1Cxp|VK8U9>=cDD^!Iz%QM z2#O+`rgQOjE)@S3gZ^IpmvlH(0BD-tfLSTUJ^1XEBJiDv?HSinw$Vmt&hC=&;$&sT zk`C!08`E%RX=X%w(^x3zycnmOfSh46t9m@s($ftt+QkZcF&4=HnfG$N%J@cZ<(RNP zZ-GIV59nL)oma($_ojS&Ye+D%yi_yiTcLju98w-9+!i_8q~R$VmYZ$IK#jxD9w0;G z@RVVApu=wn^5UyY+!EPX);M!ze*$%_tnpm)yP)m5?TK@3tPWht4^avfM`0?JaKrtp zrI?Qs!iDB}AZLBh_Qs!})riDg-Q5bEMeUyF<-=$EdCzMVB#?IpJrRKZu*BDCtII@P zYvxg@-;J=o@1a_)+X{V&Y~k6ex|3g%!#@^F`rfV5n7WH*FOE6X6)ENy%{HBM+zW3o zq1Xm5>10+QZwJci^)ZF4qO*8YbX-lTi(&g}_GG+!13_D>x1di4T+x3@l|;f^hY(1v z_AA<22Q?--_+BWE+dl-F91ogYL#JLDZWo*U*4rg%~q>Q%@Y ztN$WDt6&Bm5HKw0StERUvK*Xfv-xE*+iYm#m2z5QFgV3WU*>?hkt0jo(ALz^8W)J! z!X$`ap?S+IE^JI4*mXVbl*+@rmH_bAH81hidz597rc_iCCnvvIO+y6{6rd}YsTcn^ z=hg^w6=F@;0%8nyEXl(ug=gs-rqP{rOzH@=lgMzEk}B4I<2+NV8Co%%sF-;qY6`Kj z64GxSrP~H<;2lfd{Uv2T-p#)O!=O%Cazc=JcGxA;p{wbKME-^nwE?pon`f+ zXbHxc`ge9|I}MJ8iML^uBhnEV(2@K&HWT;BXj4OKzy#tE$y?s~${B@jJaO1*$x zcfo8aH)Rzqi>xLZ+A!h0N#jI)!jaT$i`&%C_x##?`VplsocYZS-A4-AvphGeKNFJmQY4l-&Lr2Y zm`*o6>so993pMR7=aDmygTJ|!2&}?l=3FqDpibrn{ZgM3(}=W zJaOd3lzStix7W^v5$a&4g4mM}k8=5z7;y>rk>`4nH=JW1XIjWri;=ANnXjqx~0J_RY#47+Un@2ItJp_sZYZl!y=DKH?DWb(|0=UgO+e@E_*m{|HrW3 z|E6Rfb?jYUlET+3f0w*Io&&*sAZB@AB7X~S0lD6U!V32D(lI0-u0Ob%2gg0|#IWAV%M%t}705`0XWlzq@4mzjulK zm|Mc@Al^!Ak%>!ml&IS(F~lZYE9vTy(k!|kaXT%2g*BPJyc94QAQzV|0Vp2%0Mx>d zS-)Mxg(d&qHKG5F`~1;00?VB~)X~SH4b^p?$63prD#dSx8&B667q0i0vikq!S2z-1hcY|y z7#X6~U=kw4aq_DRayP#$fkG@m0l%Gu->tTWRquD!o|1BJoHXVn?5iAo!B6u>qN_bc*fEHD8XFRi>B+KjO&y8~q2C$luL(3Gh`R=CLm}T!IYm zs%?CakaqNGz?WJV%uHGzb;qEUPU%D+sEvYcrYYJx!IAx6L8iYRUftggFM-Zb#wiLQ z4>O68T%eI@WsK2B^|q?6_eR34R8dm%iBvG484|HFg2VubeCniKJf46ae+L@8I{oIJ z&Kz=NjV}ZS@UUdr1VN-RdfGW&uO_&@NOLXl175}h8@_`1NT03^V-9$FztfJwoUj=P_EOuf0RV;1^uxjTf7%d#xYAxygc z+p`2f-u^lW09t(N+%LXRtS^AQa*2fcbZsM_!H=J`bk zFADp>Y+&R^tSFv$13i#WC2p|jpCFY9fZ)xIW_x8=Z5h&c)?rGB(pbK7O(w{jjtqoY ze`3Z80y^j7VhcsGVG)l%>yni%2)}8cZAH+InlMe}*d27`{BxsE@W0Qo@W0FP^!wlO zyUjv#vA3_>Zk9h>o1TI`dM~TpD~jL#B@tYicOE#_>BF7_w+{|&F?*||9j2f zL!>|QF8_BMpWl1L`z1vb*p|%cyVw@ygrB=r;OU4KTAjD2d^!HyLKQ8;r0z7TG4uMf zitL{fVchsyryG5#9bR!SsOJuA6e$&_nKMZwDKfP*F=)Izq6a(v>DDRDbW2ulnc5h2 zmZd1~BbCr^X`rUG?|!_U$2I@2pKiZ7S^kebAOBaJ@(9}eF|Jne&lsp!IKC-XP4BT= zO#X8y?yq#|UmlVEU)E!FOp*-CEgxYnzG&j}9JPIdBb8pjKqeuYvfktb#fEIC#oxmK z))M37iA4eIT_-*GTKM!x4%GxR11U&A|LvNQQOIj=3wD4-D-4%;`IT zk7iN(!RB+tjeRf!U?|&=Jt3r1a}Kemwco*hzuOzZ z%57eJD*V1B#EG z(&5Hipu(0-5wGI7;@F8a5aFFDFSt7Pn$er4R7x~tF!Jr!UUukES zMC&p#T)I`6C)@kKlp2Y^OSv0N(?L_`{&j}mm2;N<`!x3Cg=bY0sAHqf>lQ9;c&s)u zuuPUb;CTt_w;>r%Ul83nVKKV<_9<7to9XL!Wm#ZPo}F|DYJD}qd;V*@+1WYmnYkWQ ziw5eD|A)7)j%sV&_6-_})8gJjfuhCTO3{`g#oeV8ceet?t!Qy~cPZ}f6et$lg9gif zYhO8cpEveCa^D#5FBXdgNYPkN?+(h>4`oNm?xO$JV!lq8z8pSmt2Y>k!K8&cn>7~4Bt;=hQ@ zy#n)u)2BY3hHCEwvt4R`Tr4>s6)DWV;qC{T^m7$@z=$v2caJX3_1WSz01N*)R`Un@ zNb=LaQS&sZ{=hhz@A&iclP!UJPDytvT9I>Q7Y)jDqd7t*oDg({`(|3t`G|0`{view zwCDbN(q6L(18z}wJ699IPR8Gh7jC9qf@x;s52QiPr~9%ITX5p#pqAMa>#lo#XPKeX zfPqV7ALqmRq}+X;7P?HrbTbi|0op4gkN6Y$PakprjOX?HIaR?!GF`cqB|TPHaxv55 zF}0t!l=KS#Ek*+&A!Su^SBWlXh&H$1PwvOw%1%Jj0)GS5`n+i^*Lvg{kgl)@)&y# zI`EnV`uE#kj!{|&CEpL}_b**#Udzz+D8!G(tp^ZfQXlJ_d-yc(N8F{maYX8G{HO{$ zF{^44g820`513O`p5HhdbaI^Ia5*8&{tFm`yLzAPEsB=!f-x}0OTE0eh22CyPCb;4 
ziZJ;OGS0wTsUDFR3YUEc+I&x2s&MyjjI<{oa#7{qd1G97U1^~>$!7!i-IYMltZL724l-+iRi2QDDhS^G0M z-cK|n7D4m1pzJ4ct8ORnSXdV{QoACEB(`6O$E5-P$tZ-9VDqOeL7H^U-h{4 zF)p$zL*?w37R|R8B><69R8d`=iC&^sOxxZPF2mau;Qq4hWY|T4Jt8lndmGri@uhI% zh|fL%7UsTMR>5#jX{`=S}A4=j*-E)cC-g|Z0Y73=h*L+i?# zuJ9@teGx5}(#QR377wg#3Z#FL_K_LvHXD49=_;{O6o^FX$F(A=vE;SmPCwEdeea4C z)x<|0{ob2XH^EMA^~vkQ;!U#f-Bf4mv2Uta=f?vhetg#z*Z=em`hNqEjvpTz_MZB*6%i%ld@i1j!g(LIgYkQWp{q`$i>pqEQz-WuIxOmDY^Jc~ zU6Lr*rw=|xs(}a*~OpbV_I??j}yQ;igB)P3&;TaY;##^Nuxn7+}9gh zCl*9q)Dy()MpJ0Nlmo5sFB315+)PKojHCN?5Xg)UO>;%I6RXq$cn&|-`g~aDjP41d zVkUK7cV*~yb6a?tF6&_j;ZGrdpDA(1Y(>VJ5PAlW@{0$U)mW)`!SZ^ywE3@#@fHtYd-JvW|{Cm!}wz zg+kVvh(zvT&Yix_6#Dt{YhC!Eb~2rsj~)@4_{-W>VyaMrJtsxK9j}k2pGb8>9^|Y| zo12puGqgCr05aMtoOQv4G)n;8;0kvn*CIcceZIzA7MBz|q5NTO(52EEST=`mXCLG% z52n)Prr%*`8<;Sv&Wt*CA~$fLI{NQVe7e;76aM5M_?Fm61N+{d#^H(3V!N~ecitLHkZVyrS8|U43dig ziUE-=Nc5+nshp0p50%c=uC~lA-XpeA7Aj5eomPlw?d*KgOw>!K#3;2lC7YpT4NGcC z$fefg0kSrxQjua^#ER=r?G$X9Diu}HvWc>Y3;&uS^T!(*r34gPd(4cO*=dBS*{jgZ z6<3C+07Rna)6O(?87awDK8t1^G^Oq^_N-rg;C&Scjy&Ivk0i9vf7j5Ya4i*U>d0(P z=~dkiGZpPD1KZC==bLJ~)%NiK+Hk9lTbd~LdKGC_5Y-v!)g1%U*U69S3oLAG+?Srd z^{)c>&={Y-#{MvWB75L1%!JW>L$r$b%p&zy+p3HMROfBv3tYE>;U^^h(mw8e9{KnP zJ4A^KKCDoRgcZVb31frh(dl9d8WSD@Z7?~u{h?l_B)CsS zd847JDbY0Ycr(B%l=X$%lwo9M9DAHtC;80-QzFdihKJPClJ`34wcK=VsSqn}02peFROoTJ z#2{t|#L)?3H3;-yxtOl8c3#8#lq!1to>V7h{5ks&S7(_4O21gY ztZ+zWy+;DT`q4w2an^7ujigtpaVtM@&l{d==LkOs);=&7Gs!eq0tp zV_;Ms6@eaDcNZXc2jTh@_o zg0rQFgw}Hs?UbQ{0Y)R~`2jO{t3q-uxnFnj{^!L0bcq`&{8VSMpni^)BiZEjZCEcS zE5dzeV1~yo7s4LC`UE9orXs-f`khK}0BrWd~JtH}`AyL6h z&(}B}1Pf|VW+`ofTc#xKGI$njK9}XqIAv*_$fQa$+Hb$FK(>vB*pDT!juO?=tj)I1!D=2(J;QQ(iX1D?Y`y)qfD1*}jfS?d zn*KZs$2=$Oyfu6|nh;hakIt_rEHe7+bwmOO0UvejIEQ_-n1UAQN{(sx;a#C`H+ifT zoD3CT8bgSmj`!najO?8ce?<-8Rzl__koqH|b-P(b1NSEGJqTwq!{-~&l<6tBo}DVq zX5Z^&7q=x1>eNp@XKK;3*fLxJ@mlJ;jv}cBP+f6FD&pLbNqoiV*({e@4X8;Ubk3bJW2^cHedAYhf{Lie1OY-HH)`y-fv3Sb zAX)%Tf5HAqxrwaerHMh$1wMi>!FTU6OAo6ukDyUIPRt?~P%LbY?GJxT5%rP4A3%K7 zgMohdxh-G-5Y{YGo3E`c?QZW#D#5nmVgP7=ERa5g0f^j}(jMPS+(uv&+@}LLk%!-$ zmsM<}{03@?%7bf;fh?y~DxJwFc-Xlgh7j48NPwZR{TPrO(5<@Bafb$Ab!$d`ynp{v zRqrX(#@qF>+lpRefZzMsFw6X|OVYN&-~rVoroxFDOK?eg{z z&d>Ye){|Mi)qu!oFjk5Wf7A|Sf<-Gsb(6xFFK1gdy}F}G8IyBqt95vYtH~YXd=h5% zJ*Yx2kL?ZwqVjCYUD%yp2Fp?)I^=@0QLBX1e*?V)%iaNzM7gBxc7DSumqNN$o3d?8j|J)@n98!onomHe z6z)Ud*;zPg-5eK|(H$15=rI|nOD7mjb>J+Z7qOF^-2McIV0Nzs1-einq)-|ZW|?GS zmurL8gtYS+GYxyiR}6iX8j4g<2@ZMD`i^?&$L4levC>aKt*6ZxUbK) z36=}5YJZLy;!ATHx6cS67S%&5CE{FKe0U#oP*hIPUGgdh5p_!uM+8DrsS>X+Z6uYb z?M9$t5*G$SYUaz$?{cXO6FA^eRV&lPtu(lZ0k#6S2|344X%zRo&A2vSSrPGHs+iGU`I2DD{$p7%F znr7TuNWWZD!?#SGZ0$e76}2k<*H zKTfM_FI=VbK^&L56*I=$Di0Tc3XS;F>9Z9#k3+`c_p!l8(%2mfR9c6CZvK{%o9*r< zxb3E2ZN^Q%&khzaJs z@qsXw{gU)U0>JDz%v%G-YKL8kGw-ZLzp@cMJ5Y9SI&5I?X4}3qYLX`ogo-%uKW+gJ zDx^4~uRZ}vS?YgS#8~CX1oMMT4Fhnf0~cqCvQUdu5mXGYK?K{WW%Rv21RCobcPL(i zQwGF-{%RB{1o`+RELaJYFJy4Z6XAF=jIeJTNIY1dK2EK`8=cA-2YRki8KRNcm@9nD zZOTR!Zt>kiVp(fTwf7ouyhXIB7^&~acTb9L=$c8KO2b7l4pO7?gMYO7uNptDeLIfu zWPlvXR$8{KeP#X3or$A>8%%pab5Vo3v$rglvDhUVWNf9TkGmk#Vs$@a=J&Q=QjtTQ z-2c9MdDS1F*Hr$|CVL^qOSBOxURR&N^ffq@;^wW`+6tx1Z-BQ+yIzNUTtFB@-s?Zx zU@K{%HxCglY>!SC}uCU><5#c3CB{@t@%d=eSKqAqW zjrQs`{F&acI{^fa5Pi?oyHpU#hRLsUgEjz$Vq#&~c~WFXu&g3t53n@PG>K1dg8Wo}YTFT=DpWV;e#iB?XK;eK-y$csaTFtcfrV2W>p()U|c zk7(J>;K>8p*cXxcF+r-xaULLn)C(MxrJ)k5MmURs8c9lSAPrftw5T7*+i_ppwdURC zNXT?qazXktqaA0+7!6!A`w|X)5NYR%x4h%dvu{{5LNSU|aWxK71k(krP#LE$oS4no z5_PvY5^!KGIIum`e!2b)cJ>ow}l@gDHA!wzWH$Jcgi5dQc<*Jy`?-n_3l0+Y!`Phn6HQl0ID>Ww66;O0Yx zD{e~-#h)5`<(2)_0HqUhQt37S_)ql$z#QNA4|Dv(zs>P|{$h^rFhI&F6NV(ZO6_3Z 
z1@30VOx&^l_~scT-QSmbjl~$EmCYuqH64oT#M-pwq(m5Ah&3f;QG_-Z-lrtrr+$JO z!`q#aGOuSq?>1pdgCU9+kBIdmVa`An2>MPjFZVBM8&G-9$8QEM^kmR!g3G<+7T?-b zeo3Xq>27+|q={GU(R#|+qfwO5XdUTf@49aftuK-8)Pd;NQsg8krE%%;Dz7eb`GL<< z-n$ZYoukf*2`Mf5;FZ01g^paV?>C&wQM`Ahg6XuBFU|r~b`B!^6#f}#*yPPLzV~dB zPDYWCd37@)4O}0goAcM68RF`7nOl?{tZ>{7{3#qUQc9Nt58b&F z*En(@?kCHc9ey)Vz|y!U9I?ga446j&N&(qRiY`a4*-xj(*vAVSz{j$EKAg11A@crqw6sCwCNoQJ7AJI{9(|3$L9o~%algZP?h4i9ZGm-~H1E!y4X0cjMV+KG zIZsTjWW^1HdG20jm-`V#Lp|z(qK`=JyxF|QW#;Frc7`sV!;dm&nBM2TWQ2?fhc0Hq zeBO6|TTC?}O%wgv3)uLzzoR^->qRvDCOY~5D^uGwNg%f>8@x4H>3Y*DJ*&k^WtU>` zk}W5wrXH#0T!gvIy;Usw;3s)DZF>5*-|)wUMeS(UV2)tyaQ(Ig8L^cMW7^8jlLnPt{j!ohRD zElE`W5*=dw5zyxMgPzmO*vkf4i?}_yLFj4|6&r@Y1;ZOo;8xGVhM|WVLFOG7k-vgo!qkqiTL{**Hm;NE@^wzOsV~Kp zFK7r%;KRX9{K&*O%A#V+ z7p7bb^f>}%EZz{O>p2K5%Jko~WLQvL={L+XL%R6{^*o4nifKFX&c_&2 zV2B3?4)kovqx)M*5XyQ%T8^ndr~>Qfr?!Q*RE26|O&G$i&0nC3^Sf{f5Bz=n?>Ewg9Y=uI z&43iHN?!msGYkLJ{u49;6ox{;k}2fv7X@6YZudH*rLE-vrW}8wR;ya5CExUHo}*?fme0?jbIi?jG1Wgs4jFp{gQ6-Oiz`nob@$mF zMG_L#&uVx`I`HcZT9Zah!1#;6)q~BZrX~$CJ`d5`uaPd)NLwGQ!8m%yQ*gx~dt5jN z$P5136Trky1T0~GF@A@ZJ70_`X^f9;_=N2+R&|4BaUY!;7d~&J8TFEWjI|b#!zZdW zx6`3bVlq4FUH*%1t@KQsKsSXZ)lF19 zjK7GwbYO{A9|mU*)3J*AL_;W1nWgi`cEfkrbWUJd4zLT@)X4hRk6G&{mgbSrTFR~}{dM)@VyvC3G6@*!*%MBKr|e!-lS5$*lgFeAU$yn}Vg zD>Q|h7Arx=_;p)Y{#XE;eU`56M8#fkHTfSR(qty??cC=g{SuRfc~djU=k%EP;t}hz zJO+6KYlYc?dy@(*Yx1Vl4#;TpXz-${_RsK=Bj80eGj5rYwpl6FGgsfa{%`>1qdApm zt)#M5l11{j%zxBNqw0`PEUKPrGUsqHMLv>6UW2PU4)~6VE6*o@t>QyqWy5v5= zqda_D+4B`y%AEkK5`8ubtt&NE3QXdpF<1ivAdW`=h$9{ohwI0{=ydWU{&ortEm=_> zYA!`BQXY7%0j7;~qCOQ3hr_%^#@u=@AlJ#+WG(yA zglPnP8a;DAg%AySAzUFew$|c`iH7k+Wc4zgUF_p+Zf6R4o)^JL+onT2WZRek}^(M zo+)jX`-(&gS6EMaMP1cmxj)7mI_%aD!}9q{!i+5_8rzU)-(pAcY)qx70D`uuBH~zM zqMla90apE%2T*T{W<{xZvrivvP**wlEvA-!Eyt#n^lO^f+Cg*ax{TN`fb%TsM{>S% z>=gIQMfb6M_s~a*RHU*BKL~FBzKFI6vHB0vw;Gu*KtCD@>g;*R*M95crBt4`3#Vzf zNEi!%W?aRccI=Y&i6@vHC=WvvOZA@MuDvTaPyHcoIyhgeu1o{h=04mL`+gsc9^!mX z12Zs#@`}gZMY8;;#06`S9vx?6sOE90xhF0i*|cvM*!_Zn1#`;ayei{$G5{BI`5M9H zMeI=2Ll39NL3RQcSva&FsFA(wiA1dXj1vKcW^2Nt9sW6f>1e1u!TrM$Jszw39Xm7A z7^~mS+;Tx|f*At7wA4I0`~9Ar*-Yb2Q=U9>GGv~OgS=Q2S_xmC9}ahMTLOZU*nqYC zufa(r7V=L6tn#ujH1>zFt@@0ei56UGLxtw%pf5{*w5&=TIY}n!^(Kle7R^7-TQz3Q zMFzcsfC`T9=@l=G8I}vpPA@$5weV=~;g{yB7^U1c0@??JTQ>^36m?9No73nkljwn= z$$dGKeB6OAq`YR^u*x&+(pSpS>5AfIA(4kubY->r&Vp%;@g27K-d;XNNE?nb0pZG# z3U%lBXjb;1CUjvT@=(MA@7~ALfpfvJx)HjHxTmMy`Ph*} zO@?Y!o6TFUeM?J9Q{m3IGe|9pINpf*Xo^e}cU9F<^o-wBJk&CMmmh(++P-e_j@Wh` zz=IsOyzb1-e7ZIxAkB;=4SYE~H7ua$`srF5x0^S0Yfc30u4C_ID1>5UL;uM{fGl0I z6@N0vR9BVURq8uxRnq2px4{`&^}&E>@DPr8&f%)>WAIzy6Jr!?64k4Ph@=v z4tpd7e$;ZcpZ~E}&WagGk37}0!$?Y$eE3}*!hTA1|+TBXD z=f=)+RC&Nb1VtzNo(B#smLcmC~Zbi_)RMW?;Ni>Y`)rbnVsEt7`f@nx$DVzoauIH$fIr@bt)O6mx zOXKlr?lEBDZ%rrLTL1)@W-kS3qQd6T;+;G{_gEfKdZ)Q*i}o+j z^2~SPHs9=eW0hprV$?geQaNAQxe)v~yJG^BKcvh$uo``R{S-ABlAwYlCv$IVTqhyO zIw&!1>EH3nSb3C&C5I>m2@-bdC%hEFTGdV*V)sab1}a732#;m^IBI+9Ijt4YhQESh zvz?Cs1y8xsHdZ}CDp8dQ$e7pyVVqunSlLLG{Qe`pv9a78sFXMOcO#SJ<$WsJ&lD}8 za5>Cl;cFVU_Hoax*+Pz0BQqww=<-;NPaf1BZDuDQ26b@OPyUz2@09-&kuJZRNOjM6=|LGBu!W1Z7t{nr46Q~ZzUBlf z)Qog43XdD_o*GqMyF4nMMxT@3@rtS4enmDf*(E^3pFeBMqq=bbSV6wX@c9!BXaDg4 zAM$^|8{z-TIh+bhG{Ux%dCA|aT?Mavp&S%|Xe+jJ^)T&7HH_*d5df8sy z>`F*ZAcH+A%;F^f@31g`!^MpL(}C{ykLUl%5&XYoTlr^l>HD+Y>pz_$|9dyhKi@O` F`WH_wDJ=j1 literal 0 HcmV?d00001 diff --git a/baselines/models/albert/resources/albert_performance.jpg b/baselines/models/albert/resources/albert_performance.jpg new file mode 100644 index 0000000000000000000000000000000000000000..f61cd04e4eaff20c9e54e327e5c5cc627a3f538d GIT binary patch literal 120584 
zSy;Ecp^Z|9fS5tPPH79tfVc5otRDT^3nw}3g0}vl5{24AFPge0;?xlDBDs7%f}x zML{=Gy&T2%_ShvNq^=n60Hwaoe<=HO{G#kLksmiFmulhRs)w8BF|?wf0gtxzcJyWo z^^7JWN6$aLE>`*nU2(%OP(T=HQ?aniXq%7TV+})@v1A*R^S)uAn4N~}J=WHCd&;Ad z=53HhnMF`(`&Va#VnQ$dSf{BeR^oia$Z5c80~KVgN-j;8Tb5$pweaR-4jiPyK~BH3 z1Gj_Or4lNj@pMy0O1kV8=ipg!QJ5GPJ>J8+sTHW!2CSHhy z@P6au3t8Gl7bV{u)xNF?m)L?YWRAV-$x-TH>f_aqp>h)_5VMR6meVKdTitnU*N9Rg~M4~oAvnc9WLhmdg9YVg*k0uON_t;w~v4`6iKfweoYyNJXb_eE!j7g zDFHgz+7sPbnfsTaV0Rk2Z9#aav30EVGn-JwL@rrCsI2XrU|Thz*b+J$!zgH; zR9$OM3{b?*GSMs@cM6Dh?$d;;*v!pM@!y@!E11plL3=j-*C>vpwlg?;4xxvGKPm1A zzH!#cD7VbDEc4_%va3tl5R$S11A==QXX6V@SLun-WCl^10cPmaF4sG`8PTX!T{~uM z^AxY5Z_uqky7$|vu292;!^jT&3irR?O#-0(F}mGXAyvg}iWLpbVX#q0e7hIoU!g+| zw<<5hg`-*WhH^-Rp|P_fuZF(u~1< zPg4y}$BYg?EzmgCi?(%=wz?4)NrHpCXrm}?tX5y4+rX-<+QiXbrc8rFl3>(hr0Zf{ z*~iv!@J{??c;nk_QU%|fR8aS@E$U>}(*kO|_MErlSXiew==xs!g|S_+R8bsvi(Y_$ z>iMs01X8u`53a6`TWJZ9y7pF09hhJ=8MpQAi}N&y(OBaF*D5ahKl5<<)hBZD=2u&i zCiPdT*InOq*s&?R%+VntbYS2G{nPT2QhM_>6S1Tnv7AD0@I|-Rur{`cM&rBwrrwbn z=1IeC?Q2vkVHC9y#(Q)siU6lul30o!qi2UccN1#v3u=fETnvqoSY_ZH65~vL-w^tt zPUC+^FVk(arBt#tVjWw=llnl1nnw%OFkIn&5DuDt;49UJ@_D`DKF2mIYQ;15M*d(t|b82AE{Ma*ux>#Ad^(?@4^}4z45F)I4M{DaZKp_<(L41q{UC9%kNHY)4x(Nej$4U#~+GQ zKN((CMVYE%oUHK|@QyR=Y&Ibn$MEeB(6)_$4=SJ9=;j1+7c+msRNQCh_#HVhfY92` z=KKQ$Is5^74ZWrJ=MCZ)F|B2~>O%EDC*Kk3lTr|uP}C$*i(#LPlXyQslT7-y3$#Bz zuIBZuYCLzGf1zD=TF3bCr!6}1k>%o}+Bo$gxT!*mK2c2hO>Zc{{NvPR|G=#8lywsN5s9Ll?~E2e{aT zu?3~M7Ej=hzpX7zZCk6=7c-%)wy)0%yMT-VdhBs5{Gx>(Zt>U9{&%0jf3ax!?}+37 zv)_G8v@p;;Y50gJ2l-YZF%hWEo==`Q);O%-+#zRED*)rFj)Y`A))9LaV(}eIfcMIH zSE1YR)6L9te|-gi==k03g3P;aiHe#b^qLEo?#_w!g~jiGC&&NJ<9-LQ{~lN$?f)F+ z>-|0fe^$~LuGs@}lGIMGE-4}+1F4*84#pH0k>C=m@OKMVbF z?%!e*ki+R=3;VDLU(>NS==p^RqydoD$BKB(u|?J3w5|iztnpIku>u${UC)7QPBy?I z_x9~=u2O_)gCf9DRY`2*S#^m0zvnDv5NNo<0wGhDXr+T7k-q*G=RA2GNhYq=u0P0+% zF)R5fQGStvla6sIW|rsD$*yt+*pDkrq*&+*6ZDCuu9maFh%_7w6v@Ry?0 zamkOApKWzu%eg*W4Ta>R5D;Pl6Dsq7v9Bt{-LUeJRBUjLTcw* z2P3sE_O830GskJpMgSVd;S7RXD&M^{G9Mjxt(yn@=h^JPkSHY)R|hkC(eO9lv;oYDM30~p{1kCx#NA4Xui5- zgXI$;5_UCZYb}qMjwlmTVnMCMR)oKpta;(|(z;e3QB-yeKx2KlU67Z`=aFg|H7*?I zQ%rk%pNaGq0MD*{0#fB#F~!Ppo5OxgiKR;7^7cCh)NEI+R1l+RHulQS8@OUUvwRF2g~S zA5#Yla;N<>P2$H0-n$O)ZPM+n2%gZhfOmvCLDnOxOwVD@W130=T6cKy&d^h9f9`@f ziMzy442l}#Jdi6N>W&e=_Y4+|BS_JB)?H7OiS6|Z=Ivil5qivbZ<&nPbhuxV513=S z%l6s13DYi^sex|p0RJWY84`*KS&)i72Ll6gvHR2Z`KUYaQ#AkQBO1_eu@*}wOFuy5 ze&1joxl$hi=WKOQ&=?hPx=7QNd`NeGRUPw%vE{`M{)6aJKnWnc*xD0l7kscZ zF5sUi`d8!P-(jcnz*UA+3Wc@z>PuHA>W5?EOzgW1%uMEgdH9yVSje4$}J zcN?q{?(<2s93+Z^uC~bOz((O#yggaNAwMq}BOgdGjG*_nmt)M~xBY&>wAVCRT-O%> z=t}>Z4r~`5{s%7W?FN|%8D8x7@v8rNvfkV504`VxatiRVe}^xrE*{MC#4XI`Gy{rkE9{%^vimfr+0hQB$*5*ie&YIC&`eyq}db{R^%29U%g?F}qwbwY@S91jF` zgKOdr;ke_|;1D(fjVJfD6javwBb*)xLQxyZGVTIO>*8q#20$^P?LlRsc=xERRPu*uU zIAs*N1!U?aZUq3?!D~PIYJ!NeoiQt~AaSEM42_#bao*F`yw=5#)x~AP+E^2Cp%M;+ z-X*T{?qSNr!-*(V$6;BdBUX;s4yVEee=k#@Jc^~Nmxxcl*%wT%MH zJQ>N-$iPThq`hjQSYP^rB%mD+8vi+g`=65Py2fp3#+e^i`Lr(xdt$6P?$zJOs*b;g z7(OW`_2^J~9GBzXZLCXfyvrqvT3GwWE@N#yQbNxQ+6oXTFlA{Pyr}^*>)>-DbOImt zy4yytG9`OqIE;5dy3?Q8gOWofrUxIKqA?HSsR|^KJ^u)D@F#I%%1Clhcf-hOOL?Oiegr)<4L8}qbx41(@VkmM`{wiX+`+F119Qqq zKCQ}0<~5VRm&Yx2^;lo#FK(VSlZ~6yhm~xB_(4bkQM?AxiX!0W9p{@e6BbFacxK(1 z0@lu7rj~5VFuI@P_V}hpZ-})Q@0Mj_1%U%0BwLJ{RkDsS6Ub-hTH4{8v0I;Qp~1y- zI3vuY-uErrnLy~O^^9J`yV{ax*vsa?XcZ2h<-`3ch5(9_T-~cia@vG^^wj&Q&H|&3Eta9uE9;L7re&N zT2RT)yms$s*0GQK)Tq23__G1^TVbi5EWM1O>BPOpft!i$1w?B5xBTlj`nUCzn+z83 z7-T6{AypDI$Jy+Wb@#tYTy`3I=xKZ$C+h3-?-H7KjF0VRAEt;@wSz^DVq8x}I>(nS zLv7BwM`4PRxKA2`ZO@)P5|J)R1}trwtQw1zf%3g$>1+xyRWve8vBkCO{fS!sSmk<* z6N1!RYONL`BOur}ypCM52btx*)$Fvi(m!H5Lg5sB|Bkc?(zujQP`--0w`N{*8J>1~ 
zXwL*S1P-G+l8&8+Pb;fIwUWkz$cd0%h zRujN>_a&boJfN)L4UT)%H4I|d4L}T2CyqS-STlY*;^y3VPv@Mr$-*PQFYy?cdNVfu zyJrvWMv+%!ZuMs7$aZUwgJx^ju!L^@*mZbeo_=$Z`z9+Ic*8*At6*pWJJJr}sxXR? zeKWotVl9b;G?`%;t(@YH?R516~#(pvK6jzJVJNN-W6zQJ{#q)#kHdY>TY%F&3<`(=a# z*SQD)DY_Huv)`+jQrgZ@AXDs}cIOoMUOHR$TKFcz=6s1oIjy@kaj5L{EQeif?30Zh zyLj+hefq~0&(6vHD7Q5A2p<&ICkRhJpy&*h-psA452V{lWG$rYE8n*M{OzEi*5M|0 zoG})M(;rD+2X`_#O8aGV_?+^U*v#NJVdv?Lv#eDpy><_ObgH=g$`%zgv2ceoaVj-P zA{+du%^XZBOzsAiKc>%1@kQt^CHD~FCInEvKu}~3qZeq0q2Prze^>mx8)c=25{T4x zmM8I8mZYD!uMCCKSV;jLyeflQBMCU~GL%y#@mx1Cn76kaV?oY_0n0#De%;$V+=~yF zcWYPlFVQFw+kP=wV9rB3vD}SxIRL3_Vv5~6U2)`NtcrLJs?C{zC-HeH z=ZVxf`&SwI>&ysVGJn719P}(pFB2^tCVU%xXFD38(=t}VgnTP=#rV~k#@te{Ii%JC zoH|v4lIXl3#TcQ2T%1O?HCoR91Q@4R00|9bhoNW+vQFT5dw53B2cTXcK;d8NH?9z{ z=k;4^{b#yOVNe@L?y|7(y3N#Pt85*&uclF%scQ0kw4*I0j9%ix`?gwMH*j!Q5zq3{ zJBiy>|1-4kaCh#gMB6ie^bp)={MtOh@z!zAf_Qi*Z<7*tR+L%T2#jlR2Lw#-zHrsR zOwXS_G7*tN{~oZ#kp-X}7vT_#&)gxx-4{XL-a)sHdLHM_C6qQVjggE9ddQn3xU=gb z4{?gg4HxHbltOy) zBQM|M$2&5~G{VnEvh2GpE-@8wW1V`0<5#^obOk|Rl-6nku?gu1$PKdj%mpXMm#E45 z9cOLKMiv$B1t7(c3f~R>7WH~JTDJIlz+Gh;SS(58UxGL;&ym2_xSglSzLNeR@7OqN zcimu|H*avyyfM!AZ>%95sjB_+Hw_C|;mc^4y(o@jV+7c@MYJPIqZ;d);uB9okmi=y z8*mHrSG2oR7iBc>2r8mFZ+rU^6{WsW)5<7hUC?=p@<`D+$}4_qc&w3{)GtRS`u)Cw z&t=h+9?`g-M2bmde_VF~4?n_B%SN)EGT3HAk;eVW_=N%nB+~6}7CwCh=mkSR(`NI? zbMZ5}FQhT2hS$;MpOn>bvZk_#@sZ>eu9UT^+>9@xM1pRHfm2@{0_^1A!h3w4ttM

m3^YAtoC4Uh!5~Lp;<%R+C{`Y>24A+>^kQ{tL)k;CL_M82d)Dse z7wG#r>&kKh3BS^>T@_@WI~p@d1t{SsviA`Tw(|QA&H36;hm5?Z@~2X*{w8G5A#a7h zE__aPENE_WOW}0q(mJ!TqOm1vu}?{fDJ#!0B97%ax;u51z5inL;JQnw^>qoxWoRj` zv0m;kK^NpOyBL6-P&>PiY50OzJEPG=6>un%+_z_cxuF<9F|&JJB#$>=Rg@}Qw#1lo zAKie5@)KNZRxXWwP8e4m!UpxTHxxm$Qd72coL^`fIDZ0uJA5;tpYJ4VHO`h(A<~PL zp&a|gfXo-opMST5Q*BbcsOF1>>%98#=H~$>nrOLzrzkIkFa*Rr4~p}52p>}T^go=R zxgRth1r@y&b3h4v(l+BiISuUU872(HyAl=E#GEerD}QBgUT782hiJQYprAP@KMcs_xs-X3pgrRCLQf?zHW7-|xYIsL5YBWfl z_+wDP?n9CfKI1EX6zJP@JLtwFjEW@|KMks;mA3&xM>lX>to^gH(nuKtMhuU=v;VvjZruc^xr zKR{;%V6{QGY;fa+5`E>z^>Q#Rfr+kGEhaq~vWMZYD?6zL$-Fizm`L}teCBGM!kC8} z!h!(h9qa8Lbc}W`D-&m***3=@wlAVP6YEal?(!QC*uT;8zuw_r zzyCj^Z5_C}%AldQY8)&auVZ`g!PCcTwm|5c^U>O)kNbeQr7K65xs%yGg<+<*}#3r~P4( z9c3HI>&r2B5MUv)tA#n4)wj7(LOsQ`4*cephu_Dp6(93dnivQ(3ZqB7Zk)VEaDH&= z8EqF7t^!FCk(WKI*!I#HXx6Ck!c8ed&m?Z8X!qx^e&=k;TuF02g=se8@OAo2?Q3vA zw7&jiLrje2Wa&uA3`t^}QFp+w9qnj+Bx}FyEnEqE>oMPBuLn6ZF$Y(X1f123nyWI@0*O`Ay$bk`WUI=m%G9(a&(V1JHb@1rOnBYQ6^!}ee>DGOz+1&ufjV6^`6xNjEm zoF2|a$3qqy^j!gERU;L`qk-S$ZXV&aS>JRYkLheNCkRinQvq?(3&W^l;>4?>X5He` z4AZ#*WPEcUnkS+?Bx+rYwgHS>;=-ny2! zkUxWdWw}2y=-%vnwxA_z|De@2%`T#J^bRfc?xa70RkOi{4?Jcf? zbkWd;qacL3gJHy%b>lmcbtn=JS7P>k&Q!d;Hj4|mMi21C?(A$){jLH{%bbW%olJ(}@XF@+lH z8(3cS(K|j2dCkLX4HY4M504Wc#CAC&!slN~jz0z!@q%8U^7?a=e!?WRwqzC@`P{EG zJrl9)W6dB|e&w?rAvnZnKggDb*R7}rWt8HV#mc5%2EYeJYN@Kh5h2znyoE9VwJ}b#rGn-)RpnyF)s6hT-y_~zGI>{NyU*TfC92;^hl$nScIpc@IRr$5zZi&7j!^EqgqwHlFOZG;#yiXvbRAVendX5s#h1vJ!@ z2z~lQ!Q&%$OOyy>$g*KfD$kAt&E2hOh&?{qkhvLkhGrdVXjKshFdq+J7j_x2?YlPVH!<-p~jxgyU`&`1h?&oM9D*I<| z^_(lcZHLABDy5`XS0l zw+Gt@pN=I9kMrKN0}++2Z-+AWIn&zG=1~Hplxt2JKnP5C_IVnXGZ*+FXSOhCn9Mn~ z^5agvO7Oxkbyi!qXhuSoyQ!_u!+599Nj&6bmNkilH44d0@m968@$rBL8prCUK0WFX zk8yM#?7M2fSTeXLX5AOLv^gNBXD;J@#|85`PzlD7Kaj1BkwFn7B3tJJwjBoXAoP2x z#|Lub+8>}9$QAv>5wA4D%oruuL7O5CFm_W5PE4{kr0z#s9d%`N)?&;GP$NbU#0EJx z6L~;o@Jd-r+CrxQjZ&Q)w|?fP^;xWs1F=y7!oAn1ZrvhdL2O--6bCq6rs&_$O0_h! 
zGzzsf+RY1h?Cq{$SX#LLEr3An2+vo&qPhIr)rz?@?zUgJ_L z?Gp_Ve2$E(Lq095?V&$6eyYS< zpc#2Pz;5V{tWqfOnFOBmGJ%8#WNH|Q>tGd}5pQVtC~;PxM4VNMn_?ytqk2+x)O_AC z27QMi@mmTqLP-9pAcRHNwktY+p~s{y;xnbXjW3v+;O=)nNhM-Ks;D>rSh-3u7Cd8N zl2gTi_>7FLRG;6@nxA=_=i-~!k+#|OD}x(bR}=1?y%-i`#$S2R$D>$M;iad=m{a$4 z#4)2+0q|RRlmMxGlzkN|F%3D1IMek2$gv?9*1`KEjg(BV6}2SnKp6Ls6Y`cIaK>zom z-qDrk?_Wo8mY-7RZ$c6*tsfMBcjN52S&pV0qRe$U?j##Y&Q<7}zvo5LFN-OwR1dV{ z6V!}U%%{^7P;oJ5IGR4_^t>3mCvlaXzct`;n$C@1^;gt z9bEmWfr;VFU z!b|Y&@SeF*7vJM4DW=5NnwQ#`&yUTBkW)Y6CMwPJZw( z&&Dx=R&ai<7E$p^o8}XrGBBT8QRj%%t$Gtqq`>!L4G`D1(bk;%!~H;be;imNMLqar zYGBdA)BPq4Zt7#eh`_6Z4W6%1cQdRciqjY^4|77-jgmR>alVsej`z$4g`x>Z{_?bR z<}^}J#-z%llFXc*RYI^HokRNd#jI^k zySGP8!zo|#*;@VIWZN*03W1!$hf8#AzICg0Lw6dUjE_KYR17E@DXbeydQW`>SM96# zy-jxO?Bho~Qz2WA zU&`|!14`iU5-xt10EZzO@_k}Xmbb6St28!lXM*0 zrK1o*qcF#3%U7R+h@~$Loa19h^M-Boy5&{g8_pjZ!ZVg)3isI~1$%4T?&037^-{h% zh4KDtviV!RTmJiSF%!8SmoO&;gA81*5rRlFMoj4|{_@6IfElDX$F@hFwMCmvq34r3 z>(~W=Ws0+3vc@JF`IHHj_W%AyT8qq%&JIH8XON%Q`W5b2fP;P#J{}x2EE7wiASeNw zINR^AN1T%T(Jut?FS>ZH8Qf0fDqBFh;$%^wS(I-#iz0F57C${^59U_b)4**_Q$Qn( z=Y#W-B~?6iZ&q*_bYMG$F+I@sQOtN2L!CK)_muhwR==NF|7PxMR<>aveN*BG^ZXFP z!m3&&DH;I7BGbX;XF8?>K9M=b?fwLp2!;9f=ZQ;R=3Wl1jh%MtHv%>z=Ql8}y+Izp z@)l1W9~DU*h_0Y2AhP{l-qtWsiUK$D03fOYeZ%2Mkj<|{Np*4;;Ug6y0etW!$!a?h0s~=50lqI=TXB@+l8^hy)r=)E|c#3X__ArYgXrkN6U)4J&h@=~AFi#Q13S|G&mB38`dLQTPhsr|=_5C6#1azG9~;a$>0uBR-{B z_r*sFTiJiLAY@AmX%3C@N;pGAg)~U_%HqC?Qsj5Mx4h+cG3dwpKUH*{`ad$I{c92a z^RoyN5l8Uzkkdk)I`LO0!Q7Nd98F>%vl%!cAye}rOUqSvJmU14LHggQRPfVLE?u>= z8A&xfgV?rKo=$7v@&7)y{(ql;u7}A|*P*9LUrumr7?sPVyw)VJ+<*57e*TPsR$L5gALq?cSnAFCFA`qr?dOlgA zoCU{#R>tmzzxWh&sgJXkGW>$q=d2{mnK72XD2I>o2S^dkpBDi@&K~44Ii$W<(%wxh zXCIX=+5Q3I8N}IhjKCfJ+yHv^tm2uA36@H1L{v6Bb@g>U)e*Y_`zQLK*v9pU!c3ga zf6b|)G?j=_EpZ5tjC_Kf{btm)8g8Jf>b&iYVESK3@ZV)b2$Xh? zN#z-(E^-~h(UqhCrZD05-v-E($4TNsol-e2lQO| z%9l76HRF5CbEb`b1MuNnasTIvVysfsVOVixs^n7K$o`aKCO<%SaK4(GEax;A?&~8h z$6m4mny{c}+rmBUAOwBmh1z5$X&9lv0vDNCR;U5fH}Kq?Txekh7bw9q6VTVB8hXK zsWF6UJR(fAq0xw(r+N-347$B*UfSr}OV zR9$NyWu%dLlIm$JBoYGm_ZT2yj%7~TOrgQuGzOWInL}Y9yZET8>Q;{sVy1%? 
zG)o#yp4ly`d)_#x?x!Fn6Lb9PA-(M}U8`9AlFzD~>+Y9w9*AMUw%|{@y+6JZ^m$|u zE6wdcWA`V#tIVx~w+(@TTy?z^W4zStQgcs}?~&KP#fq9&m3=6gBy;k*7kruAO3pn5 z(E=_0IrjwtT6psK_wfk@P@S}lBubkG2EH-Z&ymDAWX~-v`tuGu_vDr^QA&T1u@seQ z29#qF-cYZFUqJH=xEi)nOs0u5qcp-9JVt!S9+beXDe7#6kgt!;%qJRCn`B5Wt3~Cz zGv(G@@gAm%-yyny078i>>ZFj6Pt59r8HcZL9E4|aGkKPMDTBqbb{^Is0!9y|44lvo zq+^Hl0p3yV$B!8%wZU6|xL5=`j#xHyeT3RwDZYIrgEBu49?P!=x#a}-dyk~KmioOK zIk2@89P#8yD|m!Nvz(C+wQ4)UV%1(~=ZF7$l|k;OzZNj6@--s%zU_@Y_v+7y!Dj>7 zzue+~FMo&o7nZ-}OCw_cC!Q!#Q0c&vTY;xPK#na5nnhF4BRJPppJMw;s{5zEWBRhsh7ew5cTu{Qg{RGdi0yVYK} zpEwd$&KOD-3ZJ~4XZ3=9D9l9r9wJIuZA+)riAj$-L}{3J)ObHt)6}rw=3y-FaojZv}WKN@<7%2z;dWH3#fV2hcN^)t+JWgeeeE z%<%A(90K&N8PbD9Bgg&dBUtVz6R{{mpYXcxQA?Nqvu&-fsUf@GF(@Mu>yEKw`30p> z6}B!gKDjhmjM@DJV&g&2)}hhr0>D)$={)gHGrWB(b4PwybyfsvhfRo#DKE>sa93pW zeh>eJ4J#IUXmY8Pz-DVF1BeK8mqAh7(3)C2UX~Lp+w&WUO;7XI)k;#HVZu`T^}S z9)tqgT`S%auLgLnGb(C%9o-gn^k9RmoD@|>oJY>_>}p?unS4;&U+X(iTy~U(!}fi%bVp69K#QZYX4F%8TS9ux`18ay z|0-$l5{?IY0w8TtYo8D3wzzP$3p{&%5Pe;VHGLqTKR!VhQeM!?oakDO!}_*}G-%9G zuz3OsTNHKA>gM=_aSRe?i2!$6OvBxrHAU$g>r9k8BKW*XK(HL^hpf6aDFp_7IsRWF z5_SMl%J6;HI)bhF!9Iok71n7o^EBV;F^Oz=O+0M5$HDi6Wyj-iH}%UTfWI)P1TM&L z+bpVpFI}}n-4X3drQOKO&~5FE!wvvOQ>%Y&s;IHtS0q+3Nu;Gt1N828C$XzbhgN!P;kABBg-^j13pSnbEzdf zTqA|K=Ue?%51ty8;xvWOCX9@d)RP=C;tL0DGeQ-vbk5zK(5Ax`+x;bNPNMY~SOp{$ z-|6UbYQBu!DjmLq>cp!GBh`*pa>nnu@4y=sfu8?eO{e9U^V7KBHa=1{Y{1*xW-4*a z7Sms$KtPW7te98e>r|?X`-8O4cD-QoI|SMt&|{3GofgEsd=K|q;d?-=X*s190aL8q ziuW`FO;KSE+VC~t$dK8*-F47s#Q9E3vzc87NZ3|92gJgMWp$49uY=-m8=}`OT_}|n zW3i9tw0oblS~QlwnMP_ObIzE!zo=_Pl2+q(e2J%hvkGGF&VTLCrsZXR9RzFHjq`h? zvk1%4>vo&XNbD%zNRa=E``RlAu5y9*3A}U9%G%QL`3Fq#uO%wr7zuQW6j;Lsd!+Zi zGA#EP^+U|3sQfzW^3u{6yuJ!{n{v)Bs+ICIN*(n(?%2ymC@hUDxsM#5wt{xAc}j2V z`VTKjm~P+wFY?|pF3PBDA02XN1nCw~5Co)~0hE?TI;3Oh4q*sEMHos_q#K5iE&-*b z8M?bcy2o?-y!kxu`#0ol|=t$AaJGxmfS(#)_Z0}K~!lht?6c4zH1C#;xI5y4C?-zNkN zM$74p=Nh|}!_5zRU|UAQ!U^}E4*dP|g0tUNu|(Mxh~9PS95Zv;^F53z){<((xS0rK zVf$GKnNMI$%^xASrPnq@TBJIKrlJk4f?iW?OZJ^3?x}0p>Rb+t9#FU&mISTqsloy&G99P*PT`<)wp#b^+grg%AzPTL8;@jc^pWJEuEh2U+&A$7ORcV;Vo&p zwr~Jjad^55Nkuqmp?u4;#t{aV@j>N3#T6>g%QEqSg@~r{-_>tPL}$E`5Xd{>+}F;GBmEC+L9N= zh`bM3*o>@ilX{n(AYIrvg&s>xy*A!Z)FN)_ zq#0y(^ zC_a(Opl7COwW{B8@D5LYJ0QtDR$*z3joSD^4PmRp62Pu&EZP-&8)St`6;(IV89&1! zDS~m15P$0YWZdFQL}D&@bD31(E%A;411B&Eci5EXT{P4lazaoCptzxT#b`rMs>7SZsKDb{5V*bLU-V)ofRispJ;k`?-`)ex zKmDa2Nf;VI53=6+Y}HTI+PoBumPhlfMijYymeZK`o>xBu2scyT3Oa!0r-|29q5A)i z&YWX2zj%=Rhy5f-u(Ne8Cq`TAKmE9`jV1DUX-Y7>@H;l(xLE4xz7)?;|-pBPk{agJp_Y=Dc$VslT9=Fw) zC+;1tWaN+47lt*ZDQig(_9KB+-0!k+7V|-LH688Ji zXL(xj)CTGrRu+)22H&cyD+yho-9Z+4>!;l?Wzgm`uV{-KzcA9RSO>9|Mb$0^}R!;y0-CD#Nk%x-82Oktuxg=&K7gU zRil2F^V(s=o1i=b7zY)Lip+R!eXUV3_Rc17mau`-VbS6yL2gLi%=s@z9If=|N>&5A{{$d7cg5$vrI(Az_iBv&72<5w&Z8saJYO9FncjPS%Odo0=WgB*eRm z{R_+0QSYGDSz;+-61)nMJ941wb+?^hx0^}Zj~loi_FL#95zl^2FGrygtMXyxt$#oj zVRzxiEj6`BW~hC;qesVvy5)g1i3Cj$!iHS7#ZgMDRc~iojyn-^a*||j=y_kD76Ny{UVY`gUq{K zU15gL3LD#EP>*}-yt86yK~?9ot>7s@!}=z1(is`w=}axDK8PH_y!l>QGGoFjBBvlq z%QqnJZv}6Y(Kds8Xo+Z*R|FdM+22%}0YPWUA^<*!4bPmqiQKIK2_<0MccXMXiSh{_ zY6dC~4}J!1&l%AZzny>_;J2!s9|&xc$PwT4x#F^o%B)u3NgOc5c<)T8ki=-z3cpjn zPkJMDahzEol7)wQ6r)jJi~B6c*QDoJ7GK<46gu%(lM@?CCTZz>5SD1;n~Rdk1n87& z3-f^Zgl`VjSaFgK6fej$z+ZV$UB}pmaY@5S=| zXgOp7l~a1?GsN115uQn$*xa@ zeL-a;J8V)VSm{E;w`}-9Pm$5=yCh8s?zJQhYQ=+YH1Lk@TUNdOYP8GxsC(Wnqhp1; zFvv3W-tNsOT_^V@xyBkfpOdNgtV9{@6Z9C`EdW67fYls*MduB{)-l3~6EH)d%LA}O z52?QZ9FH@zO}kxs!ax_@q>h~WdW?bOt1q#+h05=>#0tJR@Q`l*s*K*<&Z0Z+##Rt? 
z(QU1&wR<=6af|IzDOfGg-QU9?4GG+a-PQe=O!Y{Z){cnE{Mi1Aq0=Y!w$#Aa30#cV zMlSCi7OdYKIqxiH^;_>5i8&0aa9rR@35({Fs#;g zIqNd-RsMhgn3MY=31C>j{c(x0KOVzO6B&_chu}T6cKPf*ub{opL3L1@nsu7re3EY- z8yWdJ5bh`Ve)auaprcWuX697dAwYTIlSO3vYSNkJxMomEc7O^0O{^PgoXzC1d;B)~-UH<+WZ`+W4VN5~8P$CgvViFwo z1;l_0QB92NocRTvp$)(EQ46aE<+*=AdSLCN=(aT@|LFNL0A4#OF|Q~pnQL~$2ve>% zwD>_S@BtY>{lqIdq0UB`NNfwT9-r{qt*pwIaxty0N9cq(!R1~aA>wdiHm z%vpy0-J{cl_qr$gvNASX2RxUS8bpCZ{%Cu;&$5wZ;P2_;KCd;vTTz)~jbE9M_q~)5 zf=m)KM)&4>89$}o*Lt6UbliI@JICr@V#TazX|ulew1PFsfOn+nP?=n)Qhd|E++Hok zmvY)sFyBhniBeoa+}y^96{ITdzAk=`v`o);597{B|BCk;_WRQ<^d3md`|NvY!RWc2 zmw?5?Ux|KR=a+*Z4;o`(1j1N>yn9CHxjnUy^G~#qv_0n|qGavXJRM7QYf*-66P%63O~KiM4@cdUwB zFU0i>z}mHzARiAbg`=4r={`#njb6bsKj!_ntduuV$u=O`OT`Qiw%qY=GShDz2!sA0 z`YjpZE>&x$4(w6g@CvEX^{1@N%*EZ17JX7E+mE6$=h5#gufgXp51cr)bppIh zSD77sU7C!l!fvj|HGZCynf?9mtzxNl-3>A{)58VsgS@&RcwV3+@d{9stJ&ewg?^lZCQb8i zi=k|<_4NJ?wPyuoUfVxzEum&rdL7I#VS0k%{K5txM#mHzNK)o1rl^)L2VDR=Y5o53 z(a2y_=d*I37&+&kchDz4j`T-Q?JkBnFxorTw%bV}j{xZ=|4GRgS`8|tXD5$794a;^ z?Hlb}>B2rB%DWC)?^|r7Q>Zl5@X}9X4EOD_2TLQ9j&F#1S5hz z2H$h=P2_O(q*qEV#mP)dxd_3Fr+2;(K+gwA)~nSv5naxPVEbXC!#{ivyP4qPyD(BD zY~AO}((-;T&lKwKO2nB5JkW?%IMX0OM?dO$uY0nj;B6G0Wbff83& zY**WxsKqNf#@&1~1$(a{ER0>pLaU_Y!xAa(u9ectm4@v5Ab6 z?MZCRwap9@fo;BbQJ)8qccmWYgZARHZC3|>c8uYqxz{5+)QDWZ&c)!^pLK2{lMM|M zwIS;`gn4Wq>9*V$KE;aNbCv+cb>QE%Io2ukF_S%%%pVQT@jbQ3^pk#;Pja~B}l#(aIAuDrg)()u*YSqqWG|1LcQSMX+{-fSKHz*)V1KI8 zY-x=r?xVU`-_i0dgRBvAbm9q_{Z+ zm#4AMZD*r&=-knIBm;B{5?uAv74swYXUWjQSUh6XL~pzhCF~dxbpfBJOi?NZo$7HY z#5e5cM7)L!EPmx&Nt3hGR*%C-Nj_jOH0=HLECXGRUR)3Km1uvmI%N4p>TS~V#`Ts3 z;J4Z;4CW>Mzc@i{stOk8%%b(wweCV?qhkF%4+*_LYk;|)l)?fWe!s9e?(OyA4<#e; zfQw3n=Kh`+i!Kpx@2F!FujlQcgZWgPl-EUqI7D-lR_&~)^{~`Okie%YEeS6|ytj`f zCdU|C!rZ8~T_J{zMYTiUD7uG)6+dDtS!v~BRDv<50K9~Z6S`r0(s3Y?ihRiTi&{WY zZbOKQBBC+`8L@NCxMlL13{7`v#__z(jhA`IcyeD%L zV6`#zleNh*#M7zyI}M?^yE0Vwa!@0MCtsZ}^~|>#Qa0{tpZU9H5&ftGdx$RaStub z31+vSB-xoPb$rk2KDevy-}QZ*8{vgkYG2y2#l!e&t6ZvWs40ydgYwpahtsV-UFi=< z;;tHKt>ehHE-+biV4R(ef7j8kFh#YKS$O(ahj18Kc;aM2A3JVCWB?eKZMh7v)R zMx6QuWs{%qO zSY2Lz?)3lY@oT3~0%{x~xDZZx<4e6Ht`#id1|;UMD=D7n>> za{QWN;HAr>H_|#7m`T)r%eFwLgJf%bWCzwK;?*Wqo+fmTz`yh8#!fM%}XFA#XdvpZx?Q_V?;9Og;lZ4%sg{)&NI~ zHx##ro+C>4Ci(qQYWBy%@T+SXQTl55yMH{{u(dn8#1JcV@@3TXp=BT53x=@C`ZKox z`m`5*4`P04h1B~l3$72yE^Ce(N=bbpJu1?v^yR}B|T;4UiMq3a~DMf z2Kic!+eFV}yP`*(`0aDlM>sP%i82uhmkQ}goOp}{Kcc7?1A>Z;PA5Vjs4zB9Ip3Kr zI3`Nu0w^Xb2jaG3G+;&L@qfGL&c9l)ceT{HaKbSj^urj4&k)V{;z@iayT1I&&ED`2 znG1}Qb-;|VIRjOGUJi_Oj{mrOo0Sfq>5=6as&FfEND6sV8^=Px-#|YQ_&)2rd@>T$ zUX^k3OZ{a_{Tof$q8RSM7!7Rjmj3sDdY4YY3Kc*>7YK@U+o1kwpAExXY~(Z=+K2Gc zPrwez8ed*dcwXw%@G^BcbtXA7MDm)11Ggeye`u~_(zm%9`v(wlw|F&qG{D~3(Ic`w zE_|t;xhoIkvCDA{H_~&a{ob3`$9>dPpZX&9g`cCsS{LA>uo)Wj>DA`_^htaAx4!#F zRut}P&&}aG$>QoNMvT11$HpjRY4rrW5x>2qM?f*X;W>eFiyZadzIx&K|1i+CA;v_a|fPGW3C*b?V67{yIli+rZ zyb(BezSGC6s8~WhFnhaBaHFe+z{YM@li!Pd-0YpcD4H4=h0o@N9J(2L0yi~7)4e3+ zMUw=6(fPyUMjNRuL02>AO&C{t-6m%2N;(TSWg8oln+bvpZFu)GEivTN{_~H_-NaZ` z@Og%(=9B9gj0>}`Y$TlK!iR5rqIB=c(4zD0$L0j;eXImKx47Vul2{(Im$N+OInV(6 z8H*wHq@NNOtIa?hkuZ6Xi!AFx@+v=hQG-G2%u_YYjf-=RFnVz}39H5^2i$G7?2?8AnFkFhI+&%n(#-_Odcby!q2b zRD1lo0`>B8`iXcTC*Z0?OzP8go4JN#Px8XQFS9*@`(9$#3DV*tjMT7o^fz_qKV^iC zWEuhl&_onJwI>Cxy7KB`*wrXQaW0046!Km4=a!7;314WT;e|4PfD!p89LhOsk%Iox zyt>XtCCIJn#ABY)OaC|<{x!T~BrH~T+HPJ|_hl%SFUWGiqtAyu|SZC zVwll~9@?a!VAA^nW8OnSde?l$9yW8MQyF;8s>=mmhlj@h)bvG=a$Z@(UusCLJ$DQU z_Pdz`-tB;05~+fZThM3wXYkKl*%56dCA!HJAMOkv1`IY23J`~QgN zB>q=2)RIe1_dZnQZVYNR34Frb$SG;7-~9a+hlCUT1$`lgi|HBK?5IZgm%iB{f|T1i z-{ErpduSZ8_N$p{(=ETBdMwT(b*5XJU~y%R{G3IcQ{U|Yw;xYIL9FShMCbooVE^gc 
z>UAfcIdADZH)&E$7aC-DBz-RHdY|u0*wQt~pr2)ZIUF z`=5y#b0RHz;RYI^8Y@-PYa;h{XlHtC6%PLSVH};7it9s@boEqw8RApg|28U6;Qz*g z{qKJxk$BMSqZrv!pokskgcuEbW{0yaOlXEc3 zH(P4tju7DetGxFAG%>V)A-v)Y9R)x!mS667AYXlIw#V6vo(LT?o5WuwNRi|MX8tbIiG2V_|nH7v!a z>5B)MWeYKUBCwpu$fG6}$Qd~@c#gdE5jx4?f6mpa!r%d44Hi^uxu|De(ykX;X?pJT z^NgM|2a_bucqlv9Y*z+r;P@`J8_80!W^_?m+gjr)?#!O8^tZ=|JjSgQvzaC^&&KaI z>frTF+H|?MZ_S{vrtsn6_^(R~vy^z>WwgwJ)nHb!&ZNs?+aj3_zf8A@`lrLo92eO| zJ%tLJN_23@v~kEe(ElLyC~4CGVd~owmo$rS-2`t~4)U7V1JQG@LFNTKi3%L0m?d81U;)a`7dJ(?pPuYNZbjbnh&Y#lQV2UBIQs__1R zC~$$-#g2h!Y>XJA$XKb%nbh(BdUqUHfX{3pQEm)2-^GR?tghnKg)Sz$;16O4qzW0= zOkzOxBFHipRrS#vD8ypm+N z1q^Cp{4`mu#y*FhKIO^LrOY}q=t*kMI9>&VB^E@jZ7ivuP{-g0bH;-V!?ks`zp4p= z{08r}xG2WS`X-3a1eMhvC=pIqB_Yf5Lql5LY!!;^DE*fm3=ELFeVw^1l6YkduCb(P3}$PmWKRi7lvN1YI%N*#`RC#9!#o*OmGbxgo1JET3wurB`oZ=c zTT;U{?i!B8l>t-!c&FIr zsN%b95&wX^uX{h;iomn%h_s}x&;gK>1_3q5*R$7lZvSSY>=<8N9w;Vf0IO49w6ZI+r%1o=YdVcQdYtd^U}9&hBg1tIKi*2`QbTDg2UsU&3>qfJTLJyA>8w zDBy{{5RosGjhWDcJ<^Du#K=qECAqv%crL`*OAk&vM%J|Fsj3l{EiNc)|5&=6EEsHpA!x=9t1p618Tt zD5l$3rG)YiU*_uYw|!QRkwpDj>*F;hssP_%zh50DqpQ@>l?HZrQoi5<)DkjBlXWHWKKMz?eoGyKK?2kp)0Nv(+sGA*B3)a><>1sosWh@I9dWS+9Djj zPJJ9hk6qg}+6mw^VO3JYTSw8$$WlkH#L_>po zYQ?Dvr$_-Q?<#0Z|Kf)oaP8QZTBG>Mo65Y&#Y;F(GRAE z46f{Y*LT1X7c9IM_axdn>JX3u@WIY2p>g^$ZIEC3{UmjZYS%+=8$;|=6FxeIrJH0g zwGTz!X388HBlUT&bj1C zusV;Oz&A>LGh?H5j;ty0z&0g@!qmS#09m|Z2#QnWvdJ9D#5(( zOfnF|^3 zV+=vfmN4vGPzFYiEU`u)p-CA%tzxn=-{EIP)*g^0)$rBiIf&eFS7|jQV-_N$_A3^m zt!q!L3|jFN-@9yRN|$wJkj<~`aGRocFr%QbJ1~eCdtv1txcGFD^!C#1H}QR14CVct z%N65UcC@o;vPm2Zm$l<*7u9v%IuE^D$B_6WJh7y!M0O}rb7bZ7Y1Hk_>nlxCOgRv{ z28f}-6T+iD0Du9v{476akeyO_8m~d3{T6F^Uwx|I zq{q4A9C=nAe=#Zk8O58h9}qJX%zJk4yGseNfzKCXJTqtlx%vWR=)x6+?anWQEO-{L zE9KR|lZ_@XVOG%5#R`&scNhch9n?d+Yh$4Z3(~ozc5I)W<1!^L9d(~O!-?#-e z>LC`@QS+~)euMGabW1O_Qn>O{yk2r$<89q>luWu0`T4{iN?u{y*`a$+sUoIpkl80b zpckdibeG#Ah)<;r&sO;7g^-LDl_i(6CK)-sQyfY$1wv`i&t@gY#t?s($A6tpUU7dwC1zQ9*lLMoPm0Dk zx-EIviieFS0`A>ag=FZvs;;YR-eb}QktYIhtf{|ntQbmIS=|lftZ;{BR#`KQglpjCPmGqi4^`KL;eNhlKzT03p3U$Ao;XTi zMnZwBW;Z!01`i_7mhQjCm@=q2k_fBkAp&C^ADm9KQG{Hu$H4djqJ?Iunk~0+#Nv+j zk=XCHkdEOMZF{j6{~RI8UvW|=TM`V*6JMX970dLK@Bcy)pA<3yQq zn4QQ6)|^lGEK{cd8fT)+&XF({BY&5>FedKhJbZ`ZR7F8Xmu zcMHGKLf>;3y5UVc=g>2<8!NNjDL^<9mrlNwS+3mh*P|arvg%nx4)drd z@w}ToHk%Je=QxPB74}{WA`JxRUb-bZ1&7U33eNPAdB~3{W+z4fqVS%y>@yCU4ipGC zJHEXRc@d@?+svVAr#qR)gG2bt|XNa_oeqM^1SW&UQfwI^3mup)LrqsY{78r!25ZJ zqhG7FMj&M|+!2LfNHPp%;=` zhw^&HSQ;J;@N`Q7kntWR1*yCszH3}x_nA{Wqg#lc7yk9LHiTiegaistC5B3pa>&Eu>~ z=vF&zXj9zH+3FPjAcAg4&5 zg5jyq|)W~@g7fX09|+?^K? 
z@+g!kSy;?fHFx?hxJ>Dq|KW@ZQj8G)r0>iW52n@CjNt6JnG}q=n2fs*Q_^eVP_pU8 z!##J-j1@rhc{O2yOq?=@>krWryL9Ix1=$6*C-Q%Zr!({D3J&_dZJczz8^FH)27Yyz z+xhCZLsGpOaZ%<_iLF%Tau6_8CpVtI!(EnjTDsHkHw)yd?qgdCRr_)Oq~qbG6htT`d1(Q=kG$=92Lo@RMhDb%hntH!@-|rY#g;>E z^BOEVd&5w7u5)n@XWCrsd5Fwd6O#-z`M@rXPs#3uQt!ut#Z&_2 z?RkPa+|VMW^G}=IV~IpW$OSL*vEUPLOqE0wr;-HK)sc>Z2D-i{)!liHlCEx<70SQH z=S_T*HkJH%IXN)qFnJiaT0v$_lRVEzp8Pe&bkcIFutVN^s11G*H&++ZQQ{kaLO1Ue zMES4gSr*i<`*o{`gf3Anjr}ZjU2T7X?D-bvI?5BxS<=*~kJ@uu0;xt2HAg8G7OE<= zWb(r6f_6K)CU4nlL%mSyTqv}t=@qSbsQMI}V9e_DgAT%8#^+9{7c7vm2y9e#aYx~k z>%_VH3D%%Mpsv@`xpQGAC4&E5O^1QpBN(vIiBF9*E`+I88)=VOZ;w@fciP`4rxT~n zN!=HruZwjx+m{v%5jf1y`p0`j5gQ`hN_Q!a1M|!DUrasjUY1l|UahX}_MUiFw#pHr zXx)45ZZ!TI@zKf4FUQx^x2}N$=IzfQX@`Cgen%69?n_LMBPnwxs46JxTo_L;MZ(gLTnF86YFX%Y2fb^fvvS3T^uOI!gkl_ZIMT2SYz zsZ6nYN7$Zz{#QnzZ~SoI3k?0G_4Y@$i8`hwtJ~E%!T}U%kQ*wLW`h^Wr&Q+arPADG z@5trTDMD~m?JbE-sBvsnUKjVW5N(O=H~GkP$>YzQj0+^?5Dw6A)#OTZ`s}M{%(W}p z#tVT3=YY>OY3$2f)KQz#?Us_hd-|qNWA-mAP33-8?j62+gQ^{ODD+McXE;vyTZpm~ z)~m8#VXEOPUpobB-q7qrGk*8ng(lx`qP#{%{(rdlUSAZ`=Y4V_r7mYY>~0&#_)!au z`h9y}q0`(FyKx%`oGT!9^zQ0F?PKo&$+5P(@uwEf; z%v8^V{i1jHbTj97=Mm=kPJ(N{3fiHF%Z@bV%G5^2{&apZX{ZdrGN8t)TYEn@V?>)6 zGl2h#aJv1@e~JX=Il2da9j^X4et#bFp$M&w*F>mf^%!wB4VUX6T89k_Zk=c0ztDba zQC}^?M8HQbzXopE^m@Pk z7IwyvJdOm_+pjVTof())JI$vPq@PYRfK4Y5Fc?MVgLr9!l!Mo%WvU)xCm{%fhF$NV zXu}}pZ;g%haHxh#`&^~i+T6TF@RJ7QQCZfl>O>YU+*@k3u+En;-vH@cj@VBNP`s?0 zUylo?8uHsaE5lcRFt_+}(y-{{S7keo$U^*ROTSYEh1nsK_-!>ACr!kVFvVQeIzWJULtT9?ksL0fq0Z9;RJ1(CH5d%k6Yxn^?DGbXQG9IS_Zh- zr&Pt_ucQ!Nb)dWg)R5*##NAa0m58;ZOxYf^hGKPC@neBv#1iii@^XusZJSg42jlLc zRO!8ah(C-}hOmFGMlCq5F^|@j_^mrEQE0F8p=`!mZ+9yG=%bUZ-yLg<)#ui_$B&EN z;5_2byo~moEdsx49KJhs*-*(MlW!&%ee^AQE3x}+EsF6nMn-dbuY2`udZGPILKfy$ zMuPs^)#={b$m5!74g};c^ z&E|J-kyr)Na6qLXn4{vt`RgB$)F^BTACUgFH(}KPummG;IlnXhUn>i`FB6qM-iudQ zNOt**bYL|lh6+YKj3)!}Uumg3xwvO@vlxKZu|Xr)tQjoT*0>flewpFYI9p4QvASog zX`EI@?|ImCrMBY1mD^9>RM*t8&UMBmAd5f~X&}zwvI>iuslqQBtfU4nNWKI~+>}V+YWIz~%{GkJ`z`AnYHEo7a6c z^}Ie(OLfg5wz<07d+qwz4z2ZEKeCb_Jka6m27$Rep<* zWqA3Pm%g!AFyDTaUJ7dd>;z{)VW6-)uoddc%z-l@t??X0XE{!;={YD$r&xC!K^Im7 zU_=X-eloOj4V3hMI|Mngo`&*)B}+#XB#`!)y!;Y&B}49=L`s*%b{LeV)d35l@sE;a z>Cl!K?;U=Ow#Z<$Hq^BBU%%zFl<=qX=<+^SHWB+_%cp|r*i+2GXNSQpDR&M{jQr`| zL5m%4n%eV7VXCdx$+$gP_sE4!K?|U`gtL-#X6tx@-T+l|S3@ksPcwms2^+J#PObGB zkOFA#RN3OCn$-vEk@p}Upuf_vHlBUW#!~}7y^4PLFG%?-ZRZ1{xlby070Ey9_yI^g>)BOpK#c~G)0&*GY_+;eG0dz z^>{>I?+Lz{dWB*dOYl8)*Oqv58l%Rml=Ffq|jwd!k+@$Uib+O;SYK;`deX!y)9050pEmz1gMAQRGW< zWUk%JA7x9TKw@|jDJ@G7yHlZqg(Z47Y2szP<&B1SzD3H$js$3q$6r(S&dU!H4U{W; zSuh#S|6@w9e@qFKU!n4UwfB`#QFiUWL#LD?ASp8ff^>IDi*$!{cZam3G!h~mg1`VH zUD6EQ-JQ~%p4;cG=Z*8}{jYV-I_rFb85T45eeb=meeLV|#WPD4R0Ax3qWP^V;~f_0 zW#~qQBubAR`|uW+b6ha$Q^4z3$yP}sdK|Q@)g%UfIkzR`m!$ZSMgERgifDv6dvaex zV9n}k*ba`p8OzaCRW#81ud98s^X`5n$;in1S3|sK(9b_@N_E}?tL9BC#@(((b_nvx zvV;U+tZ<`<_I$Mm{1W+jfGZ>}Lu3u=s;&%YEU_g;C6xm2$lD6FPB+YT^ur;alx6OU zG;M96C$_87m#(u}<|kP=&#s1Ytj*3gpWMnZ_q^;CWT2f7mJI321#cr0nJn{Dy&rmQ>R&!F!y?dQd3N+J|~@6AUz zE4+%{I1w!N<1SL_P97g~r5Y_zT*jV7z8qArmt9CAE)XZ{E-k3QjtC*eVgQ~8UDWOq zBoRCP7t-6=3!fSe;hD9BKL%hlFxD@%o z2^w=*c4Eu{y2wO#bci`t==CL&k843iDXlk74C#|4j5*_sT^Wc2V5jgoavXjQW5)cF zj?Xz#w51zpJb8hGqti(>;b7WtiU47J5cBo zo5z%LjmGxVZXr)4IBF zk2Zm|vZ92z^Ucgv@03z-l}R^*-By4FjTquK;KpcZa_V`ANy47qDd{Xet=ZkVUK5^j zA|oPmPe;66;6L|jWy);W-8?!G?qQ_)u;jpv(D?qx-9>5pPcAWb^gzK6TXrLnH)~HO z#(v@lB`PlL+5CYwranF;wPUrOjZ0Cc#1e!FcJ z&-l^=B)zv;?YliBez*56l1c>JHO#^~MyOV~FPKuH(;>4S|M7v)if1hx*^y3y95E6B zD8u(pu0;57seTHkvQze0cm4&MEVXaDiFmEwWrj(7Am-y8u}@P|Qw!3Sj7J!AK%-3m z=N;PoD&%)yV2f;g$8&?zoUe8?uH5*@$3c!g;MwDu$1z)X#b2rLgGN7-IkpATeWEXH 
zLCVTPXD;=HTfP`vm?NUnATo8%N@QP1)>?=BSm;Idew1G9-r;^Sqg~=~WCnS9hEysz zAa84!EsyB=@<{~R;+k;G`(cCvt7lZ5`9VeIIm?lm-^fzuOQn=xqc~LMYPVX3<&wUK z0!eTX!k(TjNQy&FfEmlhmEI3O;`E;H!4~mAPOjqx#x-pE!f^MI7o$eT=;RiqMK}a5 z-$JH{3Q1jEjI+!OJn6YJF`znt3uSJN7F{LFCs}(O81+?xAMHw&tt{4CFhch~K^v`! z1>OAFRfbU|6N_{fXG$WM`LLxiy3h~LIKzO%)n+ee_xTs^_mA#V%6KTl?~3yh&<&8JY ztR^hpNk*baI8p9VQ&TPVB1g>cufA49!^&P1&tIGuegFRZyvZ4g%vXG!l$?7R&(y(|jK08In>oG9wdw55j=rfjWH4$Mz7?xiv zmHA4VamiygLw_v;m6TRE*p;smR{ELO48&oBCMZLux-?Voes?Plr1H#HDpA27>2pO% z)CPwY4a`e;k)9zXHAu{@y^1ls`-%d?UeQcJ33|(OU-SGq;Qa275dn@CW_S zpxK1K?sR|M7CMeqN1%b7adl}1=4l{Ls2;`Zgf~IPNFP9JFhVmq(w90e<&s;(eRKl2 zg$3=lw)UA79dQSHRe)XmCj>#K9}Mxr<$2yAYg9ak{*PSMlzzT4b%vBn_It(qCjeT7 zJ|B8rI6QRiq`jg$Fm<#54wj&NiW3>t$817Nq&B5cTbkCFCjP+NZ{es93O6l z*p2-)+Is-oyc9}>|89ZQjoGf5&aPuCri^e#1^h?K_~CcT`0@+^ew8z5(^=SpdvImM z?;LC8t?)!k0JN~l3t9(B)6P*&)&~MM3DTCSPR%L56QM1OEX-=Qc`ItU&!6DPK{k#V z`QEUbi(ep`IwuDE~afd9d$^ztd^ht`-9AF||L{ z=tg*DH;E=5tdLd^ootFByN~GNyw8r<=k~4FUCZ&6hS956YFbax+p4)iL1^Uco!tL4 z&ORa=?`5bvi?n+e@KA+CW=hk~<&dkn^_@P!je9QOy0;qKtJsh9`vlpVZwOt0+2;3o z^FBjxX3#CVGO5Mz%6C*F)mGbe)|}Shf))_x2sRaWNHq^D@NO)sv-3eB3py)(C{{4H zb`h-&hk&yUZ~%hP@zY$C^Y8hkdiY&CEMi+|NEu)21)pt_rVG6Yi#L`~Uyc^J+`Cd+ z`mTL#jr-{f-iC%ni%@Mb2v8p;!woI7{mH~++-OMQI5i>DeRC>VZeH^vRJ%qUbD1*5 zIIau#tAtzBpEntGa)Q?{sY6s+e=D4Ajdk1@i*NFpzW3=TaQ^9~@_{HdJ*qz$)Mc(K zL9(6giF}(dSp^;|dF}{Nlq})H%cU+IM#IXog6-U%-QK}Nx@lntxcO46dSZ)566CPO z!ne9U^BFCdrGwU_{7Fj!bo*m3M1|9Nv}QpO&qctH!dVuKvn=$VOxcaqT84uxXjIXQ z_kNfS5>F61tbf9+-4bNlnA#LZ!a2tWzG=21)OyC2^0cA}wL#^cs4(@!>G=?w_*SV4 zaw`C#VXo`<6-9u$hF^UpkpDJc+Le}GKVsHX(CTluA$%+Ksp@a! zrCX}+EBxJpg1QvEb~wJwSr==r1=J9)iOf^r@cAaXLTytiD=MXp9}ZB3>EE-6+hypY z7ygJ~p+ky2=4?zlZ7bx4)t$Md`ZxNMDQk?6cL+{Q7UzR zqeBXwTB|J|Uz;!_BXgbuM^Y9-U5`mIfiNGkmW; zrmg%wIBWd*phb5ToUcct^3=11=WK{GA;;>*wKT0GdTGGIi(Ks(qdk56=uCPFx&R!rGgVrkbY7**T@>hC$B>A9n0M^WM`Tz~-;C#9~s4(t>S& zX5HgRxN_Ab)x=#Vm6A*8I>rofpN(@T=V7kKwjKZHyz|&5Eg(>+XJ(Ns`}rd_pP}cjLcaE&ZamHA73K%7V?y0! z8)A~ZXgVa)b^=M4;ee10v+e^18tW9D&X+XDRnmGhYy6IPSEX(Iq;%2SSy33oy@h0y zSzrCkc9u@}aqo|sS40TUzpNP>jOBzf27^v`&o4@iM(rO#5KJwRSgs$ghQ^tRq2yl2 z2Jw&B?^SVQSPCDo*N{dlrxZqLM2tW2wxltP|qL z-f$l=C84FoqdJrH7U9(G4=HHx(IMvFRLr~$V|!x*y6kk)CmY{?wf`c*eRl2=Au_2O z32FRTuTxbH3-V=1T2UtL0Fyma8?@f2MK8_xiHwUlQBcxxvi|B%Y3Zk_a#-BQ9{b0v z>%Vt0<@=CsNkcrT^0fU~)=XWa-wOg;j&kk*!GJp^S@623e26*JRf|B4hW6~BDeRJP zo+;BHlALi>9gXUPqUc?b);{z=^+ks@6`6m_Gf+oDhnr3ID{%$(qrxZAKBxB#MuRVN zswHZjPF#%-JgK~K0+X=#<&5MTWk>{N{U2#%A;N49bLwjA%IggJ0RSYdH)HV%TJ-Hj z)TfzK!>I4hz0n0tASY&1WD4p4jsv(iFlo+jOxy}N@f~yaXdXEGfm04#*u%$Juj6U; zWXng7NfxhtkXc?5-QSKlF_Zo*xZyvaQ5oW#WFUL2C(H@b>wLj#*ng2-XEXQ<#6$k+ z98(yF;VmiYUr$0s2*oZ4-B)(^;Hb*JgpEE@^6c7fK`Fu3T>XSwS&PbI=0zUcK`Dv- zp)Ebgg?cirYsbsy;xlOntvEAsD2HDpX2UIif%3Kr9l0vA#w~6uc3+lf%;P z1nWK=u#>m6bZFfQhEN&5%1ZO)=>BPw^E+Zk@)0Vc--%@582|k!v>Z2wMm5p*?(wt? 
zI`?Zh69idkh6ESCB zEocMOq0g!w$MOM$@!TS3P0e|s z>ntgIhn6IP`S@~fd_g;f-*lDWa*3okR5Vz zqI+3{WutZYW#--=!NJBPrpRl@S=h{21CK2qqz#*N*UdA$r{vWL+S<5q$s1+cW5qyH zJL=p2&##Tf3ygQ|H|^o$*9viQ_oGheHyYWhk9Fqe3+2KlMS#kv@Ss2Mg1N@`FADLb z=BN6iBc3hYJwl&6>{c<(4$IRHNNXG%1AV;Qq2psejU}U~M9MDxl?DZ^Mssq4MV{dJ zJJ2D07$dJFSC*3-O5*jv@p2r#7xZDsnmo*o=veYIvpfAaQ#z0-OCnF#BM{vbv%Fk-^H0S-Pf*P>{E;c8=}45&ukr)epDQPI_Rxoj!1&VK(W9BfTYdej`cw1 zUd#Lf8Tz&5-@YjAjOmpxPGcGg$f%__Lm0e@7*p`(YUEef=K^5Jlr_zSZnGrt(7VqF zrvgdDf}UuIm5=4iqdMYRfA7-K^!q?f?mt9oC6B;W+Hq(K_X4J-yS{T!^!dz-;CT71 zJUYmrI||9b-%{JZ6zf2hrSWto&jYrls2gucHjWO8sb%vz9@3{^6srx}27wE*tJBUw^ty zOY=e4(R=gMEve&+M?9NXBIw!YH5og4{=TCSItB4kV<&e{04xt+ov<&FBs%QtgV&n~n#PR{r9%LCn z_wuBx-ee;MG*V7>4slfZ)26CmO-dsXl@(cJZb8q&K7~Z4jFfSd7Jl&(Jpaep4{*%f zH?wG7d0u0?hHeb(_m48HVRBf+DFP0^b6kJQ4f#Lq_USFP=wNW1oD%$rVpcREdTy3d zz%^!k?B89S|HLDr{5M0g1gdqM@jn~{|L}=Xb*jiS&6WSjM@MCT%sHrSt>MmBSe``q z;n7@(J6S+RBPM9)ALcx~W2pYIu6 zUNIhjw$O1w-kDZ|Uz8?(mgUschQEkb23fR{d!5t9i2yLM6^hY^k&7S_ zNZLN_#?WZ>4jX&)*>87K)yAffE=n#VPaZY0Bq4vVVU&9Ms@(Y=bniU;3bJ4{9`XR8 zD&xOA2@Ph?Qhl8zDHGcc?-@zWgWiP5NSD$-`{3kt5AqKq{%0QBlYdr!(z=P7zi=SQ z_V0f}u*3pFxOosOwn&YfO|KYYD~jC^dG`hh(Bd`w4@@@9ORjRg4#PT;U2n>r9Zh)q zGbNk-_g;2uch=cC{`A_@vzn!IznyyzeDhA#cS*LQXNfXYf<+T_Z+H1Rs)c9OVO`)| zFjdkl|B=)NAmYzSnE6$&$Hy$0pde`=EvCS6U*f)Vqr}YGBS}r753z%rA_(nz7&Vx% zcgA{ZHi&I@mceFy<9KS9{#M`Giv5C6s7*Zb)ecNk={~mNe{QZv ze>Xmi62Xu$;u>Bh^Ob-)CyN+~cS}$KXdu)-1JRY--hS-U?4LY-RrE0lrF`BCn{rFD zTKoJWtG~KBVh;7~J96I~z`2M4zl<~MFIOs={%wA#WY-X11_Q~Ob=P?+nokjRfv`va zD&nBc+1}A6+kXf*R;qBd-h)OiOxfdF10i(nx2uw~S4|k|7bnKtNNJHi=ScG!aS`$t zgr5ua7WKO-1~5BGBg+Pogwl))*0fzfN{T}QZo`cyY4q5sTT4olFJKR&Rne@JN+ihD zBTIq8%(9@J!jgNo1GznpxU!>Wz5dmEu*?}U@>(A~vY(Icxu1&2Q@T=biHXe#niT1X2!8_{1>Rl1n~CASautns zz1pVvbi_im)^CdhuSg0XZ8jnMx^|X0F)PDh^Jr^nKws`hZ)rZU34S00#gfPvV*wF< z_NxMV;&r44A4``gja!vLhXz7o6!Ie}zplzsJHyK0f?9*ZrU8(GgtMa6O>~aTnC-YE z&*=+V%Gig``I)K{F3XgPHBvDRZEw+*Es*kZ09A6GB{A9>+g=Z$tea<5f3p* z%w|c(QJusJ1p$!4xgu&lAxh^P88sYn^eKf z;Uje4*cbNTclfgP&T;A(%14KU%KT_YiEB0Z_*Fsf^RPw!S~~$ zNy4IP?{oq&;z_&nYfkxq;>IcRxJJDTSV>;rJAgncGHCxWb4PO#+$7h>GfTYGcghAa z^!E^C@3~p^=htwF!j2YEBY~WO)MfWrdq^^fY2pFx$RuE2uSE`&T9FZ8Td>eq)op02 zz2reVFOjG0z}M-jCfYSVc~JZXdfU&hh^Arb{^$&G75gV7ATBzTum=z~?Vbhz<$%Pp zbulcE)PrlT++ybQcyx+!hG21MZIhmG$74OVgW0#B1KDN88l>_c69!}RjBUaLm(rj> zpSl69h<%gruPz=+lFn3E@U6Q&2B~OKIx57#e(%a@YJx zc&q|vVtl&wQ6s7CG>1+9uGAaWA&PMXr`jcg-303U5QvHT?Vu<^Q}Ps`;Iby5H#)cb z3p5&#{v!Wmfjsb~am%2ISnlapJ*%Y2?;rhAwZ7yzM*ddGEtR89M3S4bO1Q?{5O{?= z70%cB8_hOBH^Gb=#^Ai79oKZjPz4Q>Pzaxp3Fn*2DL%ls zC072pK6tC(t)w@_d4;{ZVTk~{*sW>6M_Xa=7>U6Yjh#@uzvtzwuaD^|ipi0yDxkBGOBgyXVZki_2Mi4tI>V%+79 z%BQFt_CTc%4N}J!THb^q@F*p~ZM<&PU0Ws&Y}ae7Jsfs54gratV9_kgntF>wCfWV> zcu%%6&Je;q-$SOBYfm;15Tk;n?e5Xm^fWGVQXrpgw|b2?=tOO9{IpvD)&01yOTX=` zz@yOt72pW`MHSn+N^Gk$3!3<{Td3K(3as15Qnja1&*07XeDO*PuY4V6wO;;zgMdZIEW_jM6Y~LNbHzEKj;lK8 zWJM+MbUaL1wnJHv2>3DPtNMXTJGK%$%4}mjy^Ti7nbElq2*~D`Xh=vAdT8URqX$^i zD4h*<>UXVYoR1cB?alfgA9(5jL+wo2R{kCW|CzvSaDCUQKC<ut zr-};B5Xr(Yjg=sk7PZagQ34DUYa8T^w5mSSX1qjKrGw=pF?8<=<5<_0GDrbq5OMl2 z>$hzbTj@kR2XSUyMiJsewMU*^;3ai%_evp`>fElLAZKnR1#GW z_53F0m`sfld(rcPLOe4SDdq}e-7grVT{&gTY>KY~De=B*MA-E#*0_n}bHbpNqVo%v z6uFrqDOkv_t~ZUuM``~aG#%&xJ}ymG0*0@-dQ6JGU@&3332hZ{g3#>%z_WGXthI;B z0f$;aI#Fxvt@Z*i=HsGB%CFMG`Nmf{W}eWW;j@wM+*>iD4d^3ai6~S;Nf5bz7N#3t z;@6yDXyyyAZ_6=lWm70F%`+L`DCm`zY&_Z<-d4)VhFMUttEa6gas+3sYRu zjDQvrp|r;XxMV0=Y29Nft~CziVN1>9h4{NnGZ++$psT_ zyw2W-aG_w1apLn^2i8+E$2~Qw$$i>}r=liWGGOf5E>zFW?4zZ-K;xU5Q_hIy6Yk8i zhq2tk=!i@0v7f|n_5`X26cXW;n{jrJz%~eWL!MD~MAqQ2q$e+YPgOu0Z8}HrZfM)u zSw4iWOnSqhbqihTp3zeAr*%8r4limA`eT$O7R~rjxg4=bElE3SAh2 
z7D&0V8nGYm#LC<8=UM2B5dy{;Er_+5Po_IDqHpl9D(>g)?3kgsM|{5R8+ybEN<2Ry)T$?@#no?d!Kk3AAl!2N@cI&{ZxVozZ} zlSH2FH&H#mC1`E0G;CrmWm2CZ=3lNw>Hv6ao_|HXFL8*Uf+0R?QMb9p5ti*cW}%!> zcsK#5XOrKNwZw3dC7XM{zn2HFq6NP|o_^`r_~xR(oO?qlem7t}DH#O(Aw?+kI(6#n zB{E?$6V$E-bri*OgVNI&w0{#X0+ro_?%Q(kf3~w@Vlx+}H{M*o!A`hVMiQISsIIHC z=F$i;eZWA)xG_leIe;1)xXsIicDV&4=vK||bB94qPZnSOYiG*qk>FRm)LZ)tgzi^Td&7KQ+V*#UNe6xM@7ysgkL7TNn|J5zoHHsJ+q1Ww z{-(onkD~fr6N37Hr#c^OoN3r;tiO%>qT)kC`ZRWMT+K_kjrxyIm^;&)z3c?pHi+~&r;Vi$zzGux6IR=tS?3i_MHr_LW-bA;DTgwOYXfhsf@-JscfMi%gc zo<-0Bbli^*k0My0#~y3wd0W;iKg?4-Nh0gl;{}wB^>JUen}S_aJDP2|ZcH1@_aW@~ zv#f)Dlsl>d>L1CgzEib*qrx>nLE>ZlJLS_t-4}>7RDJe0o6WtIbVj`I@E2oXeHP$Y zveI{!)1GLDsmRGT_A9k^G~Q~8-G-l z#|+f@WSx=A++YqR=tiSxF)f%Zb734PwLpF%vvyMI*KYsfN3LT*LY5cC&iB1e0j@QE15zDH(u`E!)G>R4ZTUlRj3{}JSipO*={csnrK_HB${RT*kJz3|NeYArts{Z z4ymfD{Fd>2pv^!^Xe?lyWzbJx>cuKUI5Q{5*zjGU*xLQq!<#A5Y-=>-$W9^TZZ^k} z(6Xy3X7>}1WaeKWK>)+8G>$7)6Uc9&j2=P=Ld>Lw6pnOrGa#Z1wib+tPGhW;;-CZb*kw6vx%FN2m!6{I4 zbYWb5>}gMZ&W|chQCoE5TWtL;`IhT&VU0h84`9VF13smIOGYC6N*p?L2jnScSva$` z_W;Ge%q)=FOuH@cad3y0f+rG=FB+$g|l8Oq8%;+;c|< zI?6xTUN3j+(G`-kNc`+s|H1Ekeh8FI4#-6DB3e2F&hJ%yX2!1bk=EA!}dT8=X?%%?1B}bs{*Pc{c zBs=?^Ak~Mt$hIj~odF2Lo#F4(cMy;>&MxWb>)d}Ox<+-*V2!RjE9qu}LK&0aRx`Ro ztb;Zp1E#)h3-b*eh0`zu3$iu(x*(MQdsdNKeT)*duw3zhmob(&8i)6MXh)Oi;;3)c zBzWA`sUftCa8g%iR)pdgsP`Qk1j5C;th9+_=PKvz={p#4DmR{I?1;2fUdYJV$Nz1h z7%fEqiwQC0#>bjS#OL)~pCh(Xw`pvWn+<^<$YR_Ds-svhmlfBs4O$6bCkq<&=jXGo z?qCjdU=QdKmQ@=ZX(;p31})o_P1^cm^g9JdB9xa6%pWyJxhU^dA7#gDG7(C-MF!kV zGA`XkDfwtzYsi%Nbu+Eqw*yosKDbig!}NkACUrc{9ZvccP4U0SxS~a3rTb5U3l*9* zf81mEU+xj8i|Kd-b{!mPwUH%tV@sit-E5r|Kl=CdX=)lOaO^~_L>-9=2iP52yKLJU z3EGR2vit|#d*>8c!)SVz-&#FJB_no^L3&yalHNWZrAEI`Hnr}Z)NmX}VwJ}Dg81gG zh@Fyu%d+ameu2OafRFjfU~nzDX;n*IOh5Y0Aj@9mgJcO%)G}KDiV0&_2G9N|CS=0P z3lp?zZWX`RMoUBN8#WPwGQ}{8VoYrdtQ3mp=TS5UpG05XoSt<8=`r^ZgOCcBiDNHT_KGz#pC;FWxg@VfQ1ktd zem)K16`k)!@ojIPfoqYryPq-u{FsWoQJg) zl9&m#Rk$2q42W$h2^3}024Aze@Ackbtx7-?;&sjrir-*`O`1{G=T&gNlZwG-Idwgx5K)2~0R(AkFC zdWM)p!3*I$S*U=0r$m7#vyhHe^)PE6nR&niPqBJxQ~pBnBBHVMAgLemU4`ytFk@Ey z%Z)fBE-~jJWGCTg3k@UtTQF+|8xfMA_qky3xFV8p^_qqDCWB{QuwNuy>m?}G@k%{7 zeZT3B+S_ctAZQmiyQJ3cKQA>QwU+%vKrU!xahGcn;r*%{Z(B}e1*M_tJN>FaCBmOh zbMb6gWt3MtoF848h6p9c?nkOLg!bUO)>56U;GH(f+wL{X8^9aIZ1Q37th_3gSJ#|m zjZ@uLf00+4m_RO6p^;i|&bJ_6nV>2&6iEELpM&stw*u~Ve}uWxRc=2FA-{np&Fcm_ zc>^D!-mY0OPS#1W9_N)CNkM<#+5a?-9 zb2~-UmdVwV&i1*%^2}=sjYzK}=a@Fw`&X@{b~B!6$i8L`G-{@F?NpKYy5_;EF9v3# zgDmdl$;GqBt)71LHh)p!hqpF;A)Ib@<$GWD>h4>#>4$xrb1)2gv+9ES61vdV?7p?O zJ^y<$R{g2j_MaMX|LuE}zee{h0Ic)hO0@-rh&D{1^=sQZKrn^4p(e1)kiE#p2Tlt^ zx??D(qIhH?m9^tIuvG$f-~}c4`=$eC26YV)e)zO=U+&4#XGsqtOGT+}GU|$j=$a~1 z7x_I*G#qZjRMN2VXX_aTXxi>OYFIb?yF{rTc(N;_cxLZodJqM+kf_$(hjfQgL<^;G zqG@Ve-69|mbDnM9d&|puy^u2WGy`@fb%}=`Rc4IzZKP&Z(Nti+Mm8tk0;tq|lwo;S z(QecP;q=Qp_6TAyZCl8@iUCrx9HA!{L+^bP#UL{8m0mf4dglVrYUlJKW_9wLXN>jY z)`*og&WSliT^_w8lZN57r-i=%y?QrAGuyS3#*(U&&nZ_Kpp8xVmis<-;{E_u2tSpH zqnNZjXM>~-8@s_T=ACclbfpg;@gm*Ya|p5NdWJ))MP~? zo$0N>2&@;p;={f~nQ!f9tZBnqRG;fA;fCply;}JkDE4-+(7o38S46Jr@@U|u3wa(i&&->MvZUfxAjlP6~v$ zp(X1TWAIdT&$lPIHb=s;H53WPWW}sv2hA?sov^bW&U_IRhnr(0-Kw7L$HJF0rStxx zXiX+ULHW|1>tPiXc@01opRI^8AoAGI0gbi~s4S!&buKhMWyeTYF+SiG45M7p(`Ave zr%WK`mok#`W3%QGM3am56Eq|Z=tfRMI@=M!kt?)2*ep+BV4846c2(wGzR_SV+WD5h zq%%+eQvn}+?m;##cuzK^CzNdJ$Xc5y>6}A??lI#^t6>{~yGx2OR}SpkqTSGrVUAA* zI~eNZ=9$c353P(ic3qKPEf~OarZq$S!(C_qI2&{{P;jq;|N?tAgA}$50Z6d*m`rV z#w?#FxP;acrUM=bCv)K(M$+A;fHoObL|=^P=vwBGrsT2{0xjsCPZ5p3l#i>*oJs&M zG=pc}y1a=>t6u>G_M^|G3Ce%7DP!~s{*7rJSWOi)pUq!kSyNX?Ha%(ZZ9Kgi8OFxN zF#b6fa{)gofdV$gxGJfOd(>P!v}2Sm_bk@1s!@(xUV%M}UhWF+U5rklQ~B@B@Y!5! 
zjqG*Genk}TDCf5!%&af8H1Eg7;}r_3^eS#;s?{6dDf6-XChNE@j%v0pK72iQ)6}H5 z1Tx^AjKpui44yUlh8uS9I*;jO**x0a{6@oBOGBF}-IF_0z|(m=P|~+H3+rg;QCo6w zP~i(FJ`+E_W!i|nw-e^D3d-X%uCkG#ckOZdL4RUf6lRQKFe&dRvq_?>oC9WM`eRSY znV$MPTL)xZ2`X$2=np@&{*a(j2UJ@eM2H`DkKt=N6|WOIhI`pcG2?z(J`bk~Hw-gW z1fi8@;V5*v#j|yKQ%J{b*tqp9Pe~Q%!~HCh(M2JKp}Fpaj(?E11n{gGZXdtJ9(cy# z{ct2{igtwR&$6Yk{|=uv;Z(|o$Qvi~<-Q7J=$Q7&ukH_{0MC*E7gX}(&; z=5NX^2m0Dd_M>%TSy)(b5d(b7 zm=4)A82F$%&`Y1qNgG;wOeAr2iMa3Y3{eJyoMuTV=Pq^6PleA|H5!5T8U?H0)VpnK z84hxK8eB37O_%5M_<*vW>J*=3rpJS6zv)mz4W($BB8n+cS@-aps(I*96`<+6Ai=|r zd(Lw~rG8%(Rh3G-KOQXjksY@?A>!*SMT*I(GCQePF5!b@3W+b+*=^X;4N#LUxeX%^ zVIA61yo&CC7wDwa>G*xRM@JG)DNUMe8|@P|7s;=$QT`3I_^dWw3=&-q8jU69vuiSU zwrjSGGfI>sW^EB#&eYBzf4#IAR`S;1BaIAeJ4Jw`pTHPtd%ZZTGY*@xe?QtORC7No zg0i9``!R9b@zf4bbBt*)1cIHa?xoU=hCKlDxRJ5lB#_}9yaXbRbdEF_AD)J*todM) zAVR{mI;IQYvEwHNe*vhqhkh1_%|t2 z+PxrkTN_)OAfI%uBwjOK$i2*_KjRelsx>c@&1)ET%G*`pnM2_8AptV7;Qr=b&{cQ8 zE&|56a}f?J@p1L5Jm>e}!%g`r7k5Xs#(RUHGmvY!slxX>YDk5Y@Zq&uu?~aI3kI72 z=kaseT?HC{HUYURmn8j6HlSd)fFES3zTtW8cul5qQ27N5e##VEcj?x019EsdS#O${ zU{@uUF*hC4od&34s{Jsic4j9-AoZiKQY}5ZmHX-QO)g;08Q~fc1>~fUZ$MM4d!qJM z=yT~VEJ0+{*xY_18%{PbI&{ z%^E}jNn`=-S{huS6+RqgqwZ7v+1#uZnwUUdQ%4!c1H~l9c2Xnp?*XN0JX)D6_GG55 zYmW8oM^FdcTL#&$dS4oqR)@hOX&s2P>k=;ImPzU(m5L11Jgo?GXIn9C#b1N+C0p0Rig7;h z17+E4rEwg4T>{ISaczIyVq5#tMfAI3sGe*m^vC3jcg!TT0{5&_LK8--vskr}-00-E z##-gu&r}I8l*zuQMaGNVVPxUBw{xrSxI1q0h%<2=@C?z$lyNRhVEJ%$jnuRggwXQG z&}IS(nuyRnl#0(O%gFW{x_%$hpX&H`Ac`$h+*?B%#`c+E7x_$Rw*6@9=!8-a_zRQ| z9)4>>4;%=kC!Nod;{pqJWm-x!T@`AqifzrVLm-S`gmwylqJ->y zwSr{E_7g|Zdoe(dLTru?(_-5E**_fnwY^bs7>cVn9(24`TGgSf(!Aal@({AW!u2~Y zes>hJLt5iW%RA`)!})r$sN*Bv2kb)PR><)IFcW#-Va-X8RNbw^;K>x;bbB#2L|4L^knUXr(k}EizsGOH{1DOKi~|s ztO9lEW1v~E3Kpz^0W&NMNA^l6p$xjW)f-o1J{`%up;wnd7`l%FPmY?a=n`~(eQ~5G zX;ce=prTmkQ#9hTF71f#O+55rd5`CDi^^hGpUt*8d{v^J>)6z z!qgUs_hD5IZE`~_?I3Dt)mmxGi8qc(cC?@8ooR3MndGH zV3G9cbme}=mF~)tVe!2Mp_PivWm&BUYJ2efmZR{QfU&*d*;y9$)%gOGw7@BgKAYN) zqYJoSTfx|YSz~xXZO^exI>9gu%VicL{{`wwcg(e%QvO^DW`frN z8wH@aXFE`7;^*K;pHV!g-D*yhV2Ak?xHtG1WFq3X%{piGcqzwRO_vEHDeKzimi?86 zIDcDggDxJpb)n%aeO~?BOo4bg4$)a%JK8cNGB003wMOBu<(RF#WfDx&lu(fKkEa2t z4?3TS!&OUtdWb0#wU4>JFz}2+171i{l1uV&Urp&@v+8tF2Z&K^qY*NKOn1D}_!8^u z%TGbqQ?1pc zPVS-ikrtEy*?4#>q^|B%S!V~tYY2D1zujPd>D--TTp~kk7uNL)b}jG29tjNydcLi? 
zu42cS)ezR|5HNhf!6NB*QJV2;@*U^fjTWW8$2-w=rOs{6;!hIb;G&A3vly*{UK(=N zIXl%(p2HHS(I4U+&j#e}^QX0A=EZ=J>|N|@o(h6bnCk3HE=enM9V{^qW3Qb&B#!=k zC#oep{Y=r1y_dft?eP89!wu9_F@n$w0no4Ayi+i%(DccceD*lA<;ZgJ*$*PFcV^C~ zt?&tNzhLWyor~n@TjgY=>P@5*WK1Yd_*b3o3UPc5m00nbt6_imu`}~bw(f~8&elD7wMrM?(G>zVK2keI#%G>g}O)@vJoE+Daqd*|8QiZ9n_zQ@DWZpfz(_LOa1=Jl{7n5d@w= zC!W&iDakRL^;3f|dvXj&+}V(el8 zxi!~=E5EC8N?bCwau2CK#ek%3d2d}$<|#q&!`22zcI7Dh<1Jlq3b3+vEx*dVh&t@f zug_eu3k3|8_tas9(E~}I>nr#;o|oXPVj?fe%=bX*rsa2tD0=5OnQopxJxwq>&{c3sy>MRAq9G*1x#<+hqs(B8onPxrACiiG<`o+U=20wk zNmxq|h!RH&EPKVnPa&yU|C|^B`Es(=jb9kWAR05psLoDgW%WVCpoAK+NfhBF)Pbv_VZzhWjys+x;D6 z(@rOA^|Y8cnvD0K8dF-JAuZH`Pd6L9tu~5fCJ#N&%b3Ipjh0Dv46k8^Wua=$ ze&mkyAG&+&!BeIzEtC8@hg(uA>&HzEDy)v%p)l3 z(O^QLTREWb{7gh|eC5{e^D76uxnzc#AG|=f(Y<|kpb_srSfk}7@%jaFMgsTiIoqz1lfO^uu=6gL?X1fba z2Lf<()2;?!{rcC#9_qhX} z*e#s1s%YWI(9c`p)J;Qww4-mk@KNZ_B}o28uc~zFq%5_9>8Y2 zR?q)eADjP~*J!-1D-FDx+;|Ob>suQ;nMym?V+rje8B74W)vvxEFgD8U`i-ULS&M!G zC<>{0fE-X{{_&_0kll{N4j~pu(oXQYJHWT}UyN=3=U?Ogi#zv!HI4pMp#T4$^N(i5 c{|(RLUp>bD|4{wCm-K(b@cd6*!>^hD1;i7WzW@LL literal 0 HcmV?d00001 diff --git a/baselines/models/albert/resources/create_pretraining_data_roberta.py b/baselines/models/albert/resources/create_pretraining_data_roberta.py new file mode 100644 index 0000000..601c328 --- /dev/null +++ b/baselines/models/albert/resources/create_pretraining_data_roberta.py @@ -0,0 +1,630 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import re +import tokenization +import tensorflow as tf +import jieba + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + # print("length of segment_ids:",len(segment_ids),"max_seq_length:", max_seq_length) + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = 
tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + print("create_training_instances.started...") + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline().replace("",""))# .replace("”","")) # 将、”替换掉。 + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + print("create_training_instances.ended...") + + return instances + + +def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + +def get_new_segment(segment): # 新增的方法 #### + """ + 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 + :param segment: 一句话 + :return: 一句处理过的话 + """ + seq_cws = jieba.lcut("".join(segment)) + seq_cws_dict = {x: 1 for x in seq_cws} + new_segment = [] + i = 0 + while i < len(segment): + if len(re.findall('[\u4E00-\u9FA5]', segment[i]))==0: # 不是中文的,原文加进去。 + new_segment.append(segment[i]) + i += 1 + continue + + has_add = False + for length in range(3,0,-1): + if i+length>len(segment): + continue + if ''.join(segment[i:i+length]) in seq_cws_dict: + new_segment.append(segment[i]) + for l in range(1, length): + new_segment.append('##' + segment[i+l]) + i += length + has_add = True + break + if not has_add: + new_segment.append(segment[i]) + i += 1 + return new_segment + +def get_raw_instance(document,max_sequence_length): # 新增的方法 TODO need check again to ensure full use of data + """ + 获取初步的训练实例,将整段按照max_sequence_length切分成多个部分,并以多个处理好的实例的形式返回。 + :param document: 一整段 + :param max_sequence_length: + :return: a list. each element is a sequence of text + """ + max_sequence_length_allowed=max_sequence_length-2 + document = [seq for seq in document if len(seq)max_sequence_length_allowed/2: # /2 + result_list.append(curr_seq) + + # # 计算总共可以得到多少份 + # num_instance=int(len(big_list)/max_sequence_length_allowed)+1 + # print("num_instance:",num_instance) + # # 切分成多份,添加到列表中 + # result_list=[] + # for j in range(num_instance): + # index=j*max_sequence_length_allowed + # end_index=index+max_sequence_length_allowed if j!=num_instance-1 else -1 + # result_list.append(big_list[index:end_index]) + return result_list + +def create_instances_from_document( # 新增的方法 + # 目标按照RoBERTa的思路,使用DOC-SENTENCES,并会去掉NSP任务: 从一个文档中连续的获得文本,直到达到最大长度。如果是从下一个文档中获得,那么加上一个分隔符 + # document即一整段话,包含多个句子。每个句子叫做segment. + # 给定一个document即一整段话,生成一些instance. + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + + #target_seq_length = max_num_tokens + #if rng.random() < short_seq_prob: + # target_seq_length = rng.randint(2, max_num_tokens) + + instances = [] + raw_text_list_list=get_raw_instance(document, max_seq_length) # document即一整段话,包含多个句子。每个句子叫做segment. 
+ for j, raw_text_list in enumerate(raw_text_list_list): + #################################################################################################################### + raw_text_list = get_new_segment(raw_text_list) # 结合分词的中文的whole mask设置即在需要的地方加上“##” + # 1、设置token, segment_ids + is_random_next=True # this will not be used, so it's value doesn't matter + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in raw_text_list: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + ################################################################################################################ + # 2、调用原有的方法 + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + + return instances + + + +def create_instances_from_document_original( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + print("document_index:",document_index,"document:",type(document)," ;document:",document) # document即一整段话,包含多个句子。每个句子叫做segment. + while i < len(document): + segment = document[i] # 取到一个部分(可能是一段话) + print("i:",i," ;segment:",segment) + #################################################################################################################### + segment = get_new_segment(segment) # 结合分词的中文的whole mask设置即在需要的地方加上“##” + ################################################################################################################### + current_chunk.append(segment) + current_length += len(segment) + print("#####condition:",i == len(document) - 1 or current_length >= target_seq_length) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. 
+ a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. When a word has been split into + # WordPieces, the first token does not have any marker and any subsequence + # tokens are prefixed with ##. So whenever we see the ## token, we + # append it to the previous set of word indexes. + # + # Note that Whole Word Masking does *not* change the training code + # at all -- we still predict each WordPiece independently, softmaxed + # over the entire vocabulary. 
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  # Strip the "##" prefix from Chinese wordpieces in the output sequence.
+  output_tokens = [
+      t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t
+      for t in tokens
+  ]
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
+    if len(masked_lms) + len(index_set) > num_to_predict:
+      continue
+    is_any_index_covered = False
+    for index in index_set:
+      if index in covered_indexes:
+        is_any_index_covered = True
+        break
+    if is_any_index_covered:
+      continue
+    for index in index_set:
+      covered_indexes.add(index)
+
+      masked_token = None
+      # 80% of the time, replace with [MASK]
+      if rng.random() < 0.8:
+        masked_token = "[MASK]"
+      else:
+        # 10% of the time, keep original (with the "##" prefix stripped)
+        if rng.random() < 0.5:
+          masked_token = (tokens[index][2:] if len(
+              re.findall('##[\u4E00-\u9FA5]', tokens[index])) > 0
+                          else tokens[index])
+        # 10% of the time, replace with random word
+        else:
+          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+
+      output_tokens[index] = masked_token
+
+      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+  assert len(masked_lms) <= num_to_predict
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+
+  masked_lm_positions = []
+  masked_lm_labels = []
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+
+  # tf.logging.info('%s' % (tokens))
+  # tf.logging.info('%s' % (output_tokens))
+  return (output_tokens, masked_lm_positions, masked_lm_labels)
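+
+# Usage illustration (added; hypothetical values, mirroring the real
+# signature above -- in this file the inputs come from the instance
+# builders and FLAGS, not from hand-written literals):
+#
+#   rng = random.Random(12345)
+#   toy_tokens = ["[CLS]", "使", "##用", "语", "##言", "[SEP]"]
+#   out, positions, labels = create_masked_lm_predictions(
+#       toy_tokens, masked_lm_prob=0.15, max_predictions_per_seq=20,
+#       vocab_words=["的", "一", "是"], rng=rng)
+#
+# With --do_whole_word_mask=True both pieces of a word ("语", "##言") are
+# masked together, and `out` always has the "##" stripped from Chinese
+# wordpieces, whether or not they were selected for masking.
+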
+
+def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
+  """Truncates a pair of sequences to a maximum sequence length."""
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_num_tokens:
+      break
+
+    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+    assert len(trunc_tokens) >= 1
+
+    # We want to sometimes truncate from the front and sometimes from the
+    # back to add more randomness and avoid biases.
+    if rng.random() < 0.5:
+      del trunc_tokens[0]
+    else:
+      trunc_tokens.pop()
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  input_files = []
+  for input_pattern in FLAGS.input_file.split(","):
+    input_files.extend(tf.gfile.Glob(input_pattern))
+
+  tf.logging.info("*** Reading from input files ***")
+  for input_file in input_files:
+    tf.logging.info("  %s", input_file)
+
+  rng = random.Random(FLAGS.random_seed)
+  instances = create_training_instances(
+      input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor,
+      FLAGS.short_seq_prob, FLAGS.masked_lm_prob,
+      FLAGS.max_predictions_per_seq, rng)
+
+  output_files = FLAGS.output_file.split(",")
+  tf.logging.info("*** Writing to output files ***")
+  for output_file in output_files:
+    tf.logging.info("  %s", output_file)
+
+  write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length,
+                                  FLAGS.max_predictions_per_seq, output_files)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("output_file")
+  flags.mark_flag_as_required("vocab_file")
+  tf.app.run()
\ No newline at end of file
diff --git a/baselines/models/albert/resources/shell_scripts/create_pretrain_data_batch_webtext.sh b/baselines/models/albert/resources/shell_scripts/create_pretrain_data_batch_webtext.sh
new file mode 100644
index 0000000..01645e5
--- /dev/null
+++ b/baselines/models/albert/resources/shell_scripts/create_pretrain_data_batch_webtext.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+# Usage: create_pretrain_data_batch_webtext.sh <first_shard> <last_shard>
+echo "$1,$2"
+
+BERT_BASE_DIR=./bert_config
+for ((i = $1; i <= $2; i++)); do
+python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=gs://raw_text/web_text_zh_raw/web_text_zh_$i.txt \
+--output_file=gs://albert_zh/tf_records/tf_web_text_zh_$i.tfrecord --vocab_file=$BERT_BASE_DIR/vocab.txt --do_lower_case=True \
+--max_seq_length=512 --max_predictions_per_seq=76 --masked_lm_prob=0.15
+done
diff --git a/baselines/models/albert/resources/state_of_the_art.jpg b/baselines/models/albert/resources/state_of_the_art.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..70cfbb1b7b1197823343b5d3a571bb2e0a23823a
GIT binary patch
literal 121174
[121174 bytes of base85-encoded JPEG data omitted]
zkLaa%{epypEc|Rt4)`^u?o9#L3UVkt{@?B(QAs>p(E5D*cTYok2b&9^gus?qhV^dZwHN$M7-pA)X zJRAS96xs>&&iFuQFCimV6#GC!&m?=G)oooetbo5D0%!d5_!aB8BEvCw1F2ptE z-)GC>gxA->`p+MtuYS9>g&|Xcw&M=<;l?ir(z6YNq$Iz7r^Bzb{Y044yC^opXv7ru zoRym~YRM80c1G(_$M)~E)F+;lh-gUE=mE<6Vjn0V^o>4Gm}iWTPM|p6F{n!O^8S<* zN{3mF3juR*1d5*W=E_U?W~n3%^-juoUmu18eaa+Id5^u{Jjb801cvue;vfTPAo4d@ zRKGl+Ay;v7-qCRv$@j6MSE(?`_@2KbIkcwV-QQqI1+1Z9Gi+82u^G&e6v)^y=c{HG zL#*1ahv>OEU*~6(X_6$PyOD%o7|M?p)^RUe1Xu4xk;QW+TVK_RTRdx={|s2NmtooV z?q>ACQ4tdpOOcmi-=X%aI&OH1q+fz?!@&C|ZsUD{gasV0cs)2*@vC5qgLa&ONfKU#2Y{6IWf=WWfW zJR$+?y|x0Ok|A8nU8VGf5QvqY6uyC62&n~f;$lSim}I1GI>VCJ$|-Dz>&Hptb?a3z zYESH5Z!;Mru|x6m6FW4KiyOiUTVy4N0G2gbe>{CEaOO}az66T>H5@=$LhcXH{Fki< znd6pGQAOr~voJ%D_&Bs%AfeyJ$rM1)jiM7vK?XW-du?g_y)v#7Z{L0=Ly58%-oH~p`NfKMt_W<%lls3LF-gqe!nSQML@c?v^GSi!*;|MhcD1d`pvWOnXwp}t zV<`$T#OI>unV&6Jv4;u7<6XDvSRti;PTV7_6o@`22$Cy=ha~AMKIjnz6)|n9DfNg; z>ihH(>c+z5^=~xeWTl|kW~whLCd|vxR!v`C%RJN$m#AFQCigTGyqJj8i=Hgwn7Qs5 zP+l+lBc3BkD0bkAPFQxpH04gl_wTK#IO^B7FD6zKdKumw z5UiIqiuzgKWM2#;EE5`+J)lV5F;t&z;|M|_VH;Z0yP>^|1H}=;-W#ZYJ*at`ynK~p zy+VnQxyt-Z`yG_T=rY0~_@*9YIGaIe^*dq(qz5rF8-C7#Tap;19Wcvvs)BavXe&Kg zqDkILR94=I(zBN@W;mITAEgGyU#!4vFwFoC!SF^F;p8UC(I9PtzDD(@)!b!7-r~^% zocxo*svnE}F-sC_A3<5+sy~CPkEq>&JG!mVTENpIQ6hz#92?bVG9)k4(e;k+Hwu@# zU0Cw(ltgcgsCh~k92TV{v4o=&67sZ>wZ6|uIo%BbXMfhZF?PLV?)%J%nyvOIpXiY9 zHh_8nqQp;<*kPbK$Y14e1NMP+H8sIBqv1RUWJLe|> zWXX26n@#6qq_J%lJ|ZhwlFsn-Cn#%A0gQk54KnLZ1}OHa`vaWSy?-ye8AbB4xen$2 z#mLkSl>?%`ARXKOU8Oe77H@MaYq1yIW66Ct5q+OBCbuaswmrmY%O1%_?0U$GY5a$Svc5J89}3#V>We?=st4gv zvVLvQ{a_OCMocdzKn=0!I@Li#f#AXR{qOU$M%;Qv+!U!=F4kunbqVrs%eJk*>(3gN zS>fKN2X3N6J7LYXH7#APslGC9T#k1Fv(W{<9qN%CT=~xWNqo^&1RE}(v?i2+(-p(W z$@?jQcb0F@{jp0cE2e;Vgy$Q--CvLcoE?Q_s!zUFh+TCqBQrTxP8Na^ij!5*rZtbH z0tj@(YaGV(bIPKAzRJk8`qQ4s0Pc4AWB=RA%T@1X*!D3gL(qUKKCa=G`u5?+*&chk zZ3K=z^&P8WBWbtnwWkuSkgX&_T)HR-cf`8egw`aJHVM#okai|Gyc-{=!FvLCR85xH z3;*cUI*0|)uT58Qw+O%Y1&`BXgW_WRyx7yrVVLBBs&qqU5pg*=ma)V8S2032vh{eL zcD?I&i5;H;Wp~BU<%Oxunwe ze=Ue?)+8SPc?E@e(=*Sge-FnDNWIa-zNW4l7x!G1StV=;YvZ+Kp`y1YPyyY+-W(D}uT#jb_{ zcSm$|C4y&%#NlSf!DLhZm3QaOw;S!GWfZ0X`Q5hC8C9I=72kFQNzn^*NY|+UDw`{y zVhGnPU4;b zA8k*e&)rM)b3<@6yvHgdYUUJOE~8AimRE@IM$lAzYbV2|`kXsF0V#-~=St3|bHc4B zGit%`Ra9df3sR1H2-0YBp`nUQgppC|t2$E~CEEs{Dh?!vZUab#wxfOLKt-ERjChqz z1f+G?k^{+5Vj=7%>5+B9tp4%X{=KSLyY$MPtP6)cizxFa*l2Yi`%LuTMyti%4&Bx2 zC0J=Mf{CN;bxXP(mlmKCc>>iUbl<9ReOW0P7rsXpqFq!?H$Pp^ouak2i=^q5t`DGg zIOmp(&uvDKJH)n0icVT33s(Ka;Fub9G10`U9_r(+YXZ+krV`e!@F?YJMZoqhql7>) z-u?~EKrS(nAsJ5KAWqzil8AZ2!QH~^q?FO+li z2Og$MVS0aC;oGWgP{`7Y_{K#~X)7B631YK)9@#5(bUq|h6s!~^z=6c)wN3C%2SC)G zlIxi;#^HAtQz`ST53-e2n|6331?m75gyc!J8LUsqNW-ZS7L#eVFI{<#L}IW2)14Pq zSuCQn5SdJQ7XV@beY{kx+~~dnhRJtH!D0e6)K_HoO=V1oJL$D%x1t1uj1>_I$bE`} zH;gOCSt2Y6&%}B?cLxT=lK|>f79mh`ctS*3e4(f8HAM}~_#bR^7V!_(46(d${Xy3` z-}}*dax}9H2|QrlQ~};Qo;0QtO$;M?_NStA52Ewh zVdJzhuLMr=ac2WQH577V;mP6sa6d=Q-r{k^)>)5RTDu^h=2^$j&Z<0x?O&x6gLCrWAi4)1$qGw5QKo(!7Ww}^AaX7voFOS z5K7tF$(~|g8Gpynqjmb;&}8X|Bq7IcW|c-8z21W$z z4IWMchmV0?P9NU6{nKFPTXd>UO;yZSkH>~qn|FJ1>MKK!BsibPVartkUl&$`uToar zxI(AQOfUf6w4Ad}o&p9SUwmG5M^s_5A?`lv@aP>-2Fe@t#U_^P*2yXKLc# zbHlc3*Q;2ds0w5~p#q!js6~vgojH4qM|=^A$JLM@7kBRZy%zV^hkW@bjy(-fdNe8-BuxDB}0`_GvuO5aD3LKCg_q)A+t0Wos&b zw75Ihppz?&>f232G?;rRw2&YiX%KD}ojLlQ%FROS(al=HzG?=OAWLvfJn?o*>eBp z#%sOFDD@of67?w7sN0DNbG3BzXs|T8JTEd+2=~)Yko^3ySSJEf>SzsiiHxmc3ac4i zImz6ty9}}CDy)h?q3066+v7XActf*G}UdL6!qrJ$Z@59Y(E`b0T+ORktV#b z8!W!wEtMFN#*rY{nDnB1DqaUf=X7PUp82uq9Ql-Up@SVwLw81P;4CvCfR+~|KVU#z zOn$^2DpWLVb><{~ihP0S@{i%^t9*#Mg2u2xx}4^mXJ}M?I~73m-%hi3#I^>4GRksS zt@Tlur9Y4HZhM{C?RT)|pijPs91@Y 
zy%5y~-7wPU`kMU=fj7OK+BIVwFvC~mZ_}H(*DJw$h=uK|W*sw}yH!+Bo^Ys1ucF#F zYi10V00E213kz8G&$xZf#dZF4nkt}UmK2KY&nc&QV2t&=H#_O=faU$L8gR%!&A^%Y z1;MB-*1;3CYT1RPVs)p#KKC!k8(5lEsG34*9PYS zE0T%qqs4VfY_I$gNsB;dW zL6(ejZtSkHxgayL7+F_ODcAcBY#mdee*hJ{Cy?qfEfn zsp}La3nKl}<+CMA>S$#%hshcohjQXr2d`Z#C5*9-sV8!mqB?IMbR*hD`T8nDG!hl6 zQd9+$cNcP_pXs&bwJG78)q&edif@(7BYvIhI!!iQQyn@X@TfuV_@4{c)=7h5xM*(; zFLnm^a!FNvUA?kO2^ZcJ7fG30Ak*>!Odlq8@ z9m|yCx@&drj_!0Dyg?XNu;aBDRShrZ!~A{%SK1hD%>1%K5Qc9@>_%!hcl|w?35Bhb zt!dxWu&a3OofMy_Arlh~&vmbTX$Z0G&eH3;tWG{-j+g?o-D)DSAGrXKuOhOk5^V|6 z-F(`;_kkk_1Y%;El}yWHXDTWF222iwG}I(phuNjMoC|IkBCs2eFknHVQ^xO-Tdaip zAQ=#y4bSBD=vv*lixD6w?9y6Cx3I%~f+M^QWXtcSzH=Tf13MS6tBldhPUj`aX7!F3 zdLvg6avq=}{Eu{ymExYIX!-RWvk;%r)_3EK#eMoQW)R3A(b`j%??zRR4Bt!D)~_tq z-#DJe?S^CX-lhW~JC58tC3o2GNGe{O3QR2}(Nv5zH4aqI;GL=zxh+$Rkx z1Rd(a+>&Th1x&! z%uago&#~m~ub+1#{PB!Z1PDqQFy>pjD-iaSw{l*f!2&?DP2~=OH;x4wrgcq z1G{5`Z4Q$!`aF_DTZZQ&ImzW{kgL1^%Cr7wKewCHsDJ<2DWRnwv~Ly7)uy|6jq%DW z=}{T43J}dx?ju-DKcIA0zb>D;@h;uaE-4AgNkk@*qXD)0z|zN&o}_B1OSdlD;}{K^ zElK{|nxQ(R8h6f$9<>cDqmOV&&D2eL@xkO}?%g9dTN4ir5Uyj2V85%!>1{sl|NFBi z*!pR`e2CxklHF%MSjNfZRM9|Wk+}}s-JC6ffS9}5OHd*5+TW6 zCq&31I<>0=0~_j*FOp;C8i!U0x0E7x$T!-c@T(e&BY7Tbg-BRm?SWo?pBXx!YQ%V@ zjIvwt=s{WS)^vQzE{v0CdmgwcP=Id0z@u~A#~wY~`NJtm?^%<@(D++zP}biYODHJJ zNPMM0hwNvk-1ndOWaq7wgw<$G!BPMrms3pnhngaA6o^@%X9sHFl&QZUHZ}T((O*{% zfwKF5ot3mo;hi}mZ#cWr#grk23xPqF>tclyI+KgN%A=H{57{Sd{;bc)@)xnI-_Oau z6!3H)9H@6XW3IhsCwDYb7qw=8QVwZHgPHtreVyuGDFKR5ZznCJ-qGNAyUXx7_l1}v zgo+F!%D5tb(P75ag|phMzY>QeIEj#L75wJuWV^nKQzookTQ%X{1a^J5Xb|uZfd(_B z^!GH=gG)ROC)($MVzu8QeF}&@@&j3KO!F~IcyjRev_e8Yb`WMg4X$!6IkGm0i(q0B z= zpzNQMs7?4Ba~6{1Zv=r>ts3W^Dx&sPrwXZ_g*^RCAqK|7Ft*1{Xf_-AtD{x41W%sW z$G!!|zT2xQET0%n>b4U98C+VW>>+ErPffm9GO_ZQJT|CM42;=dhDVEw_&&P7H)Rvo8n7t{#fkvqO%{`Y)gwi) z{iR{xqi;D^L#q7e?5Q0CmK{1Y={Vr8Mfn%ARAl_Q+4h^Nwj2HjG*L09Uv>^x#i! 
zBcBO2;FCC~3jtq<6Zd%O%pNf;;exH^!xiGoD(?XFf1zwH+CE!xadxH-65^U4MV zz?tR0RuTl#B&`Yxgz*XgYJ*>pA?>TJn$1!DzAzp@!$$f1&Jt4*M)IPY0d$5le*cHf z-$PobM;o>yO9#zc>2^qJ@X0mkNAgk+apJji3dOq1X%g-!znxvmlq0LFw(WAK2X3nu zE}Mao)E*5@zh?6ux)B4d9wz`DsQAy}d?Ken?2Rk6&zhdiD`Tf4(ahYJJvXTm?9^vo z_O2*37Pw+^fW#$xo4Dr2Cz49KqRr@Wd4x~{xHu*%D2wS5e6duG&fhGnmRG*U>MTnE z*5{;20E~rL{5$$|LK_8Vne( ziqqs0pG=^Xb90wbP)kcvK1t}t{SrmIqy9VKH0$Z3;lK*Ad-GE#<8C-zC^|MT_+$Bd z$<4_p*!CU~}|T|Q8$7u&ovF8l>a zF8(5d5799uX;<-Jeyew9X z3maURsPc6@JFPLI45@iIMu6b?`@_Y5FFr+MdivcCmvrhOV#W6)gGd_}`H?_=fVlO$ zJM0pgYp6Kah?0ycfW$!YNp!Ivt4rn7w?ol~mf z4N;z68K2Dk#x$wc!z9G9gl=iWI4W zbm=V!NJqMK2ql0>m)@%sK|qj>fFQj{4G7YUsPx{0(v?mCr7P{78^8Da&N*wHS+i!% z{5WUU{9uuUgy+d!ulw5j+B==#hn(xRmH^ z3!fYg2UcusUDr`kx{bA&>#fq$j0)DKaX!F)Zq>v9)7^XLVatHwV2c^}pyp|2m<^cf z7b~H@+fKu`HBcLzwi6b1KPx_+iD-BjPP2SMbq@Rf_B-A*HHAEWk~SecV(IC=naNe17~IQdf=OnKT=?dEd^yC=pv*&kBM3+Dsh z4yi~BK7AX)b!A}Cz9d(%0hE}DU+!IcAwlkV6}V$i*aFXm1kXqDfb%TQQIcfkM+0AZ zEA(p=m(_h0KD1Tz;_=5VU)0N3o$o9xNKvWc;!PvKZ0P+cy~xntd|3+}-1ynW(_yB< z=%=AEHVbd_0jcQm;AV&9w!a_>Q=jUWR_I7{O|7TT15|+Q5i#;Xw&8YVVyXI85}GE->KORXIr>@$ zd%>9+Um_CJ;nBf|ueqN~9QVG?yg+f#x(i z?(T(gocCL7M0EPxyXNkQZcBx}xl&UK0a!bbb4Yif%Xq$b^CPcYSQxr_QkLeL$&&vx z9I&r?b=Ea?)T_!5k9z4YFh-{n4#4C4*ODCBE?4l{0G9B56 z`En3{Ahc|O@l&n#qHmX(AyWX=PNj5|%(#W+(1xq9nGD}!UXv-hZI#N!&!{+2H9dHe3?OhxdcAPd~{M?QIDB2kG43+ zS^@$kOy&ti&6L{h$0{UUf2U#>h~WcSN!CmvKfI<^)|dO_bP&$0&i7Alaq7R6y|dxh zW&cFyIyV#ZyHDqwkuPLKN-^!gQipeoP2OhW)18OB-{OvP-YQA1zCK{(Y?2huDJ@Sf z=Z)#Vc<^XE{oe+uv$ODyvhnBD)Q6TnZDwr~IlTe#VuR5_)A9Xfi(sJs;a*~b4lX4@ zp3)Q0E_lBq2Ovi8mQv;cLt3dVO)Hf|cm?a;?T#<{i~pEP`iG0Z2~DnG-kYbN*M^AZ zTZg;+9MNsD-&)?2EprRvolwAcrTTbx->6&LJx6X1No4Y>oMS_zHKUQ8epO*s^$-b& zWVm3zAJb$}HMbEiSN8l?SK)-cstrpIr7O9Yw5}1lw;D72f*&H?AHsm@^MbK`)tHv{ zbVIOC?CuhJL`65?x|nm4fI&V$a#EhJ+1M@%RA67F1v!fR#F<%Nd#t_XiYls_;ptP7 z6B~JiGHh$Nvr$uq2=-w{ZOXuq=f15%)w1Fq)D{l0M-Do3qb;Y>*qTtcsn=EVV+ZY( z=akIO5yybm#@PE8WKJ6W{;AlBEtvJe#t@Vb(Lxf&jj&=vgBZ}c-VHWT4hSAv4{4+K zF0y|gF%0Jv6|iMdIXAL}2zinmH82>r`}keHN>%+X5!2#NBC99-=S2r{Q_qSe3b$uD z#pF=db65H6jkTFkw{qGt(b?(VE<6w7Po8}$m$n9alGzTwb$V7x?Y={tws6}Z>L#&A ztqv6rLNdj!9gMIwZ>_;eeiMOt^bxqaJ3o)^zx6aL*J@?G$kqsVGNsa4{+~j zLs>ced0udfz9%)zaA&syh-7~td*aw2GWgKyTf#n<_w`g?gw;j|JPWmaCU&qTouyL9 z#hXLrB4-CvZQ{E@XZj!zXxi5Gf$c{s?-Vq}M4#9 zg*YOjpcu9Dj;k|Tu=rG;he6BnB4Rzlj67#mhMtrWOVhriKIKTI(A(mUe7H6MKdW4+ zbG^)pG`Mg(DkYjMBxM0+4!>wky?flk#?|nH*BuXapR1^~!Du?>r&1eRUVOEB%i9j4 zO9lkxgY57)4j9{Nzr&j;wHqDJCijrHXo@WZ7xDH{LfhYv^oeisGnw84oJo+)ulM3ki<3 z{pE9^xA+4eGTR)Z=EP##oG~!UCTR3}_R=Ub2&bM^G%CKjf`s_)jsE$U_>c_@pMJ)c zsJ@dd>ziwFxricL`&-}p^!1z*hsi;%(L0JFSv5TMf$8k~VS6X>?`i`S-lER3%rp5d z^_JG$Y22ot$>KUT?lzO0`;wf*LTr(e%l7UvtSljlnvV~QCQ(HbpXWP-l3Ow}ywc7i z+0N}30DbSa^v^poG*PZ^Cr#R6&dWxr_A?3h46bbtlR`iClomG5Uqpi4(ltn*_r(?( zANOYx5yT3kMMc9{p-T2qVrg2R;c5q5Z*P@qDa`ob(hYH)N-?In))$lM%6YAUT#rW(n7)9B^C z$}!3aaHlC3j|PD2n@ywxUHu0K%3hDrg!=|6hCvK~n@4u}`!WR-=2&1B_m+YeDC!KG zPtJWAPQoE36T!>fd?)rt+W&k2p@{Oxn;qU^v_V*A9-#n9CU$7|rTCBg1(v^Pwn@T4GKe}9xV{X08djS0LLnJ` zU9|oM@dJ$Btvz_kzFF5t%bZRvoQ1;e-ulFg&A7#mHVcR93LRv37KJOIlvKOa&%u8) zaM0?WqHSLUFMnsqyYz%kDqJ1ha~}R9o(K`}B0x1H%(OVhHd#(cO-XjW9ZH%>YJ8kP za!t2tJGcEdl1_Y|q{xEh!~g@kK{y@(MEu3}2T@$tj`Q{yg_(s8^zLd+j9& z_g31CVJNqTkSWPm5S*`P04-mI^;C?;*+X8-`zoSG6zN4hUV)CER;^9v5+~7fYkW1T z)t2}`zV7|s&R7=^t;So%!W)}+d|#iSVpaF@YRKf=+Nq>Mrd)2A&v5)`wW#7Im5lM8 zX6Vcf0*=>R6+*^88ax^+<->;UcPjUsLp9rKZKdi4`y7G`1v0)ibj=07sEqF4bm2jJ zQH@*+7{zY>J6$G55d53u-dWFYgGVOe?dOqiE|$;Bt8xh$u41*TT0@P9Ze)wy;C@YP zfqfOkO&!q2{i@EU%E88SJ24oz%|R?S$<*oR>IJLK+pZs%H~8dis#sMr_YO4V9s>6h zK*9zrPO5xNku9V=hVVi(_d4HlD3y9w;fiwW#r9h5;lO)8dzRoQL{QaL2|-Ybu59xGDuyt~4tQ82 
zr@%D13eGvNVz@UK2gthcZ$EzRyPOO;k3vm?>fK7ZWk&j?v%G@b{T5`-zRdqJ;6erW zYDnjUuo?0tO{Xzd<%|n$8rl;Wc-)c>adl@tH83#oZY;;%QX}c_v;wk(_i4V@HWfx3 zBjSYatUo)d?54C0r*2NUcBwyL7rcKXZ3FC)4t*u2Oq7&WA9oehij>E{R>V(5u^*sl z$|T+>7&f~&Uim=X0y4JXvc8k=wvN&0>hj25Epe}sR;9}_zLQwfGa9qa#Ljtdtm4s8 z90%ih`x62;&fjjt*^QOJ^e3Di(&av5x}B;^OYi~5_4;=3=jJj<=s$H@CBL=*cKT%? z?*Y1XADpeSe=eTapL2jxvuF;Ow07$m3uD!yN1cbhRp_{KUqDP>d|P|_c)3H5r z=lnUCA0wbjHK2I#XI;m`1&f#%SE&@KM9&kEfFdy~7GPI6AKX)$vzyWL^Mo=^^P~jo z$P#y{V!X5(X3p!6-|j$|V4_;|@(!djz(0Dhb^Kxc@sMPF_4@60)ikw9n^av}Zvm#0 zqb`Wr|NfjMB%_^$r%>fRp7e%?zpa})(e1RBa5da_C9&VM5OCt6gSU!CEIdob29jgN%5BEAdkr zKTWYcX_e;sZ{M@i=-En375R%|SImhl^HDBL@1G@&-p8v>JTsQV;w1i`r=(Z7Tz&O82(~UBAFKI4jcvm=@qMRcn0rk(8}{PCF?VLo+7%CD&uHrbfqP2xAMa2Vd5n z+QK7!EWMjUKGZ_nWKpLaSX9@}gv4m$ACKw*;t*9DzdhM}55L0xcQJw%lw+wy?z>Ni z7+^ZbWm*wWphSZ^hqv;*T6mU7@v}}R7qb^_`ISS7mzu-uDICavle&0&p}EX!96hqc2q9*qv~&e$;jh9bM# zj8X<}?Xb%|Ae9EA=|>+qn{6FNSTt%8=}vlH$ep@z=xG%|oGSr$qsr0t3#Avn42*Fd zo?N=g=f>Dr_qlJeVHxP_Vc(bcy}>$DDRcI59gb5#F+a%Tg2hYwE9x=@P!S$)dN-gs zED|YRWv<*)odeJz*DZsI)t0Z^xa%6-z#N3og;~3~!LBl0i!_E$_vr1W$TYfqWvEv1 zx>O;09?($(BTlEa0^Vp<)cEHZ&%aQ<;aAMFNWn@D(bE;q3--*5Z|b_zm7DQOIm-OO zRrW5ql6G+unIxPiL=)j6=LQ^KsJs*G`AzzSU&OGAG2T;cD=`kN|I#(q9FMwkb(xjn zL7~a1XQYJ;jbkJ70tI_1$T>dJmIM9`m+?ie;>~YzDG)2VKSFo53-X>5UGRFThby;# zzcpnYT47rpz@sfE%M9ewh=%0BquZXnmyh9vo|O4}(!`Ly^z$yeifuWjVGQ1*#n2ddYf?h%3T9dctbE}(fi8l z+=UP*JSDGh<@0ENMFoj?!8Y%w_QOpy z%t(E8a`R<5VwUk-$e3TAwZ+fg18(`dOw?R_tAojGr}>!}1DilJ zP<8CAt81A^FL-|4!ZJZ9lt=@ew4jk}8dA!@8I~Q00raMVV7dcj?G29m-GUkdT2H)B zL=ybSIJPuouar2mn`_#$FZd|*u609QRHvG53C;>ChIxdFNph}|Lt_hP=D*&Fur?vf z3^Vf14nD5k!MRSobW&M5Z>h5b%(@1?C|gdhWC$_TOj;MM2i-x;j*PV$Ud6QC8P?DBX7j!e22RV}! zRX~m!jTy+)KUIPyQ^9*xsK52Gg0?wQ5!ykh{k#DmBI0Rdlga&az$RA8wX@ z0&f6>u_y1$6d)1ncKTEhud;e?pA3h_ScINOb|W0PJX!ZSNgi-CLz#86a#xan7!(~) zO33BKKO~FJh=zQe;I48k^iI@^#6)cBZuviM?HtXj;lm)wwQ#NpV4nMFY!$rSeOu47 zg!DazCHb)&u@!?3a7Ji?M@HY|VOnE?iSak9AG3QXQ1_md*ZSq=jsaufl1E1F%v1EV&2A5_&w_l< zD$-L6$7FB*5qlD5RWWF9Smn4pPih1&Zr-ETobWriXEC3NI<^mIu6o1U&w%KL?bPeX zI`ES|cqv#aW}XnHF*vjY3moJ)PMwXp?+GKeh`+@o#{jkx?9Ie>-F>3jn<7M83n}^I z_JvQY6-myz!E#9Gz=fP@3Me0^j{YVC*VLXkBG~NA)0T^YHa8msrcQN^xzXfZ2w_+z zTBEqItuyrdGy!|iVyug{4EYYxl1!R?6YK{pa}w0z`S91Z;(??(`za!e92!|PvuKzX z{Q#lkkucBMU2btQiXHRq)rt*ZKoTPDycA!^-sj|?>Ro+N;yKH8CT1$7O$*9;_e`Z!_z+5ByH(QqIc11Rrg!LMUFAZ|=`;r@{`3 zEs=U!IQJ4u=8?4q4@>a60dg=Q+~OoRE=2y!8%vpbC1VaWAjW}f!)0bKQTgd!3sCf| zpEgLud#vB>t^%)CfN0l6Xx0!oq`(Loz}J02#kMH(DoAr0(CU)w(d1EI7?ZL8V zHUo8XewRBu_B-SAq$8YQGgYP}PZ2VI6-yXT?_lw#;ggqK?O-+_cong$L?4>SKuuee z*{Mr*ch=hp2y$bgeJm@N6etCVZcE)Q%2>x%w`aeAxW|8yrQC5xq3GQ+4dt!CEwC}h z7t59?NW^;nmY>K(cL9W2yMLT>jg>I)dKX|XyYFqzyz*Ient8rEEM&@U{ndkMM=E_t zDjL!=g>tUalE<%v4y%-V=dW>D$337;FgHH}oJw3t7wcf6Gc&%EswqvkMx4s*f0@hu5p@-#X&j|yExw2{|j_!-icl8ESGf%X5E@wNSO)Q!4LQoc5J2kOt0!w=o?5PQBMp}lUfz`@c)eC zHlVAz)oeSfr$T(~+32>TnIf%r3+v*ijO z>R&MWQnuHoGa_Gc%_bEn-*fD*dnxQH8PDg3 zFp|FUec;OLHcAG{F1M?kc&#!NbXbO6=LJ2Ghgvwis3$)DFUdJ`?+&YA3{W2?=$i{) z++&28|8z3|-ReKye@d|4Y^}o}25qwsxRCO5E@*3Db_c|#DWF+wOU<`x8Y)07LCG$9 zil03`#{GSN;?E|Ax{_dC1tIr$lQIqvaxY%kPiHXH&DIK4&UaJ~b54;%2j#8Hzf`N) zAmn_99E@j1+IMcnxAn+~^MFO@if!{h`(gOsR_AOAlo_Hd!1s&k@1$cuLA?HOaJ2jl zcmmfIO63_c1}kJgNgYBb|7CxmH6r1?m(C)J_4W_(AL|bVYh==iwKZ1fuE-4!152I6 zvg9NP^-(VFzdm#&Bdh%|W-kT75vX9k;jX+tvQ}+DEC^uOe>+4ug!b#vS$XB+Y8%aX zWuighX|~OB;tpax`@ZjB5*<{AOj~O>X75IY)^bUOB#sKplfeqZP%&ZXnD!r;S|#@M zh7-e)Hi(LX2g2Z%j{Oit$GSN)tjP1Zl|=|;^eQ~Q)>t)kL8$`^)bz~&6Rv5y${y}k zIK*yD!NY%krS~(JH`hC73&~&Pz{Ab3SzMz2``Q!g*I;);{*A)x^UFsVUCY5bUTkqE8@3@3@(E}KEv~sJ}s9KlFj7vt@p4r+HR zi(Zwmddz77+a+5Kzu-FSI3?Q|JZj*;{14;2LKY5|pV+|qj$E*CFUnH-?l4zon~+jj 
z2X$h)VOSVoUw|(7cYXPvJ-$nI6AzdAhYW#&2jz|+CR?p{n4Hdb(^T?5*kk^)z~|3z zEE3f&zssr8ix)Zb`Kuwj_R5~?4Q&VC5?;Jm1=WiG=rTc-n=FE2AAzF*dbtt(>r@&Jt__)X=$b>j;Q!*WM&!Zq9LisVxfzqIjmoO?D^3J zK0blPDXA7U-5SI=cx9<7$FYhlLNy}6&zGMWCvCb7wy-8C-hMuc9oZV?B=tqiw26XH z-A@9`%L_X~ZaIE9+tK2Tx<lTnLz>2VXsyU%fTz1VHV%)zjGp;YE=@ZAG8}QmO}H%& ztGbTkqnV9f%hT{_GOyK?_`88++9e;(B?xkG) zUy#TH)SLTw+&*|72VQ;u_F^tk3#Qr@`YI-KiNr%&dN!3rjNYg^3#jOo^6g;nG)0O^ z%AQnEx%}`%Q=ar2;X=*;1GA_hVE{{c-Vn>)KKeRSDr*v13UhR7h?u(12-ySb?$1H` z^dIWl-h4?+Kkj$?NQb!hLK;1Lp?O&OkD^ZLCuOGd>7+MY7(BfhbPcxI+wR_JH*F!o zMZf<8pgzKwGF#&MXdFV5pZK_y;3lz==4TQMs`C^b#{00Qbl)qMyFWfVT;u#wL*~Y9 zlf#h)lnoNc)UP9}2bcY4E)kK=C-j+!OuD{HsWD+Ub*c2Kyv*8T&3 z3Mk&hE4R-v^oIz1o8n#A+nEta>BW}}s!cO!&U(q+03~#Edp46_1A1U(F&24M$vkj* zjRU4%$)pxsTm9K-ULW>oPLVAtm2s{tF*)9UV3Ix}9v4Xd;<<7s1us2 zyHzb=2NgLanSEjsR?`m`JukT4j0R7e1O@|sZCzHWx`zuidlA<~OUZuht@2AIVW+VO z4y?EwiD~Z-9`EMn2#HhVXffzXqL?BB`aN|1SfR)?nW1vfX79Q5>6Ff1(LoyGQ8R7|XwgBychL6eIf z4L!9M3=v5T;Sw@E=p4gwdzX`<srht)BgL3`eUmb85|a@JXa+0ctx9zZ2Lk@a2ElhM*&(iC6m?~Q zX(ILL#11Q~ZuMgIogF#OI`_OZR3R?U-Fc?_C(&{qg-8@Yj1EsThJ$}o-P*k1fO+rl z-WP4{S;>YR!LfMrH`j#9l7|O_(kt4G4oYUb7d>wT3z_mj+?$a?TjTFzD`a~C88KG+ zQeO8GJ~0YoO=ODcyYsoEuVMqNu8+i-m|`yGVv>JoVFo{UJ{x^EWHNy#LP~lu11Y}9 zgBSJIe}F`sRf?W(dbLX!y;uBZ@GH_Iqv|^g&Vtzeh~h;dkrx0)GKM58Bdcm2%uWyV zJoK1Gm)N7;>p$xq7MQ>wX_U^x@=h7n9M}P)E}`pcoyM9UoT*WCzQL4#&`{cCHc=%t z>&(WCn|J%j;%_hr%@El~nPgcFoCXW01vQ0`GfR$V!+wxc`~t~~g&c971(w*qATJ0> z^*wz*jIvoWm%H@gyP@mY#vf-wg<6*1szanM_mOrIzqG`c#YU>?wA6MK*9O(>L*AYB zoI(+GEQYlWv3~fu_@=imW%n+Hq26O_N%ObAc{kBTr@zEz#nF5WGU{R*UTO`k0p2mz z?pX1UP^?qC#b7V|C{Igg?p;#Rz9M#S8F6$aiv+}C@geOV!l9#cvkCky@05%q)ml=I zCkNPI&dkv+=bwPWXWC(jgf!SR*)I+*IH2zznSLr!vg(r7_nzES|4HpFAH<0b`WFt%(;}rus@4A_ zHcip~1^E}}#iTh65hv296D$uUr#^P}PM znm2G;f!t5x92#1vwwL%#A>sqe)B_7cZ|CBb{@JvWAa|nRJ;pGZ8%Fa$=`UUM=ypUd zGHUm0cpm->x7L`IGsNwL5&EKq#cyjTwXdiPOmmfXTx@>CEHYh1#XMr$ncB znsw2YqV2KvJasWKm44v8?KYS#_9b;eTqo&@NRCw zrEHFNa(>g^-KK_h|{5DQ_Gwuu&qjXJ8?PfoC zPs;sZDS=nw3x zrOzLhM|}~pEupIP9=ViH4gymRYE)yYe3D}k63*ysBXOx;WJr?|JCe;;5kHK}6v9W% z`>iSN-ks&AG0WFzhW2o?@95ByCdU?_i>jy4e%Ng@+X=Tx+Sp&TFONwtf_V_s<@`Xq zHm*p75F)mz{vVoM_od*IXK5ubf;k7rb~@3IpU1Ukh69v4qQK3;o5#iVQ?uyz%B@?v zGji5ZFz96R1?OGQ7i{MHE!Ae}!p-e8o+1ADkaFy`0zXHTa~W;sd(U%h!CAegILvlN zXow~>+;j}kM;U0-(#1|HZ08st#64T*GdIM`;WGp+!*m}Th_OrF(P0JEyx&U1- zF3PrN-kHjo(F0-njdd>X?Umgoe0~Eyym)4bRDNnkOhsODf)e@GfA0gv z?k8xcF(>#@m<>;g6tT{!)pMJB_raTGB%j#9(mpga*L*k&EK5af;bIJ!r-~bC85^9| zn>M)ILm-d$>%zw^^%+>KxQYD<9>&72>Fr?bigf{YRE@K3HPCG? 
zGmwDHoPOOOMXoxiU*k$r-Uua&i zZj9g`oP8fc$6RVLO(9WCp%!_RbUwIsiy4gLIni+8&?dJ6ot?s0xs!wI$ZD+-A(<*0 z!t^{;4}G!i!c&;ob&{@30YI(Q!}30!e35v9)cM3`+CQ>q4eyaD_V{kJ9#K_R!BD)o z#*lTdCPMTkxqa9BB!~neD`vxqObnYG3s17_f*jiawXXq@i};vPFg%KA2q0}Y|I=f=?@+9{6$`SRloop zwolcm+tx*{P)i}08-7!id)Ooy!OP;XmI4(3LMh;>=Eoc|AmVcmCK-ZKSye7zJ(zgM z2JDu^0*M9%Ro`>*?&I~srP>JNsvkxh-dU#rz%_N^76G@yv0&wb3jGD70!0)fs9ID0 z=Z%w|;FDQiF>gh`BP@#9A?SdWTxjhe-;tKGK@QIUtGbokTIU@NFzq z#ML`vE38y~xkwxoIymz|C-YKB`xzOtirXC3>3mWLv-G<_{WQO)8fCpfs&)js-6_7h z^JCVghA5*q%%Zq42++)t`Gb^Sft;MdwIiH#sZuZOlcc>%45RJswgcpmj-8pG{v#|n z<5?2Wy9aqa@*lR!d9Lm%x*`5Ey$7~x7l)9i`?F#&|-LJB$aISK-gs9t=-6~y91-J@sV#2EPdDY({Kuwxsku-s6vseO)NT6XZWjM;%t0bJX->s6X^n#g&L_YryhOp_Jq z0NL+?7$~~oF24|P8g>jHw|9T|m_>l`l%%`48Q?L~sRKUFqxbeaPtl55!m0R(yEe@4*rg7eg>#L$iSe7) zfxB47RfXL(ouN_Pl3ZfMuzGWCa`QW-Vd4n``1Ao!r$D*TO%jkU+2Vll&fcQ23$YRz zD!ictPkuY!S6ptaNtWEgjMZGmFFuNU*So=`RNifsEQY?|9c~jUR`Jo{Ejd)vWJqJ* z#Z`*Db9B+VDru=4O9PMT{f|PvvMzBhK{i3T?5p;}3SZegR$^v%Pk`WN438T25Xdvt z)NjXxT>*YWs!7He2FFCI?)i@++R@_~zpZ0>>=G{4T{P{hPzg?{Gpsg@aS--rrbOgW zd5v_l8R1Wa<)L^?`Khb9?HycHU&R=0ajwuvB5mdm*XbMQ*>)Q@B!`?8TS#>)M{!p* zz*TD@&%ru}c@shH`KH^cC^xxronjm;5Vz78?4QpXg$}V_TMEHO)<@hY(2uVP(@v*I zPWK-%gnFkdH`Q&=s*{REDQY0};LZYQtLl(N?Wr9Ebx~BX7{o&!zXFY3OTh;{bzUE# z<1w}I`|EGJer0-yfApza z_Pk7 max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) - 1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num + 1) * extra_len, len(tokens_b)) + tokens_b_sub = tokens_b[num * extra_len: max_len] + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + else: + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + return feature_list + + +def file_based_convert_examples_to_features_for_inews( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + num_example = 0 + for (ex_index, example) in enumerate(examples): + if ex_index % 1000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature_list = convert_example_list_for_inews(ex_index, example, label_list, + max_seq_length, tokenizer) + num_example += len(feature_list) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + for feature in feature_list: + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + tf.logging.info("feature num: %s", num_example) + writer.close() + + +class 
LCQMCProcessor(DataProcessor):
+  """Processor for the LCQMC data set (sentence-pair classification)."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+    # return ["-1", "0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      if i == 0:  # skip the first line (header)
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[2])
+        text_a = tokenization.convert_to_unicode(line[0])
+        text_b = tokenization.convert_to_unicode(line[1])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:  # malformed line; log it and move on
+        print('###error.i:', i, line)
+    return examples
+
+
+class BQProcessor(DataProcessor):
+  """Processor for the BQ data set (sentence-pair classification)."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+    # return ["-1", "0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      if i == 0:  # skip the first line (header)
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[2])
+        text_a = tokenization.convert_to_unicode(line[0])
+        text_b = tokenization.convert_to_unicode(line[1])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:  # malformed line; log it and move on
+        print('###error.i:', i, line)
+    return examples
+
+
+class SentencePairClassificationProcessor(DataProcessor):
+  """Processor for a generic sentence-pair classification data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train_0827.tsv")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev_0827.tsv")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test_0827.tsv")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+    # return ["-1", "0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      if i == 0:  # skip the first line (header)
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        # Note: here the label is in column 0 and the two sentences follow.
+        label = tokenization.convert_to_unicode(line[0])
+        text_a = tokenization.convert_to_unicode(line[1])
+        text_b = tokenization.convert_to_unicode(line[2])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:  # malformed line; log it and move on
+        print('###error.i:', i, line)
+    return examples
+
+
+class TnewsProcessor(DataProcessor):
+  """Processor for the TNEWS (Toutiao news headline classification) data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    # 15 label ids: "100".."116", with "105" and "111" unused.
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])
+      text_b = None
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class XnliProcessor(DataProcessor):
+  """Processor for the XNLI data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "train.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "train-%d" % (i)
+      text_a = tokenization.convert_to_unicode(line[0])
+      text_b = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[2])
+      if label == tokenization.convert_to_unicode("contradictory"):
+        label = tokenization.convert_to_unicode("contradiction")
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "dev-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "test-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer):
+  """Converts a single `InputExample` into a single `InputFeatures`."""
+
+  if isinstance(example, PaddingInputExample):
+    return InputFeatures(
+        input_ids=[0] * max_seq_length,
+        input_mask=[0] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)
+
+  label_map = {}
+  for (i, label) in enumerate(label_list):
+    label_map[label] = i
+
+  tokens_a = tokenizer.tokenize(example.text_a)
+  tokens_b = None
+  if example.text_b:
+    tokens_b = tokenizer.tokenize(example.text_b)
+
+  if tokens_b:
+    # Modifies `tokens_a` and `tokens_b` in place so that the total
+    # length is less than the specified length.
+    # Account for [CLS], [SEP], [SEP] with "- 3"
+    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+  else:
+    # Account for [CLS] and [SEP] with "- 2"
+    if len(tokens_a) > max_seq_length - 2:
+      tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+  # The convention in BERT is:
+  # (a) For sequence pairs:
+  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+  # (b) For single sequences:
+  #  tokens:   [CLS] the dog is hairy . [SEP]
+  #  type_ids: 0     0   0   0  0     0 0
+  #
+  # Where "type_ids" are used to indicate whether this is the first
+  # sequence or the second sequence. The embedding vectors for `type=0` and
+  # `type=1` were learned during pre-training and are added to the wordpiece
+  # embedding vector (and position vector). This is not *strictly* necessary
+  # since the [SEP] token unambiguously separates the sequences, but it makes
+  # it easier for the model to learn the concept of sequences.
+  #
+  # For classification tasks, the first vector (corresponding to [CLS]) is
+  # used as the "sentence vector". Note that this only makes sense because
+  # the entire model is fine-tuned.
+  tokens = []
+  segment_ids = []
+  tokens.append("[CLS]")
+  segment_ids.append(0)
+  for token in tokens_a:
+    tokens.append(token)
+    segment_ids.append(0)
+  tokens.append("[SEP]")
+  segment_ids.append(0)
+
+  if tokens_b:
+    for token in tokens_b:
+      tokens.append(token)
+      segment_ids.append(1)
+    tokens.append("[SEP]")
+    segment_ids.append(1)
+
+  input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+  # The mask has 1 for real tokens and 0 for padding tokens. Only real
+  # tokens are attended to.
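+  # e.g. (illustrative values, added note, not from the original patch): for
+  # tokens_a=["我","们"] and tokens_b=["你","好"], tokens is
+  # ["[CLS]","我","们","[SEP]","你","好","[SEP]"], segment_ids is
+  # [0, 0, 0, 0, 1, 1, 1], and input_mask starts as seven 1s.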
+ input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. 
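+    # Added note (not in the original patch): repeat() makes the dataset
+    # unbounded, so training length is controlled by max_steps in
+    # estimator.train(); shuffle(buffer_size=100) only mixes records within a
+    # 100-element window, which is a fairly weak shuffle for large files.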
+    d = tf.data.TFRecordDataset(input_file)
+    if is_training:
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
+  # For example, with max_length=8, len(tokens_a)=6 and len(tokens_b)=5, the
+  # loop pops from a, then b, then a, ending with 4 tokens on each side.
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_length:
+      break
+    if len(tokens_a) > len(tokens_b):
+      tokens_a.pop()
+    else:
+      tokens_b.pop()
+
+
+def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
+                 labels, num_labels, use_one_hot_embeddings):
+  """Creates a classification model."""
+  model = modeling.BertModel(
+      config=bert_config,
+      is_training=is_training,
+      input_ids=input_ids,
+      input_mask=input_mask,
+      token_type_ids=segment_ids,
+      use_one_hot_embeddings=use_one_hot_embeddings)
+
+  # In the demo, we are doing a simple classification task on the entire
+  # segment.
+  #
+  # If you want to use the token-level output, use model.get_sequence_output()
+  # instead.
+  output_layer = model.get_pooled_output()
+
+  hidden_size = output_layer.shape[-1].value
+
+  output_weights = tf.get_variable(
+      "output_weights", [num_labels, hidden_size],
+      initializer=tf.truncated_normal_initializer(stddev=0.02))
+
+  output_bias = tf.get_variable(
+      "output_bias", [num_labels], initializer=tf.zeros_initializer())
+
+  with tf.variable_scope("loss"):
+    ln_type = bert_config.ln_type
+    # Added by brightmart, 10-06: if the model is pre-LN, we need an
+    # additional layer normalization here, as suggested in the paper
+    # "On Layer Normalization in the Transformer Architecture".
+    if ln_type == 'preln':
+      print("ln_type is preln. add LN layer.")
+      output_layer = layer_norm(output_layer)
+    else:
+      print("ln_type is postln or other, do nothing.")
+
+    if is_training:
+      # I.e., 0.1 dropout
+      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+
+    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    probabilities = tf.nn.softmax(logits, axis=-1)
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+
+    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs,
+                                      axis=-1)  # todo 08-29 try temp-loss
+    # ############# bi_tempered_logistic_loss (experimental, kept for reference) #############
+    # print("##cross entropy loss is used...."); tf.logging.info("##cross entropy loss is used....")
+    # t1 = 0.9   # t1 = 0.90
+    # t2 = 1.05  # t2 = 1.05
+    # per_example_loss = bi_tempered_logistic_loss(log_probs, one_hot_labels, t1, t2,
+    #                                              label_smoothing=0.1, num_iters=5)  # TODO label_smoothing=0.0
+    # tf.logging.info("per_example_loss:" + str(per_example_loss.shape))
+    # #########################################################################################
+
+    loss = tf.reduce_mean(per_example_loss)
+
+  return (loss, per_example_loss, logits, probabilities)
+
+
+def layer_norm(input_tensor, name=None):
+  """Run layer normalization on the last dimension of the tensor."""
+  return tf.contrib.layers.layer_norm(
+      inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name)
+
+
+def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps, use_tpu,
+                     use_one_hot_embeddings):
+  """Returns `model_fn` closure for TPUEstimator."""
+
+  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
+    """The `model_fn` for TPUEstimator."""
+
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    label_ids = features["label_ids"]
+    is_real_example = None
+    if "is_real_example" in features:
+      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
+    else:
+      is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
+
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    (total_loss, per_example_loss, logits, probabilities) = create_model(
+        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
+        num_labels, use_one_hot_embeddings)
+
+    tvars = tf.trainable_variables()
+    initialized_variable_names = {}
+    scaffold_fn = None
+    if init_checkpoint:
+      (assignment_map, initialized_variable_names
+      ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+      if use_tpu:
+
+        def tpu_scaffold():
+          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+          return tf.train.Scaffold()
+
+        scaffold_fn = tpu_scaffold
+      else:
+        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+    tf.logging.info("**** Trainable Variables ****")
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                      init_string)
+
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
+
+      output_spec =
tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "sentence_pair": SentencePairClassificationProcessor, + "lcqmc_pair": LCQMCProcessor, + "lcqmc": LCQMCProcessor, + "tnews": TnewsProcessor, + "inews": InewsProcessor, + "xnli": XnliProcessor, + "thucnews": THUCNewsProcessor, + "bq": BQProcessor, + "iflydata": iFLYTEKDataProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. + print("###tpu_cluster_resolver:", tpu_cluster_resolver) + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO + print("###length of total train_examples:", len(train_examples)) + num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
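+  # Added note (not in the original patch): with use_tpu=False, TPUEstimator
+  # behaves like a plain Estimator, and the params["batch_size"] seen by each
+  # input_fn is taken from the train/eval/predict batch-size flags below.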
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + train_file_exists = os.path.exists(train_file) + print("###train_file_exists:", train_file_exists, " ;train_file:", train_file) + if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + # dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
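+    # e.g. (illustrative numbers, added note): 1,003 dev examples with
+    # eval_batch_size=8 were padded above to 1,008 (five PaddingInputExamples),
+    # so eval_steps = 1008 // 8 = 126 below.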
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    # ###################################################################
+    # Evaluate all checkpoints; you can use the checkpoint with the best
+    # dev accuracy. A file "model.ckpt-1000.index" yields ckpt_name
+    # "model.ckpt-1000" and global_step 1000.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_albert_zh.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    # ###################################################################
+
+    # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    # output_eval_file = os.path.join(FLAGS.output_dir, "dev_results_albert_zh.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+    # test dataset (note: this block reuses the eval_* variables)
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    file_based_convert_examples_to_features(
+        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
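+    # Added note (not in the original patch): padded examples carry
+    # is_real_example=0.0, so metric_fn weights them out of eval_accuracy and
+    # eval_loss; padding changes the step count but not the reported metrics.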
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_albert_zh.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    # output_eval_file = os.path.join(FLAGS.output_dir, "test_results_albert_zh.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
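+      # Note: the fake examples appended below are dropped again when
+      # test_results.tsv is written, because that loop breaks once
+      # i >= num_actual_predict_examples.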
+      while len(predict_examples) % FLAGS.predict_batch_size != 0:
+        predict_examples.append(PaddingInputExample())
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(predict_examples, label_list,
+                                                        FLAGS.max_seq_length, tokenizer,
+                                                        predict_file)
+    else:
+      file_based_convert_examples_to_features(predict_examples, label_list,
+                                              FLAGS.max_seq_length, tokenizer,
+                                              predict_file)
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(predict_examples), num_actual_predict_examples,
+                    len(predict_examples) - num_actual_predict_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_drop_remainder = True if FLAGS.use_tpu else False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      num_written_lines = 0
+      tf.logging.info("***** Predict results *****")
+      for (i, prediction) in enumerate(result):
+        probabilities = prediction["probabilities"]
+        if i >= num_actual_predict_examples:
+          break
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+        num_written_lines += 1
+    assert num_written_lines == num_actual_predict_examples
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/albert/run_classifier_bq.sh b/baselines/models/albert/run_classifier_bq.sh
new file mode 100755
index 0000000..14ea2cc
--- /dev/null
+++ b/baselines/models/albert/run_classifier_bq.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date: 2019-11-04 09:56:36
+# @Last Modified by: bo.shi
+# @Last Modified time: 2019-11-05 22:31:52
+
+TASK_NAME="bq"
+MODEL_NAME="albert_xlarge_zh"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config
+export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets
+
+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/bq.zip
+  unzip bq.zip
+  rm bq.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $ALBERT_XLARGE_DIR ]; then
+  mkdir -p $ALBERT_XLARGE_DIR
+  echo "makedir $ALBERT_XLARGE_DIR"
+fi
+cd $ALBERT_XLARGE_DIR
+if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then
+  rm *
+  wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip
+  unzip albert_xlarge_zh_177k.zip
+  rm albert_xlarge_zh_177k.zip
+else
+  echo "model exists"
+fi
+echo "Finish download model."
+
+# run task
+cd $CURRENT_DIR
+echo "Start running..."
+python run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$GLUE_DATA_DIR/$TASK_NAME \
+  --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \
+  --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \
+  --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$CURRENT_DIR/${TASK_NAME}_output/
diff --git a/baselines/models/albert/run_classifier_iflytek.sh b/baselines/models/albert/run_classifier_iflytek.sh
new file mode 100644
index 0000000..3478e07
--- /dev/null
+++ b/baselines/models/albert/run_classifier_iflytek.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date: 2019-11-04 09:56:36
+# @Last Modified by: bo.shi
+# @Last Modified time: 2019-11-05 20:24:06
+
+TASK_NAME="iflydata"
+MODEL_NAME="albert_xlarge_zh"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config
+export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets
+
+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip
+  unzip iflytek.zip
+  rm iflytek.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $ALBERT_XLARGE_DIR ]; then
+  mkdir -p $ALBERT_XLARGE_DIR
+  echo "makedir $ALBERT_XLARGE_DIR"
+fi
+cd $ALBERT_XLARGE_DIR
+if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then
+  rm *
+  wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip
+  unzip albert_xlarge_zh_177k.zip
+  rm albert_xlarge_zh_177k.zip
+else
+  echo "model exists"
+fi
+echo "Finish download model."
+
+# run task
+cd $CURRENT_DIR
+echo "Start running..."
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \ + --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \ + --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/albert/run_classifier_inews.sh b/baselines/models/albert/run_classifier_inews.sh new file mode 100755 index 0000000..2c7c684 --- /dev/null +++ b/baselines/models/albert/run_classifier_inews.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-05 20:24:11 + +TASK_NAME="inews" +MODEL_NAME="albert_xlarge_zh" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config +export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ALBERT_XLARGE_DIR ]; then + mkdir -p $ALBERT_XLARGE_DIR + echo "makedir $ALBERT_XLARGE_DIR" +fi +cd $ALBERT_XLARGE_DIR +if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip + unzip albert_xlarge_zh_177k.zip + rm albert_xlarge_zh_177k.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
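+# Note: for this task, run_classifier.py routes feature conversion through
+# file_based_convert_examples_to_features_for_inews (see the
+# task_name == "inews" branches earlier in this patch) rather than the
+# generic converter.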
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \ + --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \ + --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/albert/run_classifier_lcqmc.sh b/baselines/models/albert/run_classifier_lcqmc.sh new file mode 100755 index 0000000..42ec887 --- /dev/null +++ b/baselines/models/albert/run_classifier_lcqmc.sh @@ -0,0 +1,64 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 12:34:38 + +TASK_NAME="lcqmc" +MODEL_NAME="albert_xlarge_zh" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config +export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $ALBERT_XLARGE_DIR ]; then + mkdir -p $ALBERT_XLARGE_DIR + echo "makedir $ALBERT_XLARGE_DIR" +fi +cd $ALBERT_XLARGE_DIR +if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip + unzip albert_xlarge_zh_177k.zip + rm albert_xlarge_zh_177k.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
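+# Note: wget -c above resumes a partially downloaded file, so rerunning this
+# script after an interrupted fetch continues the download instead of
+# starting over (hence the "Please try again" hint printed before it).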
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \ + --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \ + --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/albert/run_classifier_thucnews.sh b/baselines/models/albert/run_classifier_thucnews.sh new file mode 100644 index 0000000..eebbbef --- /dev/null +++ b/baselines/models/albert/run_classifier_thucnews.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-05 20:24:16 + +TASK_NAME="thucnews" +MODEL_NAME="albert_xlarge_zh" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config +export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ALBERT_XLARGE_DIR ]; then + mkdir -p $ALBERT_XLARGE_DIR + echo "makedir $ALBERT_XLARGE_DIR" +fi +cd $ALBERT_XLARGE_DIR +if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip + unzip albert_xlarge_zh_177k.zip + rm albert_xlarge_zh_177k.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
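+# Note: THUCNews inputs are full news articles, typically far longer than
+# the max_seq_length=128 used below, so most documents are truncated; the
+# TPU variant of this script (tpu/run_classifier_thucnews.sh) raises the
+# limit to 512.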
+python run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$GLUE_DATA_DIR/$TASK_NAME \
+  --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \
+  --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \
+  --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$CURRENT_DIR/${TASK_NAME}_output/
diff --git a/baselines/models/albert/run_classifier_tnews.sh b/baselines/models/albert/run_classifier_tnews.sh
new file mode 100755
index 0000000..d536993
--- /dev/null
+++ b/baselines/models/albert/run_classifier_tnews.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date: 2019-11-04 09:56:36
+# @Last Modified by: bo.shi
+# @Last Modified time: 2019-11-05 22:35:38
+
+TASK_NAME="tnews"
+MODEL_NAME="albert_xlarge_zh"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config
+export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets
+
+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip
+  unzip tnews.zip
+  rm tnews.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $ALBERT_XLARGE_DIR ]; then
+  mkdir -p $ALBERT_XLARGE_DIR
+  echo "makedir $ALBERT_XLARGE_DIR"
+fi
+cd $ALBERT_XLARGE_DIR
+if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then
+  rm *
+  wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip
+  unzip albert_xlarge_zh_177k.zip
+  rm albert_xlarge_zh_177k.zip
+else
+  echo "model exists"
+fi
+echo "Finish download model."
+
+# run task
+cd $CURRENT_DIR
+echo "Start running..."
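+# A hypothetical follow-up once fine-tuning completes: run_classifier.py
+# also supports --do_predict, e.g.
+#   python run_classifier.py --task_name=$TASK_NAME --do_predict=true \
+#     --data_dir=$GLUE_DATA_DIR/$TASK_NAME --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \
+#     --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \
+#     --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \
+#     --max_seq_length=128 --output_dir=$CURRENT_DIR/${TASK_NAME}_output/
+# which writes one row of tab-separated class probabilities per test example
+# to test_results.tsv in the output directory.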
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \ + --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \ + --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/albert/run_classifier_xnli.sh b/baselines/models/albert/run_classifier_xnli.sh new file mode 100755 index 0000000..ac7c304 --- /dev/null +++ b/baselines/models/albert/run_classifier_xnli.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-05 23:34:52 + +TASK_NAME="xnli" +MODEL_NAME="albert_xlarge_zh" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export ALBERT_CONFIG_DIR=$CURRENT_DIR/albert_config +export ALBERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ALBERT_XLARGE_DIR=$ALBERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ALBERT_XLARGE_DIR ]; then + mkdir -p $ALBERT_XLARGE_DIR + echo "makedir $ALBERT_XLARGE_DIR" +fi +cd $ALBERT_XLARGE_DIR +if [ ! -f "albert_config_xlarge.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "albert_model.ckpt.index" ] || [ ! -f "albert_model.ckpt.meta" ] || [ ! -f "albert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/albert_zh/albert_xlarge_zh_177k.zip + unzip albert_xlarge_zh_177k.zip + rm albert_xlarge_zh_177k.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ALBERT_CONFIG_DIR/vocab.txt \ + --bert_config_file=$ALBERT_CONFIG_DIR/albert_config_xlarge.json \ + --init_checkpoint=$ALBERT_XLARGE_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/albert/run_pretraining.py b/baselines/models/albert/run_pretraining.py new file mode 100755 index 0000000..346d8bb --- /dev/null +++ b/baselines/models/albert/run_pretraining.py @@ -0,0 +1,501 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. " + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. 
Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(),model.get_embedding_table_2(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + print("init_checkpoint:",init_checkpoint) + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + 
next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + # next_sentence_example_loss=0.0 TODO + # next_sentence_log_probs=0.0 # TODO + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights,project_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. + with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + # logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + # input_tensor=[-1,hidden_size], project_weights=[embedding_size, hidden_size], project_weights_transpose=[hidden_size, embedding_size]--->[-1, embedding_size] + input_project = tf.matmul(input_tensor, project_weights, transpose_b=True) + logits = tf.matmul(input_project, output_weights, transpose_b=True) + # # input_project=[-1, embedding_size], output_weights=[vocab_size, embedding_size], output_weights_transpose=[embedding_size, vocab_size] ---> [-1, vocab_size] + + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. 
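+    # Worked example of the normalization below (illustrative numbers): with
+    # max_predictions_per_seq=20 and 17 real masked positions, label_weights
+    # sums to 17.0, so the loss is the sum of the 17 real per-position
+    # log-losses divided by (17.0 + 1e-5); the 1e-5 keeps the denominator
+    # non-zero even if a batch somehow contains only padding positions.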
+ per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. 
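+      # e.g. with num_cpu_threads=4 and 10 input files, cycle_length is 4,
+      # so four TFRecord files are read in parallel and their records
+      # interleaved; sloppy=True here (training) lets records be emitted in
+      # whatever order the parallel readers produce them.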
+      d = d.apply(
+          tf.contrib.data.parallel_interleave(
+              tf.data.TFRecordDataset,
+              sloppy=is_training,
+              cycle_length=cycle_length))
+      d = d.shuffle(buffer_size=100)
+    else:
+      d = tf.data.TFRecordDataset(input_files)
+      # Since we evaluate for a fixed number of steps we don't want to encounter
+      # out-of-range exceptions.
+      d = d.repeat()
+
+    # We must `drop_remainder` on training because the TPU requires fixed
+    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
+    # and we *don't* want to drop the remainder, otherwise we won't cover
+    # every sample.
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            num_parallel_batches=num_cpu_threads,
+            drop_remainder=True))
+    return d
+
+  return input_fn
+
+
+def _decode_record(record, name_to_features):
+  """Decodes a record to a TensorFlow example."""
+  example = tf.parse_single_example(record, name_to_features)
+
+  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+  # So cast all int64 to int32.
+  for name in list(example.keys()):
+    t = example[name]
+    if t.dtype == tf.int64:
+      t = tf.to_int32(t)
+    example[name] = t
+
+  return example
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if not FLAGS.do_train and not FLAGS.do_eval:  # must run training and/or evaluation
+    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)  # load the model configuration from the json file
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  input_files = []  # the input may be several comma-separated files, or a glob pattern such as "input_x*"
+  for input_pattern in FLAGS.input_file.split(","):
+    input_files.extend(tf.gfile.Glob(input_pattern))
+
+  tf.logging.info("*** Input Files ***")
+  for input_file in input_files:
+    tf.logging.info("  %s" % input_file)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  print("###tpu_cluster_resolver:", tpu_cluster_resolver, ";FLAGS.use_tpu:", FLAGS.use_tpu,
+        ";FLAGS.tpu_name:", FLAGS.tpu_name, ";FLAGS.tpu_zone:", FLAGS.tpu_zone)
+  # e.g. ###tpu_cluster_resolver: <...> ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      keep_checkpoint_max=20,
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=FLAGS.num_train_steps,
+      num_warmup_steps=FLAGS.num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      eval_batch_size=FLAGS.eval_batch_size)
+
+  if FLAGS.do_train:
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    train_input_fn = input_fn_builder(
+        input_files=input_files,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=True)
+    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
+
+  if FLAGS.do_eval:
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    eval_input_fn = input_fn_builder(
+        input_files=input_files,
+        max_seq_length=FLAGS.max_seq_length,
+        max_predictions_per_seq=FLAGS.max_predictions_per_seq,
+        is_training=False)
+
+    result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps)
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/albert/test_changes.py b/baselines/models/albert/test_changes.py
new file mode 100755
index 0000000..f5f1d2e
--- /dev/null
+++ b/baselines/models/albert/test_changes.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+import tensorflow as tf
+from modeling import embedding_lookup_factorized, transformer_model
+import os
+
+"""
+Test the main changes of ALBERT relative to BERT: factorized embedding
+parameterization, cross-layer parameter sharing, and sentence-order
+(inter-sentence coherence) prediction.
+"""
+batch_size = 2048
+sequence_length = 512
+vocab_size = 30000
+hidden_size = 1024
+num_attention_heads = int(hidden_size / 64)
+
+def get_total_parameters():
+    """
+    get total parameters of a graph
+    :return:
+    """
+    total_parameters = 0
+    for variable in tf.trainable_variables():
+        # shape is an array of tf.Dimension
+        shape = variable.get_shape()
+        variable_parameters = 1
+        for dim in shape:
+            variable_parameters *= dim.value
+        total_parameters += variable_parameters
+    return total_parameters
+
+def test_factorized_embedding():
+    """
+    test of factorized embedding parameterization
+    :return:
+    """
+    # Illustrative parameter arithmetic (the embedding size E is set inside
+    # embedding_lookup_factorized): a tied V*H table with V=30000 and H=1024
+    # costs ~30.7M parameters, while a factorization V*E + E*H with E=128
+    # costs ~3.97M.
+    input_ids = tf.zeros((batch_size, sequence_length), dtype=tf.int32)
+    output, embedding_table, embedding_table_2 = embedding_lookup_factorized(
+        input_ids, vocab_size, hidden_size)
+    print("output:", output)
+
+def test_share_parameters():
+    """
+    test of sharing parameters across all layers: how many parameters remain
+    after sharing them across the layers of the transformer.
+    :return:
+    """
+    def total_parameters_transformer(share_parameter_across_layers):
+        input_tensor = tf.zeros((batch_size, sequence_length, hidden_size), dtype=tf.float32)
+        print("transformer_model. input:", input_tensor)
+        transformer_result = transformer_model(
+            input_tensor, hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            share_parameter_across_layers=share_parameter_across_layers)
+        print("transformer_result:", transformer_result)
+        total_parameters = get_total_parameters()
+        print('total_parameters(share={0}):'.format(share_parameter_across_layers),
+              total_parameters)
+
+    share_parameter_across_layers = False
+    total_parameters_transformer(share_parameter_across_layers)  # total parameters, not shared: 125,976,576 (~126 million)
+
+    tf.reset_default_graph()  # Clears the default graph stack and resets the global default graph
+    share_parameter_across_layers = True
+    total_parameters_transformer(share_parameter_across_layers)  # total parameters, shared: 10,498,048 (~10.5 million)
+
+def test_sentence_order_prediction():
+    """
+    sentence order prediction.
+
+    Check the create_instances_from_document_albert method in
+    create_pretraining_data.py.
+    :return:
+    """
+    # add execute permission to the data-generation script, then run it
+    os.system("chmod +x create_pretrain_data.sh")
+    os.system("./create_pretrain_data.sh")
+
+
+# 1. test of factorized embedding parameterization
+# test_factorized_embedding()
+
+# 2. test of sharing parameters across all layers of the transformer:
+#    before sharing: 125,976,576 parameters; after sharing: 10,498,048
+# test_share_parameters()
+
+# 3. test of sentence order prediction (SOP)
+test_sentence_order_prediction()
+
diff --git a/baselines/models/albert/tokenization.py b/baselines/models/albert/tokenization.py
new file mode 100755
index 0000000..f7020e8
--- /dev/null
+++ b/baselines/models/albert/tokenization.py
@@ -0,0 +1,401 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python 2 and Python 3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
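+  # e.g. under Python 3, printable_text(b"\xe4\xb8\xad") decodes the bytes
+  # and returns the str "中", while printable_text("中") returns it unchanged.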
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  # items is the sequence of tokens (or ids) to map through the vocab,
+  # e.g. ['[CLS]', '日', '##期', ',', '但', '被', '##告', ..., '[SEP]'].
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia).
+    text = self._tokenize_chinese_chars(text)
+
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ["".join(x) for x in output]
+
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and are handled
+    # like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenization."""
+
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens.
+    """
+
+    text = convert_to_unicode(text)
+
+    output_tokens = []
+    for token in whitespace_tokenize(text):
+      chars = list(token)
+      if len(chars) > self.max_input_chars_per_word:
+        output_tokens.append(self.unk_token)
+        continue
+
+      is_bad = False
+      start = 0
+      sub_tokens = []
+      while start < len(chars):
+        end = len(chars)
+        cur_substr = None
+        while start < end:
+          substr = "".join(chars[start:end])
+          if start > 0:
+            substr = "##" + substr
+          if substr in self.vocab:
+            cur_substr = substr
+            break
+          end -= 1
+        if cur_substr is None:
+          is_bad = True
+          break
+        sub_tokens.append(cur_substr)
+        start = end
+
+      if is_bad:
+        output_tokens.append(self.unk_token)
+      else:
+        output_tokens.extend(sub_tokens)
+    return output_tokens
+
+
+def _is_whitespace(char):
+  """Checks whether `chars` is a whitespace character."""
+  # \t, \n, and \r are technically control characters but we treat them
+  # as whitespace since they are generally considered as such.
+  if char == " " or char == "\t" or char == "\n" or char == "\r":
+    return True
+  cat = unicodedata.category(char)
+  if cat == "Zs":
+    return True
+  return False
+
+
+def _is_control(char):
+  """Checks whether `chars` is a control character."""
+  # These are technically control characters but we count them as whitespace
+  # characters.
+  if char == "\t" or char == "\n" or char == "\r":
+    return False
+  cat = unicodedata.category(char)
+  if cat in ("Cc", "Cf"):
+    return True
+  return False
+
+
+def _is_punctuation(char):
+  """Checks whether `chars` is a punctuation character."""
+  cp = ord(char)
+  # We treat all non-letter/number ASCII as punctuation.
+  # Characters such as "^", "$", and "`" are not in the Unicode
+  # Punctuation class but we treat them as punctuation anyways, for
+  # consistency.
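+  # e.g. "$" (cp 36) falls in the 33-47 range and "`" (cp 96) in 91-96, so
+  # both are treated as punctuation here even though Unicode classifies them
+  # as symbols rather than "P"-class punctuation.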
+ if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models/albert/tpu/run_classifier_inews.sh b/baselines/models/albert/tpu/run_classifier_inews.sh new file mode 100755 index 0000000..76af9b1 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_inews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="inews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-xlarge/albert_xlarge_zh_183k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-xlarge/albert_xlarge_zh_183k/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_xlarge.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_inews_tiny.sh b/baselines/models/albert/tpu/run_classifier_inews_tiny.sh new file mode 100755 index 0000000..9e3071e --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_inews_tiny.sh @@ -0,0 +1,22 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="inews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert_tiny/albert_tiny_207k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert_tiny/tpu/${TASK_NAME}/$CURRENT_TIME + +python3 $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_tiny.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=6e-5 \ + --num_train_epochs=10.0 \ + --save_checkpoints_steps=600 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.240.1.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_lcqmc.sh b/baselines/models/albert/tpu/run_classifier_lcqmc.sh new file mode 100755 index 0000000..5572703 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_lcqmc.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="lcqmc" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-xlarge/albert_xlarge_zh_183k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-xlarge/albert_xlarge_zh_183k/tpu/${TASK_NAME}/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + 
--bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_xlarge.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=64 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.250.1.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_lcqmc_tiny.sh b/baselines/models/albert/tpu/run_classifier_lcqmc_tiny.sh new file mode 100755 index 0000000..3ef8cdc --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_lcqmc_tiny.sh @@ -0,0 +1,22 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="lcqmc" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-tiny/albert_tiny_489k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME} +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-tiny/tpu/${TASK_NAME}/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --do_predict=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_tiny.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=6e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://192.168.0.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_thucnews.sh b/baselines/models/albert/tpu/run_classifier_thucnews.sh new file mode 100755 index 0000000..a80fb8d --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_thucnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="thucnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-xlarge/albert_xlarge_zh_183k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-xlarge/albert_xlarge_zh_183k/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_xlarge.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.2.101.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_thucnews_tiny.sh b/baselines/models/albert/tpu/run_classifier_thucnews_tiny.sh new file mode 100755 index 0000000..cc2da55 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_thucnews_tiny.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="thucnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-tiny/albert_tiny_489k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-tiny/tpu/${TASK_NAME}/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR 
\ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_tiny.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=64 \ + --learning_rate=1e-4 \ + --num_train_epochs=5.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_tnews.sh b/baselines/models/albert/tpu/run_classifier_tnews.sh new file mode 100755 index 0000000..198c75f --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_tnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-xlarge/albert_xlarge_zh_183k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-xlarge/albert_xlarge_zh_183k/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_xlarge.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=6e-5 \ + --num_train_epochs=9.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.240.1.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_tnews_tiny.sh b/baselines/models/albert/tpu/run_classifier_tnews_tiny.sh new file mode 100755 index 0000000..31c6628 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_tnews_tiny.sh @@ -0,0 +1,22 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-tiny/albert_tiny_489k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1 +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-tiny/tpu/${TASK_NAME}/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --do_predict=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_tiny.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=6e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.20.0.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_xnli.sh b/baselines/models/albert/tpu/run_classifier_xnli.sh new file mode 100755 index 0000000..f3030c7 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_xnli.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-xlarge/albert_xlarge_zh_183k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-xlarge/albert_xlarge_zh_183k/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + 
--do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_xlarge.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=64 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.2.101.2:8470 diff --git a/baselines/models/albert/tpu/run_classifier_xnli_tiny.sh b/baselines/models/albert/tpu/run_classifier_xnli_tiny.sh new file mode 100755 index 0000000..83f5d35 --- /dev/null +++ b/baselines/models/albert/tpu/run_classifier_xnli_tiny.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/albert-tiny/albert_tiny_207k +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/albert-tiny/albert_tiny_207k/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/albert_config_tiny.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/albert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=64 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.2.101.2:8470 diff --git a/baselines/models/bert/.gitignore b/baselines/models/bert/.gitignore new file mode 100644 index 0000000..df9efad --- /dev/null +++ b/baselines/models/bert/.gitignore @@ -0,0 +1,116 @@ +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/baselines/models/bert/CONTRIBUTING.md b/baselines/models/bert/CONTRIBUTING.md
new file mode 100644
index 0000000..124b4b3
--- /dev/null
+++ b/baselines/models/bert/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/baselines/models/bert/LICENSE b/baselines/models/bert/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/baselines/models/bert/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/baselines/models/bert/__init__.py b/baselines/models/bert/__init__.py
new file mode 100644
index 0000000..effb57b
--- /dev/null
+++ b/baselines/models/bert/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/baselines/models/bert/conlleval.py b/baselines/models/bert/conlleval.py
new file mode 100644
index 0000000..8a8a75d
--- /dev/null
+++ b/baselines/models/bert/conlleval.py
@@ -0,0 +1,300 @@
+# Python version of the evaluation script from CoNLL'00
+# Originates from: https://github.com/spyysalo/conlleval.py
+
+
+# Intentional differences:
+# - accept any space as delimiter by default
+# - optional file argument (default STDIN)
+# - option to set boundary (-b argument)
+# - LaTeX output (-l argument) not supported
+# - raw tags (-r argument) not supported
+
+# Adds a function evaluate(predicted_label, ori_label) that does not read
+# from a file.
+
+import sys
+import re
+import codecs
+from collections import defaultdict, namedtuple
+
+ANY_SPACE = '<SPACE>'
+
+
+class FormatError(Exception):
+    pass
+
+Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
+
+
+class EvalCounts(object):
+    def __init__(self):
+        self.correct_chunk = 0    # number of correctly identified chunks
+        self.correct_tags = 0     # number of correct chunk tags
+        self.found_correct = 0    # number of chunks in corpus
+        self.found_guessed = 0    # number of identified chunks
+        self.token_counter = 0    # token counter (ignores sentence breaks)
+
+        # counts by type
+        self.t_correct_chunk = defaultdict(int)
+        self.t_found_correct = defaultdict(int)
+        self.t_found_guessed = defaultdict(int)
+
+
+def parse_args(argv):
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='evaluate tagging results using CoNLL criteria',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    arg = parser.add_argument
+    arg('-b', '--boundary', metavar='STR', default='-X-',
+        help='sentence boundary')
+    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
+        help='character delimiting items in input')
+    arg('-o', '--otag', metavar='CHAR', default='O',
+        help='alternative outside tag')
+    arg('file', nargs='?', default=None)
+    return parser.parse_args(argv)
+
+
+def parse_tag(t):
+    m = re.match(r'^([^-]*)-(.*)$', t)
+    return m.groups() if m else (t, '')
+
+
+def evaluate(iterable, options=None):
+    if options is None:
+        options = parse_args([])    # use defaults
+
+    counts = EvalCounts()
+    num_features = None       # number of features per line
+    in_correct = False        # whether the currently processed chunk is correct so far
+    last_correct = 'O'        # previous chunk tag in corpus
+    last_correct_type = ''    # type of previous chunk tag in corpus
+    last_guessed = 'O'        # previously identified chunk tag
+    last_guessed_type = ''    # type of previously identified chunk tag
+
+    for line in iterable:
+        line = line.rstrip('\r\n')
+
+        if options.delimiter == ANY_SPACE:
+            features = line.split()
+        else:
+            features = line.split(options.delimiter)
+
+        if
num_features is None: + num_features = len(features) + elif num_features != len(features) and len(features) != 0: + raise FormatError('unexpected number of features: %d (%d)' % + (len(features), num_features)) + + if len(features) == 0 or features[0] == options.boundary: + features = [options.boundary, 'O', 'O'] + if len(features) < 3: + raise FormatError('unexpected number of features in line %s' % line) + + guessed, guessed_type = parse_tag(features.pop()) + correct, correct_type = parse_tag(features.pop()) + first_item = features.pop(0) + + if first_item == options.boundary: + guessed = 'O' + + end_correct = end_of_chunk(last_correct, correct, + last_correct_type, correct_type) + end_guessed = end_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + start_correct = start_of_chunk(last_correct, correct, + last_correct_type, correct_type) + start_guessed = start_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + + if in_correct: + if (end_correct and end_guessed and + last_guessed_type == last_correct_type): + in_correct = False + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + elif (end_correct != end_guessed or guessed_type != correct_type): + in_correct = False + + if start_correct and start_guessed and guessed_type == correct_type: + in_correct = True + + if start_correct: + counts.found_correct += 1 + counts.t_found_correct[correct_type] += 1 + if start_guessed: + counts.found_guessed += 1 + counts.t_found_guessed[guessed_type] += 1 + if first_item != options.boundary: + if correct == guessed and guessed_type == correct_type: + counts.correct_tags += 1 + counts.token_counter += 1 + + last_guessed = guessed + last_correct = correct + last_guessed_type = guessed_type + last_correct_type = correct_type + + if in_correct: + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % (100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, 
by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/bert/create_pretraining_data.py b/baselines/models/bert/create_pretraining_data.py new file mode 100644 index 0000000..5340d96 --- /dev/null +++ b/baselines/models/bert/create_pretraining_data.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == 
max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
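+  # For example (illustrative content), a two-document input file:
+  #
+  #   The first sentence of document one.
+  #   The second sentence of document one.
+  #
+  #   The only sentence of document two.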
+ for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
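+          # E.g. if `current_chunk` holds 5 segments and `a_end` is 2, `i` is
+          # rewound by 3 so the loop revisits the 3 unused segments.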
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
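+    # E.g. with num_to_predict == 20 and 19 predictions already chosen, a
+    # whole-word candidate spanning two WordPieces would overshoot the budget
+    # and is skipped rather than partially masked.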
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/baselines/models/bert/extract_features.py b/baselines/models/bert/extract_features.py new file mode 100644 index 0000000..60e3830 --- /dev/null +++ b/baselines/models/bert/extract_features.py @@ -0,0 +1,419 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract pre-computed feature vectors from BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import codecs +import collections +import json +import re + +import modeling +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "") + +flags.DEFINE_string("output_file", None, "") + +flags.DEFINE_string("layers", "-1,-2,-3,-4", "") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("master", None, + "If using a TPU, the address of the master.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_one_hot_embeddings", False, + "If True, tf.one_hot will be used for embedding lookups, otherwise " + "tf.nn.embedding_lookup will be used. On TPUs, this should be True " + "since it is much faster.") + + +class InputExample(object): + + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids + + +def input_fn_builder(features, seq_length): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] + + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
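+    # A rough sketch of the TFRecord approach (illustrative only; helper
+    # names are assumed, not part of this file) would replace the in-memory
+    # slices below with something like:
+    #
+    #   def _parse(record):
+    #     return tf.parse_single_example(record, {
+    #         "unique_ids": tf.FixedLenFeature([], tf.int64),
+    #         "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+    #         "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+    #         "input_type_ids": tf.FixedLenFeature([seq_length], tf.int64),
+    #     })
+    #
+    #   d = tf.data.TFRecordDataset(filenames).map(_parse)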
+ d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) + + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d + + return input_fn + + +def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] + + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, + initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + all_layers = model.get_all_encoder_layers() + + predictions = { + "unique_id": unique_ids, + } + + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +def convert_examples_to_features(examples, seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. 
The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
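+  # E.g. with max_length == 8, a pair of lengths (10, 4) is reduced to (4, 4):
+  # tokens are popped from the longer side until the total fits.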
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def read_examples(input_file): + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + examples = read_examples(FLAGS.input_file) + + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + predict_batch_size=FLAGS.batch_size) + + input_fn = input_fn_builder( + features=features, seq_length=FLAGS.max_seq_length) + + with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, + "w")) as writer: + for result in estimator.predict(input_fn, yield_single_examples=True): + unique_id = int(result["unique_id"]) + feature = unique_id_to_feature[unique_id] + output_json = collections.OrderedDict() + output_json["linex_index"] = unique_id + all_features = [] + for (i, token) in enumerate(feature.tokens): + all_layers = [] + for (j, layer_index) in enumerate(layer_indexes): + layer_output = result["layer_output_%d" % j] + layers = collections.OrderedDict() + layers["index"] = layer_index + layers["values"] = [ + round(float(x), 6) for x in layer_output[i:(i + 1)].flat + ] + all_layers.append(layers) + features = collections.OrderedDict() + features["token"] = token + features["layers"] = all_layers + all_features.append(features) + output_json["features"] = all_features + writer.write(json.dumps(output_json) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("init_checkpoint") + flags.mark_flag_as_required("output_file") + tf.app.run() diff --git a/baselines/models/bert/modeling.py b/baselines/models/bert/modeling.py new file mode 100644 index 0000000..fed5259 --- /dev/null +++ b/baselines/models/bert/modeling.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. 
+ hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. 
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. 
+ """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
+ initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if " + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length].
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
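+ # For example, a masked (padding) position gets adder = -10000.0, so after + # the softmax below its attention probability is effectively zero, while an + # attended position gets adder = 0.0 and keeps its original score.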
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+ + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, an exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) diff --git a/baselines/models/bert/modeling_test.py b/baselines/models/bert/modeling_test.py new file mode 100644 index 0000000..817ad2d --- /dev/null +++ b/baselines/models/bert/modeling_test.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +import modeling +import six +import tensorflow as tf + + +class BertModelTest(tf.test.TestCase): + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) + + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def run_tester(self, tester): + with 
self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) + + self.assert_all_tensors_reachable(sess, [init_op, ops]) + + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph + + ignore_strings = [ + "^.*/assert_less_equal/.*$", + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] + + ignore_regexes = [re.compile(x) for x in ignore_strings] + + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(x.name) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable + + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) + + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) + + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) + + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) + + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True + + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) + + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) + + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) + + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops + + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif 
isinstance(item, tuple): + output.extend(list(item)) + elif isinstance(item, dict): + for (_, v) in six.iteritems(item): + output.append(v) + else: + return [item] + + flat_output = [] + for x in output: + flat_output.extend(cls.flatten_recursive(x)) + return flat_output + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/bert/multilingual.md b/baselines/models/bert/multilingual.md new file mode 100644 index 0000000..3b38379 --- /dev/null +++ b/baselines/models/bert/multilingual.md @@ -0,0 +1,303 @@ +## Models + +There are two multilingual models currently available. We do not plan to release +more single-language models, but we may release `BERT-Large` versions of these +two in the future: + +* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**: + 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**: + 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**: + Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M + parameters + +**The `Multilingual Cased (New)` model also fixes normalization issues in many +languages, so it is recommended in languages with non-Latin alphabets (and is +often better for most languages with Latin alphabets). When using this model, +make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other +scripts.** + +See the [list of languages](#list-of-languages) that the Multilingual model +supports. The Multilingual model does include Chinese (and English), but if your +fine-tuning data is Chinese-only, then the Chinese model will likely produce +better results. + +## Results + +To evaluate these systems, we use the +[XNLI dataset](https://github.com/facebookresearch/XNLI) dataset, which is a +version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the +dev and test sets have been translated (by humans) into 15 languages. Note that +the training set was *machine* translated (we used the translations provided by +XNLI, not Google NMT). For clarity, we only report on 6 languages below: + + + +| System | English | Chinese | Spanish | German | Arabic | Urdu | +| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | +| XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 | +| XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 | +| BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 | +| BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 | +| BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** | +| BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 | + + + +The first two rows are baselines from the XNLI paper and the last three rows are +our results with BERT. + +**Translate Train** means that the MultiNLI training set was machine translated +from English into the foreign language. So training and evaluation were both +done in the foreign language. 
Unfortunately, training was done on + machine-translated data, so it is impossible to quantify how much of the lower + accuracy (compared to English) is due to the quality of the machine translation + vs. the quality of the pre-trained model. + + **Translate Test** means that the XNLI test set was machine translated from the + foreign language into English. So training and evaluation were both done on + English. However, test evaluation was done on machine-translated English, so the + accuracy depends on the quality of the machine translation system. + + **Zero Shot** means that the Multilingual BERT system was fine-tuned on English + MultiNLI, and then evaluated on the foreign language XNLI test. In this case, + machine translation was not involved at all in either the pre-training or + fine-tuning. + + Note that the English result is worse than the 84.2 MultiNLI baseline because + this training used Multilingual BERT rather than English-only BERT. This implies + that for high-resource languages, the Multilingual model is somewhat worse than + a single-language model. However, it is not feasible for us to train and + maintain dozens of single-language models. Therefore, if your goal is to maximize + performance with a language other than English or Chinese, you might find it + beneficial to run pre-training for additional steps starting from our + Multilingual model on data from your language of interest. + + Here is a comparison of training Chinese models with the Multilingual + `BERT-Base` and Chinese-only `BERT-Base`: + + System | Chinese + ----------------------- | ------- + XNLI Baseline | 67.0 + BERT Multilingual Model | 74.2 + BERT Chinese-only Model | 77.2 + + Similar to English, the single-language model does 3% better than the + Multilingual model. + + ## Fine-tuning Example + + The multilingual model does **not** require any special consideration or API + changes. We did update the implementation of `BasicTokenizer` in + `tokenization.py` to support Chinese character tokenization, so please update if + you forked it. However, we did not change the tokenization API. + + To test the new models, we did modify `run_classifier.py` to add support for the + [XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language + version of MultiNLI where the dev/test sets have been human-translated, and the + training set has been machine-translated. + + To run the fine-tuning code, please download the + [XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the + [XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip) + and then unpack both .zip files into some directory `$XNLI_DIR`. + + To run fine-tuning on XNLI, note that the language is hard-coded into + `run_classifier.py` (Chinese by default), so please modify `XnliProcessor` if + you want to run on another language. + + This is a large dataset, so training will take a few hours on a GPU + (or about 30 minutes on a Cloud TPU). To run an experiment quickly for + debugging, just set `num_train_epochs` to a small value like `0.1`.
+ +```shell +export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12 +export XNLI_DIR=/path/to/xnli + +python run_classifier.py \ + --task_name=XNLI \ + --do_train=true \ + --do_eval=true \ + --data_dir=$XNLI_DIR \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=5e-5 \ + --num_train_epochs=2.0 \ + --output_dir=/tmp/xnli_output/ +``` + +With the Chinese-only model, the results should look something like this: + +``` + ***** Eval results ***** +eval_accuracy = 0.774116 +eval_loss = 0.83554 +global_step = 24543 +loss = 0.74603 +``` + +## Details + +### Data Source and Sampling + +The languages chosen were the +[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias). +The entire Wikipedia dump for each language (excluding user and talk pages) was +taken as the training data for each language + +However, the size of the Wikipedia for a given language varies greatly, and +therefore low-resource languages may be "under-represented" in terms of the +neural network model (under the assumption that languages are "competing" for +limited model capacity to some extent). At the same time, we also don't want +to overfit the model by performing thousands of epochs over a tiny Wikipedia +for a particular language. + +To balance these two factors, we performed exponentially smoothed weighting of +the data during pre-training data creation (and WordPiece vocab creation). In +other words, let's say that the probability of a language is *P(L)*, e.g., +*P(English) = 0.21* means that after concatenating all of the Wikipedias +together, 21% of our data is English. We exponentiate each probability by some +factor *S* and then re-normalize, and sample from that distribution. In our case +we use *S=0.7*. So, high-resource languages like English will be under-sampled, +and low-resource languages like Icelandic will be over-sampled. E.g., in the +original distribution English would be sampled 1000x more than Icelandic, but +after smoothing it's only sampled 100x more. + +### Tokenization + +For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are +weighted the same way as the data, so low-resource languages are upweighted by +some factor. We intentionally do *not* use any marker to denote the input +language (so that zero-shot training can work). + +Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace +characters, we add spaces around every character in the +[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\)) +before applying WordPiece. This means that Chinese is effectively +character-tokenized. Note that the CJK Unicode block only includes +Chinese-origin characters and does *not* include Hangul Korean or +Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like +all other languages. + +For all other languages, we apply the +[same recipe as English](https://github.com/google-research/bert#tokenization): +(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace +tokenization. We understand that accent markers have substantial meaning in some +languages, but felt that the benefits of reducing the effective vocabulary make +up for this. 
Generally the strong contextual models of BERT should make up for +any ambiguity introduced by stripping accent markers. + +### List of Languages + +The multilingual model supports the following languages. These languages were +chosen because they are the top 100 languages with the largest Wikipedias: + +* Afrikaans +* Albanian +* Arabic +* Aragonese +* Armenian +* Asturian +* Azerbaijani +* Bashkir +* Basque +* Bavarian +* Belarusian +* Bengali +* Bishnupriya Manipuri +* Bosnian +* Breton +* Bulgarian +* Burmese +* Catalan +* Cebuano +* Chechen +* Chinese (Simplified) +* Chinese (Traditional) +* Chuvash +* Croatian +* Czech +* Danish +* Dutch +* English +* Estonian +* Finnish +* French +* Galician +* Georgian +* German +* Greek +* Gujarati +* Haitian +* Hebrew +* Hindi +* Hungarian +* Icelandic +* Ido +* Indonesian +* Irish +* Italian +* Japanese +* Javanese +* Kannada +* Kazakh +* Kirghiz +* Korean +* Latin +* Latvian +* Lithuanian +* Lombard +* Low Saxon +* Luxembourgish +* Macedonian +* Malagasy +* Malay +* Malayalam +* Marathi +* Minangkabau +* Nepali +* Newar +* Norwegian (Bokmal) +* Norwegian (Nynorsk) +* Occitan +* Persian (Farsi) +* Piedmontese +* Polish +* Portuguese +* Punjabi +* Romanian +* Russian +* Scots +* Serbian +* Serbo-Croatian +* Sicilian +* Slovak +* Slovenian +* South Azerbaijani +* Spanish +* Sundanese +* Swahili +* Swedish +* Tagalog +* Tajik +* Tamil +* Tatar +* Telugu +* Turkish +* Ukrainian +* Urdu +* Uzbek +* Vietnamese +* Volapük +* Waray-Waray +* Welsh +* West Frisian +* Western Punjabi +* Yoruba + +The **Multilingual Cased (New)** release contains additionally **Thai** and +**Mongolian**, which were not included in the original release. diff --git a/baselines/models/bert/optimization.py b/baselines/models/bert/optimization.py new file mode 100644 index 0000000..d33dabd --- /dev/null +++ b/baselines/models/bert/optimization.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
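+ # For example, with init_lr=5e-5 and num_warmup_steps=1000, step 100 trains + # with lr = (100/1000) * 5e-5 = 5e-6; once global_step reaches 1000, the + # linearly decayed rate computed above is used instead.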
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine-tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
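+      # Concretely, the update applied below is
+      #   param <- param - learning_rate * (adam_update + weight_decay_rate * param)
+      # i.e. decoupled weight decay in the style of AdamW
+      # (Loshchilov & Hutter, "Decoupled Weight Decay Regularization").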
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/bert/optimization_test.py b/baselines/models/bert/optimization_test.py new file mode 100644 index 0000000..4f2dcf1 --- /dev/null +++ b/baselines/models/bert/optimization_test.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import optimization +import tensorflow as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb b/baselines/models/bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb new file mode 100644 index 0000000..466857f --- /dev/null +++ b/baselines/models/bert/predicting_movie_reviews_with_bert_on_tf_hub.ipynb @@ -0,0 +1,1231 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "j0a4mTk9o1Qg", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Copyright 2019 Google Inc.\n", + "\n", + "# Licensed under the Apache License, Version 2.0 (the 
\"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dCpvgG0vwXAZ", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Predicting Movie Review Sentiment with BERT on TF Hub" + ] + }, + { + "metadata": { + "id": "xiYrZKaHwV81", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n", + "\n", + "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n", + "\n", + "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!" + ] + }, + { + "metadata": { + "id": "hsZvic2YxnTz", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "from datetime import datetime" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "cp5wfXDx5SPH", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "In addition to the standard libraries we imported above, we'll need to install BERT's python package." 
+ ] + }, + { + "metadata": { + "id": "jviywGyWyKsA", + "colab_type": "code", + "outputId": "166f3005-d219-404f-b201-2a0b75480360", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + } + }, + "cell_type": "code", + "source": [ + "!pip install bert-tensorflow" + ], + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.6/dist-packages (1.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.11.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "hhbGEfwgdEtw", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import bert\n", + "from bert import run_classifier\n", + "from bert import optimization\n", + "from bert import tokenization" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "KVB3eOcjxxm1", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n", + "\n", + "Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n", + "\n", + "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)." + ] + }, + { + "metadata": { + "id": "US_EAnICvP7f", + "colab_type": "code", + "outputId": "7780a032-31d4-4794-e6aa-664a5d2ae7dd", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "# Set the output directory for saving model file\n", + "# Optionally, set a GCP bucket location\n", + "\n", + "OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:\"string\"}\n", + "#@markdown Whether or not to clear/delete the directory and create a new one\n", + "DO_DELETE = False #@param {type:\"boolean\"}\n", + "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n", + "USE_BUCKET = True #@param {type:\"boolean\"}\n", + "BUCKET = 'BUCKET_NAME' #@param {type:\"string\"}\n", + "\n", + "if USE_BUCKET:\n", + " OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + "\n", + "if DO_DELETE:\n", + " try:\n", + " tf.gfile.DeleteRecursively(OUTPUT_DIR)\n", + " except:\n", + " # Doesn't matter if the directory didn't exist\n", + " pass\n", + "tf.gfile.MakeDirs(OUTPUT_DIR)\n", + "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "***** Model output directory: gs://bert-tfhub/aclImdb_v1 *****\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "pmFYvkylMwXn", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Data" + ] + }, + { + "metadata": { + "id": "MC_w8SRqN0fr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "First, let's download the dataset, hosted by Stanford. 
The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)." + ] + }, + { + "metadata": { + "id": "fom_ff20gyy6", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from tensorflow import keras\n", + "import os\n", + "import re\n", + "\n", + "# Load all files from a directory in a DataFrame.\n", + "def load_directory_data(directory):\n", + " data = {}\n", + " data[\"sentence\"] = []\n", + " data[\"sentiment\"] = []\n", + " for file_path in os.listdir(directory):\n", + " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", + " data[\"sentence\"].append(f.read())\n", + " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", + " return pd.DataFrame.from_dict(data)\n", + "\n", + "# Merge positive and negative examples, add a polarity column and shuffle.\n", + "def load_dataset(directory):\n", + " pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n", + " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", + " pos_df[\"polarity\"] = 1\n", + " neg_df[\"polarity\"] = 0\n", + " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", + "\n", + "# Download and process the dataset files.\n", + "def download_and_load_datasets(force_download=False):\n", + " dataset = tf.keras.utils.get_file(\n", + " fname=\"aclImdb.tar.gz\", \n", + " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", + " extract=True)\n", + " \n", + " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"train\"))\n", + " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"test\"))\n", + " \n", + " return train_df, test_df\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "2abfwdn-g135", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "train, test = download_and_load_datasets()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "XA8WHJgzhIZf", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "To keep training fast, we'll take a sample of 5000 train and test examples, respectively." 
+      ]
+    },
+    {
+      "metadata": {
+        "id": "lw_F488eixTV",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "train = train.sample(5000)\n",
+        "test = test.sample(5000)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "prRQM8pDi8xI",
+        "colab_type": "code",
+        "outputId": "34445cb8-2be0-4379-fdbc-7794091f6049",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "train.columns"
+      ],
+      "execution_count": 44,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "Index(['sentence', 'sentiment', 'polarity'], dtype='object')"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 44
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sfRnHSz3iSXz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively)"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IuMOGwFui4it",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "DATA_COLUMN = 'sentence'\n",
+        "LABEL_COLUMN = 'polarity'\n",
+        "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
+        "label_list = [0, 1]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "V399W0rqNJ-Z",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "#Data Preprocessing\n",
+        "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n",
+        "\n",
+        "- `text_a` is the text we want to classify, which in this case is the `sentence` column in our DataFrame. \n",
+        "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n",
+        "- `label` is the label for our example, i.e. True, False"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "p9gEt5SmM6i6",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
+        "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
+        "\n",
+        "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "SCZWZtKxObjh",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n",
+        "\n",
+        "\n",
+        "1. Lowercase our text (if we're using a BERT lowercase model)\n",
+        "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
+        "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
+        "4. Map our words to indexes using a vocab file that BERT provides\n",
+        "5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n",
+        "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n",
+        "\n",
+        "Happily, we don't have to worry about most of these details.\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qMWiDtpyQSoU",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IhJSe0QHNG7U",
+        "colab_type": "code",
+        "outputId": "20b28cc7-3cb3-4ce6-bfff-a7847ce3bbaa",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# This is a path to an uncased (all lowercase) version of BERT\n",
+        "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
+        "\n",
+        "def create_tokenizer_from_hub_module():\n",
+        "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
+        "  with tf.Graph().as_default():\n",
+        "    bert_module = hub.Module(BERT_MODEL_HUB)\n",
+        "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
+        "    with tf.Session() as sess:\n",
+        "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
+        "                                            tokenization_info[\"do_lower_case\"]])\n",
+        "      \n",
+        "  return bert.tokenization.FullTokenizer(\n",
+        "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+        "\n",
+        "tokenizer = create_tokenizer_from_hub_module()"
+      ],
+      "execution_count": 47,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "z4oFkhpZBDKm",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Great--we just learned that the BERT model we're using expects lowercase data (that's what's stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "dsBo6RCtQmwx",
+        "colab_type": "code",
+        "outputId": "9af8c917-90ec-4fe9-897b-79dc89ca88e1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 221
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")"
+      ],
+      "execution_count": 48,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "['this',\n",
+              " 'here',\n",
+              " \"'\",\n",
+              " 's',\n",
+              " 'an',\n",
+              " 'example',\n",
+              " 'of',\n",
+              " 'using',\n",
+              " 'the',\n",
+              " 'bert',\n",
+              " 'token',\n",
+              " '##izer']"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 48
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0OEzfFIt6GIc",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands."
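+        ,"\n",
+        "\n",
+        "Concretely, each feature holds three aligned, fixed-length token lists plus a label id (a quick sketch, using the `InputFeatures` fields from the `bert-tensorflow` package; `train_features` is created in the next cell):\n",
+        "\n",
+        "```python\n",
+        "f = train_features[0]\n",
+        "len(f.input_ids) == len(f.input_mask) == len(f.segment_ids) == 128  # MAX_SEQ_LENGTH\n",
+        "f.label_id  # 0 or 1\n",
+        "```"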
+ ] + }, + { + "metadata": { + "id": "LL5W8gEGRTAf", + "colab_type": "code", + "outputId": "65001dda-155b-48fc-b5fc-1e4cabc8dfbf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1261 + } + }, + "cell_type": "code", + "source": [ + "# We'll set sequences to be at most 128 tokens long.\n", + "MAX_SEQ_LENGTH = 128\n", + "# Convert our train and test features to InputFeatures that BERT understands.\n", + "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" + ], + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i ' m watching this on the sci - fi channel right now . it ' s so horrible i can ' t stop watching it ! i ' m a video ##grapher and this movie makes me sad . i feel bad for anyone associated with this movie . some of the camera work is good . most is very questionable . there are a few decent actors in the flick . too bad they ' re surrounded by what must have been the director ' s relatives . that ' s the only way they could have been qualified to be in a movie ! music was a little better than the acting . if you get around to watching this i hope it [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 1005 1049 3666 2023 2006 1996 16596 1011 10882 3149 2157 2085 1012 2009 1005 1055 2061 9202 1045 2064 1005 1056 2644 3666 2009 999 1045 1005 1049 1037 2678 18657 1998 2023 3185 3084 2033 6517 1012 1045 2514 2919 2005 3087 3378 2007 2023 3185 1012 2070 1997 1996 4950 2147 2003 2204 1012 2087 2003 2200 21068 1012 2045 2024 1037 2261 11519 5889 1999 1996 17312 1012 2205 2919 2027 1005 2128 5129 2011 2054 2442 2031 2042 1996 2472 1005 1055 9064 1012 2008 1005 1055 1996 2069 2126 2027 2071 2031 2042 4591 2000 2022 1999 1037 3185 999 2189 2001 1037 2210 2488 2084 1996 3772 1012 2065 2017 2131 2105 2000 3666 2023 1045 3246 2009 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i have been a fan of pushing dai ##sies since the very beginning . it is wonderful ##ly thought up , and bryan fuller has the most remarkable ideas for this show . < br / > < br / > it is unbelievable on how much tv has been needing a creative , original show like pushing dai ##sies . it is a huge relief to see a show , that is unlike the rest , where as , if you compared it to some of the newer shows , such as scrub ##s and house , you would see the similarities , and it does get ted ##ious at moments to see shows so close in identity . 
< br / > < br [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2031 2042 1037 5470 1997 6183 18765 14625 2144 1996 2200 2927 1012 2009 2003 6919 2135 2245 2039 1010 1998 8527 12548 2038 1996 2087 9487 4784 2005 2023 2265 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 2003 23653 2006 2129 2172 2694 2038 2042 11303 1037 5541 1010 2434 2265 2066 6183 18765 14625 1012 2009 2003 1037 4121 4335 2000 2156 1037 2265 1010 2008 2003 4406 1996 2717 1010 2073 2004 1010 2065 2017 4102 2009 2000 2070 1997 1996 10947 3065 1010 2107 2004 18157 2015 1998 2160 1010 2017 2052 2156 1996 12319 1010 1998 2009 2515 2131 6945 6313 2012 5312 2000 2156 3065 2061 2485 1999 4767 1012 1026 7987 1013 1028 1026 7987 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] this movie starts out promising ##ly , with an early scene in which frank morgan advises against gary cooper ' s marriage to his daughter , anita louise . frank morgan , playing an una ##bas ##hed gold - digger , loudly complain ##s to cooper about his perceived pen ##ury at the hands of his family - including his daughter , anita louise . i am a fan of all 3 actors . frank morgan is ( to my mind ) a hollywood treasure , cooper a legend , and louise a very lovely , versatile and under - appreciated actress seldom seen in the leading role . i also have nothing against teresa wright , and while not blessed with great range , she [SEP]\n", + "INFO:tensorflow:input_ids: 101 2023 3185 4627 2041 10015 2135 1010 2007 2019 2220 3496 1999 2029 3581 5253 25453 2114 5639 6201 1005 1055 3510 2000 2010 2684 1010 12918 8227 1012 3581 5253 1010 2652 2019 14477 22083 9072 2751 1011 28661 1010 9928 17612 2015 2000 6201 2055 2010 8690 7279 13098 2012 1996 2398 1997 2010 2155 1011 2164 2010 2684 1010 12918 8227 1012 1045 2572 1037 5470 1997 2035 1017 5889 1012 3581 5253 2003 1006 2000 2026 2568 1007 1037 5365 8813 1010 6201 1037 5722 1010 1998 8227 1037 2200 8403 1010 22979 1998 2104 1011 12315 3883 15839 2464 1999 1996 2877 2535 1012 1045 2036 2031 2498 2114 12409 6119 1010 1998 2096 2025 10190 2007 2307 2846 1010 2016 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i was over ##taken by the emotion . 
un ##for ##get ##table rendering of a wartime story which is unknown to most people . the performances were fault ##less and outstanding . [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2001 2058 25310 2011 1996 7603 1012 4895 29278 18150 10880 14259 1997 1037 12498 2466 2029 2003 4242 2000 2087 2111 1012 1996 4616 2020 6346 3238 1998 5151 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] soldier blue is a movie with pre ##tension ##s : pre ##tension ##s to be some sort of profound statement on man ' s inhuman ##ity to man , on the white man ' s exploitation of and brutality towards indigenous peoples ; a biting , un ##fl ##in ##ching and sar ##don ##ic commentary on the horrors of vietnam . well , sorry , but it fails mis ##era ##bly to be any of those things . what soldier blue actually is is per ##nic ##ious , tri ##te , badly made , dish ##ones ##t rubbish . < br / > < br / > another reviewer here hit the nail on the head in saying that it appears to be a hybrid of [SEP]\n", + "INFO:tensorflow:input_ids: 101 5268 2630 2003 1037 3185 2007 3653 29048 2015 1024 3653 29048 2015 2000 2022 2070 4066 1997 13769 4861 2006 2158 1005 1055 29582 3012 2000 2158 1010 2006 1996 2317 2158 1005 1055 14427 1997 1998 24083 2875 6284 7243 1025 1037 12344 1010 4895 10258 2378 8450 1998 18906 5280 2594 8570 2006 1996 22812 1997 5148 1012 2092 1010 3374 1010 2021 2009 11896 28616 6906 6321 2000 2022 2151 1997 2216 2477 1012 2054 5268 2630 2941 2003 2003 2566 8713 6313 1010 13012 2618 1010 6649 2081 1010 9841 21821 2102 29132 1012 1026 7987 1013 1028 1026 7987 1013 1028 2178 12027 2182 2718 1996 13774 2006 1996 2132 1999 3038 2008 2009 3544 2000 2022 1037 8893 1997 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i just watched this today on tv . it was on abc ' s sunday afternoon movie . < br / > < br / > this wasn ' t a very good movie , but for a low budget independent film like this , it was okay . 
there is some suspense in it , but there are so many bad qualities that really bring the movie down . the script is pretty lame , and the plot elements aren ' t very realistic , such as the way a 911 operator would laugh and hang up when someone is reporting a murder . i don ' t know what the writer was thinking when they came up with that idea , but it isn [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2074 3427 2023 2651 2006 2694 1012 2009 2001 2006 5925 1005 1055 4465 5027 3185 1012 1026 7987 1013 1028 1026 7987 1013 1028 2023 2347 1005 1056 1037 2200 2204 3185 1010 2021 2005 1037 2659 5166 2981 2143 2066 2023 1010 2009 2001 3100 1012 2045 2003 2070 23873 1999 2009 1010 2021 2045 2024 2061 2116 2919 11647 2008 2428 3288 1996 3185 2091 1012 1996 5896 2003 3492 20342 1010 1998 1996 5436 3787 4995 1005 1056 2200 12689 1010 2107 2004 1996 2126 1037 19989 6872 2052 4756 1998 6865 2039 2043 2619 2003 7316 1037 4028 1012 1045 2123 1005 1056 2113 2054 1996 3213 2001 3241 2043 2027 2234 2039 2007 2008 2801 1010 2021 2009 3475 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] from hardly alien sounding lasers , to an elementary school style shuttle crash , \" night ##be ##ast \" is better classified as a far ##cic ##al mix of fake blood and bare chest . the almost pornographic style of the film seems to be a failed attempt to recover from a lack of co ##hesive or effective story . the acting however is not nearly as beast ##ly , many of the young , aspiring , actors ad ##mir ##ably showcase a hidden talent . particularly don lei ##fer ##t and jamie ze ##mare ##l , who shed a well needed sha ##rd of light on this otherwise terrible film . 
night ##be ##ast would have never shown up on set had he known the [SEP]\n", + "INFO:tensorflow:input_ids: 101 2013 6684 7344 9391 23965 1010 2000 2019 4732 2082 2806 10382 5823 1010 1000 2305 4783 14083 1000 2003 2488 6219 2004 1037 2521 19053 2389 4666 1997 8275 2668 1998 6436 3108 1012 1996 2471 26932 2806 1997 1996 2143 3849 2000 2022 1037 3478 3535 2000 8980 2013 1037 3768 1997 2522 21579 2030 4621 2466 1012 1996 3772 2174 2003 2025 3053 2004 6841 2135 1010 2116 1997 1996 2402 1010 22344 1010 5889 4748 14503 8231 13398 1037 5023 5848 1012 3391 2123 26947 7512 2102 1998 6175 27838 24376 2140 1010 2040 8328 1037 2092 2734 21146 4103 1997 2422 2006 2023 4728 6659 2143 1012 2305 4783 14083 2052 2031 2196 3491 2039 2006 2275 2018 2002 2124 1996 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] here we have the in ##imi ##table charlie chaplin for ##sa ##king his slap ##stick past to tackle the serious subject of anti - semi ##tism , and into ##ler ##ance in general . he portrays two characters - the sweet , innocent jewish barber - a war veteran , and the ravi ##ng and ruthless dictator , aden ##oid h ##yn ##kel . the jewish ghetto in this country is not safe for long , due to the w ##him ##s of h ##yn ##kel and his armed thugs , who routinely rough up its residents , or leave them alone , dependent upon his mood that day or week . 
the barber is among them , but is befriended by his former commanding officer [SEP]\n", + "INFO:tensorflow:input_ids: 101 2182 2057 2031 1996 1999 27605 10880 4918 23331 2005 3736 6834 2010 14308 21354 2627 2000 11147 1996 3809 3395 1997 3424 1011 4100 17456 1010 1998 2046 3917 6651 1999 2236 1012 2002 17509 2048 3494 1011 1996 4086 1010 7036 3644 13362 1011 1037 2162 8003 1010 1998 1996 16806 3070 1998 18101 21237 1010 16298 9314 1044 6038 11705 1012 1996 3644 17276 1999 2023 2406 2003 2025 3647 2005 2146 1010 2349 2000 1996 1059 14341 2015 1997 1044 6038 11705 1998 2010 4273 24106 1010 2040 19974 5931 2039 2049 3901 1010 2030 2681 2068 2894 1010 7790 2588 2010 6888 2008 2154 2030 2733 1012 1996 13362 2003 2426 2068 1010 2021 2003 23386 2011 2010 2280 7991 2961 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i really hated this movie and it ' s the first movie written by stephen king that i didn ' t finish . i was truly disappointed , it was the worst crap i ' ve ever seen . what were you thinking making three hours out of it ? it may have a quite good story , but actors ? no . suspense ? no . romance ? no . horror ? no . it didn ' t have anything . < br / > < br / > it ' s got this strange , crazy science man with einstein - hair , the classic thing . not real at all . and a man keep getting younger all the time . 
it seems [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2428 6283 2023 3185 1998 2009 1005 1055 1996 2034 3185 2517 2011 4459 2332 2008 1045 2134 1005 1056 3926 1012 1045 2001 5621 9364 1010 2009 2001 1996 5409 10231 1045 1005 2310 2412 2464 1012 2054 2020 2017 3241 2437 2093 2847 2041 1997 2009 1029 2009 2089 2031 1037 3243 2204 2466 1010 2021 5889 1029 2053 1012 23873 1029 2053 1012 7472 1029 2053 1012 5469 1029 2053 1012 2009 2134 1005 1056 2031 2505 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 1005 1055 2288 2023 4326 1010 4689 2671 2158 2007 15313 1011 2606 1010 1996 4438 2518 1012 2025 2613 2012 2035 1012 1998 1037 2158 2562 2893 3920 2035 1996 2051 1012 2009 3849 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] story chinese tall story tells the story of righteous monk trip ##ita ##ka , who , along with his guardians monkey , sandy and pigs ##y make their journey west on a quest to recover ancient sutra ##s , finally , they reach the final leg of their journey in sha ##che city but all is not as it seems when the city is attacked by evil tree demons . monkey tries his best to battle them but is overwhelmed , knowing his master is in grave danger , he uses his trust ##y golden staff to thrust trip ##ita ##ka to safety . 
< br / > < br / > the monk ends up being knocked out when he land and when he wakes [SEP]\n", + "INFO:tensorflow:input_ids: 101 2466 2822 4206 2466 4136 1996 2466 1997 19556 8284 4440 6590 2912 1010 2040 1010 2247 2007 2010 14240 10608 1010 7525 1998 14695 2100 2191 2037 4990 2225 2006 1037 8795 2000 8980 3418 26567 2015 1010 2633 1010 2027 3362 1996 2345 4190 1997 2037 4990 1999 21146 5403 2103 2021 2035 2003 2025 2004 2009 3849 2043 1996 2103 2003 4457 2011 4763 3392 7942 1012 10608 5363 2010 2190 2000 2645 2068 2021 2003 13394 1010 4209 2010 3040 2003 1999 6542 5473 1010 2002 3594 2010 3404 2100 3585 3095 2000 7400 4440 6590 2912 2000 3808 1012 1026 7987 1013 1028 1026 7987 1013 1028 1996 8284 4515 2039 2108 6573 2041 2043 2002 2455 1998 2043 2002 17507 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "ccp5trMwRtmr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Creating a model\n", + "\n", + "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)." 
+      ]
+    },
+    {
+      "metadata": {
+        "id": "6o2a5ZIvRcJq",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n",
+        "                 num_labels):\n",
+        "  \"\"\"Creates a classification model.\"\"\"\n",
+        "\n",
+        "  bert_module = hub.Module(\n",
+        "      BERT_MODEL_HUB,\n",
+        "      trainable=True)\n",
+        "  bert_inputs = dict(\n",
+        "      input_ids=input_ids,\n",
+        "      input_mask=input_mask,\n",
+        "      segment_ids=segment_ids)\n",
+        "  bert_outputs = bert_module(\n",
+        "      inputs=bert_inputs,\n",
+        "      signature=\"tokens\",\n",
+        "      as_dict=True)\n",
+        "\n",
+        "  # Use \"pooled_output\" for classification tasks on an entire sentence.\n",
+        "  # Use \"sequence_output\" for token-level output.\n",
+        "  output_layer = bert_outputs[\"pooled_output\"]\n",
+        "\n",
+        "  hidden_size = output_layer.shape[-1].value\n",
+        "\n",
+        "  # Create our own layer to tune for our sentiment data.\n",
+        "  output_weights = tf.get_variable(\n",
+        "      \"output_weights\", [num_labels, hidden_size],\n",
+        "      initializer=tf.truncated_normal_initializer(stddev=0.02))\n",
+        "\n",
+        "  output_bias = tf.get_variable(\n",
+        "      \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n",
+        "\n",
+        "  with tf.variable_scope(\"loss\"):\n",
+        "\n",
+        "    # Dropout helps prevent overfitting\n",
+        "    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n",
+        "\n",
+        "    logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n",
+        "    logits = tf.nn.bias_add(logits, output_bias)\n",
+        "    log_probs = tf.nn.log_softmax(logits, axis=-1)\n",
+        "\n",
+        "    # Convert labels into one-hot encoding\n",
+        "    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n",
+        "\n",
+        "    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n",
+        "    # If we're predicting, we want predicted labels and the probabilities.\n",
+        "    if is_predicting:\n",
+        "      return (predicted_labels, log_probs)\n",
+        "\n",
+        "    # If we're train/eval, compute loss between predicted and actual label\n",
+        "    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n",
+        "    loss = tf.reduce_mean(per_example_loss)\n",
+        "    return (loss, predicted_labels, log_probs)\n"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "qpE0ZIDOCQzE",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction."
+ ] + }, + { + "metadata": { + "id": "FnH-AnOQ9KKW", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# model_fn_builder actually creates our model function\n", + "# using the passed parameters for num_labels, learning_rate, etc.\n", + "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", + " num_warmup_steps):\n", + " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", + " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", + " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", + "\n", + " input_ids = features[\"input_ids\"]\n", + " input_mask = features[\"input_mask\"]\n", + " segment_ids = features[\"segment_ids\"]\n", + " label_ids = features[\"label_ids\"]\n", + "\n", + " is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n", + " \n", + " # TRAIN and EVAL\n", + " if not is_predicting:\n", + "\n", + " (loss, predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " train_op = bert.optimization.create_optimizer(\n", + " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", + "\n", + " # Calculate evaluation metrics. \n", + " def metric_fn(label_ids, predicted_labels):\n", + " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", + " f1_score = tf.contrib.metrics.f1_score(\n", + " label_ids,\n", + " predicted_labels)\n", + " auc = tf.metrics.auc(\n", + " label_ids,\n", + " predicted_labels)\n", + " recall = tf.metrics.recall(\n", + " label_ids,\n", + " predicted_labels)\n", + " precision = tf.metrics.precision(\n", + " label_ids,\n", + " predicted_labels) \n", + " true_pos = tf.metrics.true_positives(\n", + " label_ids,\n", + " predicted_labels)\n", + " true_neg = tf.metrics.true_negatives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_pos = tf.metrics.false_positives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_neg = tf.metrics.false_negatives(\n", + " label_ids,\n", + " predicted_labels)\n", + " return {\n", + " \"eval_accuracy\": accuracy,\n", + " \"f1_score\": f1_score,\n", + " \"auc\": auc,\n", + " \"precision\": precision,\n", + " \"recall\": recall,\n", + " \"true_positives\": true_pos,\n", + " \"true_negatives\": true_neg,\n", + " \"false_positives\": false_pos,\n", + " \"false_negatives\": false_neg\n", + " }\n", + "\n", + " eval_metrics = metric_fn(label_ids, predicted_labels)\n", + "\n", + " if mode == tf.estimator.ModeKeys.TRAIN:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " train_op=train_op)\n", + " else:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " eval_metric_ops=eval_metrics)\n", + " else:\n", + " (predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " predictions = {\n", + " 'probabilities': log_probs,\n", + " 'labels': predicted_labels\n", + " }\n", + " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", + "\n", + " # Return the actual model function in the closure\n", + " return model_fn\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OjwJ4bTeWXD8", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute train and warmup steps from batch size\n", + "# These hyperparameters are copied from this colab notebook 
(https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n",
+        "BATCH_SIZE = 32\n",
+        "LEARNING_RATE = 2e-5\n",
+        "NUM_TRAIN_EPOCHS = 3.0\n",
+        "# Warmup is a period of time where the learning rate \n",
+        "# is small and gradually increases--usually helps training.\n",
+        "WARMUP_PROPORTION = 0.1\n",
+        "# Model configs\n",
+        "SAVE_CHECKPOINTS_STEPS = 500\n",
+        "SAVE_SUMMARY_STEPS = 100"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "emHf9GhfWBZ_",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Compute the number of train and warmup steps from batch size\n",
+        "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n",
+        "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "oEJldMr3WYZa",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Specify output directory and number of checkpoint steps to save\n",
+        "run_config = tf.estimator.RunConfig(\n",
+        "    model_dir=OUTPUT_DIR,\n",
+        "    save_summary_steps=SAVE_SUMMARY_STEPS,\n",
+        "    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "q_WebpS1X97v",
+        "colab_type": "code",
+        "outputId": "1648932a-7391-49d3-8af7-52d514e226e8",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 156
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "model_fn = model_fn_builder(\n",
+        "  num_labels=len(label_list),\n",
+        "  learning_rate=LEARNING_RATE,\n",
+        "  num_train_steps=num_train_steps,\n",
+        "  num_warmup_steps=num_warmup_steps)\n",
+        "\n",
+        "estimator = tf.estimator.Estimator(\n",
+        "  model_fn=model_fn,\n",
+        "  config=run_config,\n",
+        "  params={\"batch_size\": BATCH_SIZE})\n"
+      ],
+      "execution_count": 55,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "INFO:tensorflow:Using config: {'_model_dir': 'gs://bert-tfhub/aclImdb_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n",
+            "graph_options {\n",
+            "  rewrite_options {\n",
+            "    meta_optimizer_iterations: ONE\n",
+            "  }\n",
+            "}\n",
+            ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "NOO3RfG1DYLo",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators)."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "1Pv2bAlOX_-K",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Create an input function for training. 
drop_remainder = True for using TPUs.\n", + "train_input_fn = bert.run_classifier.input_fn_builder(\n", + " features=train_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=True,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "t6Nukby2EB6-", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes." + ] + }, + { + "metadata": { + "id": "nucD4gluYJmK", + "colab_type": "code", + "outputId": "5d728e72-4631-42bf-c48d-3f51d4b968ce", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "cell_type": "code", + "source": [ + "print(f'Beginning Training!')\n", + "current_time = datetime.now()\n", + "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", + "print(\"Training took time \", datetime.now() - current_time)" + ], + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Beginning Training!\n", + "INFO:tensorflow:Skipping training since max_steps has already saved.\n", + "Training took time 0:00:00.759709\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "CmbLTVniARy3", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's use our test data to see how well our model did:" + ] + }, + { + "metadata": { + "id": "JIhejfpyJ8Bx", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "test_input_fn = run_classifier.input_fn_builder(\n", + " features=test_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=False,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "PPVEXhNjYXC-", + "colab_type": "code", + "outputId": "dd5482cd-c558-465f-c854-ec11a0175316", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 445 + } + }, + "cell_type": "code", + "source": [ + "estimator.evaluate(input_fn=test_input_fn, steps=None)" + ], + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Starting evaluation at 2019-02-12T21:04:20Z\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n", + "INFO:tensorflow:Finished evaluation at 2019-02-12-21:06:05\n", + "INFO:tensorflow:Saving dict for global step 468: auc = 0.86659324, eval_accuracy = 0.8664, f1_score = 0.8659711, false_negatives = 375.0, false_positives = 293.0, global_step = 468, loss = 0.51870537, precision = 0.880457, recall = 0.8519542, true_negatives = 2174.0, true_positives = 2158.0\n", + "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'auc': 0.86659324,\n", + " 'eval_accuracy': 0.8664,\n", + " 'f1_score': 0.8659711,\n", + " 'false_negatives': 375.0,\n", + " 'false_positives': 293.0,\n", + " 'global_step': 468,\n", + " 'loss': 0.51870537,\n", + " 'precision': 0.880457,\n", + " 'recall': 0.8519542,\n", + " 'true_negatives': 2174.0,\n", + " 'true_positives': 2158.0}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 59 + } + ] + }, + { + "metadata": { + "id": "ueKsULteiz1B", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's write code to make predictions on new sentences:" + ] + }, + { + "metadata": { + "id": "OsrbTD2EJTVl", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def getPrediction(in_sentences):\n", + " labels = [\"Negative\", \"Positive\"]\n", + " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", + " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", + " predictions = estimator.predict(predict_input_fn)\n", + " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "-thbodgih_VJ", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "pred_sentences = [\n", + " \"That movie was absolutely awful\",\n", + " \"The acting was a bit lacking\",\n", + " \"The film was creative and surprising\",\n", + " \"Absolutely fantastic!\"\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QrZmvZySKQTm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 649 + }, + "outputId": "3891fafb-a460-4eb8-fa6c-335a5bbc10e5" + }, + "cell_type": "code", + "source": [ + "predictions = getPrediction(pred_sentences)" + ], + "execution_count": 72, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 4\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] that movie was absolutely awful [SEP]\n", + "INFO:tensorflow:input_ids: 101 2008 3185 2001 7078 9643 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the acting was a bit lacking [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 3772 2001 1037 2978 11158 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the film was creative and surprising [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 2143 2001 5541 1998 11341 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] absolutely fantastic ! 
[SEP]\n", + "INFO:tensorflow:input_ids: 101 7078 10392 999 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "MXkRiEBUqN3n", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Voila! We have a sentiment classifier!" + ] + }, + { + "metadata": { + "id": "ERkTE8-7oQLZ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "outputId": "26c33224-dc2c-4b3d-f7b4-ac3ef0a58b27" + }, + "cell_type": "code", + "source": [ + "predictions" + ], + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('That movie was absolutely awful',\n", + " array([-4.9142293e-03, -5.3180690e+00], dtype=float32),\n", + " 'Negative'),\n", + " ('The acting was a bit lacking',\n", + " array([-0.03325794, -3.4200459 ], dtype=float32),\n", + " 'Negative'),\n", + " ('The film was creative and surprising',\n", + " array([-5.3589125e+00, -4.7171740e-03], dtype=float32),\n", + " 'Positive'),\n", + " ('Absolutely fantastic!',\n", + " array([-5.0434084 , -0.00647258], dtype=float32),\n", + " 'Positive')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 73 + } + ] + } + ] +} \ No newline at end of file diff --git a/baselines/models/bert/requirements.txt b/baselines/models/bert/requirements.txt new file mode 100644 index 0000000..357b5ea --- /dev/null +++ b/baselines/models/bert/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 1.11.0 # CPU Version of TensorFlow. +# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. diff --git a/baselines/models/bert/run_classifier.py b/baselines/models/bert/run_classifier.py new file mode 100644 index 0000000..03b4732 --- /dev/null +++ b/baselines/models/bert/run_classifier.py @@ -0,0 +1,1592 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 10:57:46 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +# Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +# Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, delimiter="\t", quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = f.readlines() + lines = [] + for line in reader: + lines.append(line.strip().split("_!_")) + return lines + +class InewsProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base 
class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = ["0", "1", "2"] + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+  while len(input_ids) < max_seq_length:
+    input_ids.append(0)
+    input_mask.append(0)
+    segment_ids.append(0)
+
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+
+  label_id = label_map[example.label]
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("tokens: %s" % " ".join(
+        [tokenization.printable_text(x) for x in tokens]))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_id=label_id,
+      is_real_example=True)
+
+  return feature
+
+def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length,
+                                   tokenizer):
+  """Converts a single `InputExample` into a list of `InputFeatures`.
+
+  A long second segment is split into chunks, so one example may yield
+  several features that share the same `tokens_a` prefix.
+  """
+
+  if isinstance(example, PaddingInputExample):
+    return [InputFeatures(
+        input_ids=[0] * max_seq_length,
+        input_mask=[0] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)]
+
+  label_map = {}
+  for (i, label) in enumerate(label_list):
+    label_map[label] = i
+
+  tokens_a = tokenizer.tokenize(example.text_a)
+  tokens_b = None
+  if example.text_b:
+    tokens_b = tokenizer.tokenize(example.text_b)
+  must_len = len(tokens_a) + 3
+  extra_len = max_seq_length - must_len
+  feature_list = []
+  if example.text_b and extra_len > 0:
+    extra_num = int((len(tokens_b) - 1) / extra_len) + 1
+    for num in range(extra_num):
+      max_len = min((num + 1) * extra_len, len(tokens_b))
+      tokens_b_sub = tokens_b[num * extra_len: max_len]
+      feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example)
+      feature_list.append(feature)
+  else:
+    feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example)
+    feature_list.append(feature)
+  return feature_list
+
+
+def file_based_convert_examples_to_features_for_inews(
+    examples, label_list, max_seq_length, tokenizer, output_file):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+  num_example = 0
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 1000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature_list = convert_example_list_for_inews(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+    num_example += len(feature_list)
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    # Serialize and write every chunked feature, not just the last one, so
+    # each chunk produced for a long example lands in the TFRecord file.
+    for feature in feature_list:
+      features = collections.OrderedDict()
+      features["input_ids"] = create_int_feature(feature.input_ids)
+      features["input_mask"] = create_int_feature(feature.input_mask)
+      features["segment_ids"] = create_int_feature(feature.segment_ids)
+      features["label_ids"] = create_int_feature([feature.label_id])
+      features["is_real_example"] = create_int_feature(
+          [int(feature.is_real_example)])
+
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+  tf.logging.info("feature num: %s", num_example)
+  writer.close()
+
+class TnewsProcessor(DataProcessor):
"""Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(17): + if i == 5 or i == 11: + continue + labels.append(str(100 + i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + if set_type == "test": + label = tokenization.convert_to_unicode(line[1]) + else: + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class THUCNewsProcessor(DataProcessor): + """Processor for the THUCNews data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(14): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 3: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class iFLYTEKDataProcessor(DataProcessor): + """Processor for the iFLYTEKData data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, 
text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+class XnliProcessor(DataProcessor):
+  """Processor for the XNLI data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(
+        os.path.join(data_dir, "train.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "train-%d" % (i)
+      text_a = tokenization.convert_to_unicode(line[0])
+      text_b = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[2])
+      if label == tokenization.convert_to_unicode("contradictory"):
+        label = tokenization.convert_to_unicode("contradiction")
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "dev-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "test-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+class LCQMCProcessor(DataProcessor):
+  """Processor for the LCQMC data set:
sentence pair classification."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+    # dev_0827.tsv
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+    # return ["-1","0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      # print('#i:',i,line)
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[2])
+        text_a = tokenization.convert_to_unicode(line[0])
+        text_b = tokenization.convert_to_unicode(line[1])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:
+        print('###error.i:', i, line)
+    return examples
+
+class JDCOMMENTProcessor(DataProcessor):
+  """Processor for the JD comment data set: sentence pair classification."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "jd_train.csv"), ",", "\""), "train")
+    # dev_0827.tsv
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "jd_dev.csv"), ",", "\""), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "jd_test.csv"), ",", "\""), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["1", "2", "3", "4", "5"]
+    # return ["-1","0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      # print('#i:',i,line)
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[0])
+        text_a = tokenization.convert_to_unicode(line[1])
+        text_b = tokenization.convert_to_unicode(line[2])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:
+        print('###error.i:', i, line)
+    return examples
+
+
+class BQProcessor(DataProcessor):
+  """Processor for the BQ data set:
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == 
"test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. 
+ tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
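+    # (tf.to_int32(t) below is the TF 1.x shorthand for tf.cast(t, tf.int32).)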
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
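+  # The pooled output is a [batch_size, hidden_size] tensor: the final hidden
+  # state of the [CLS] token run through a dense layer with tanh activation.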
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, 
label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + "tnews": TnewsProcessor, + "inews": InewsProcessor, + "jdcomment": JDCOMMENTProcessor, + "lcqmc": LCQMCProcessor, + "thucnews": THUCNewsProcessor, + "bq": BQProcessor, + "iflydata": iFLYTEKDataProcessor, + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + print("data_dir:", FLAGS.data_dir) + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
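+  # (TPUEstimator hands each input_fn its batch size through
+  # params["batch_size"], which is why the input builders above read the
+  # batch size from params rather than from a flag.)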
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + ## dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
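+    # (After the padding above, len(eval_examples) divides evenly by
+    # eval_batch_size, so this division is exact; e.g. 1,003 examples with a
+    # batch size of 8 are padded to 1,008 and evaluated in 126 steps.)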
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_bert.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    # output_eval_file = os.path.join(FLAGS.output_dir, "dev_results_bert.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+    ## test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_bert.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    # output_eval_file = os.path.join(FLAGS.output_dir, "test_results_bert.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
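+      # (Illustrative numbers: 25,001 test examples with predict_batch_size=8
+      # would get 7 PaddingInputExamples appended; the extra predictions are
+      # skipped again below via `if i >= num_actual_predict_examples`.)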
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + else: + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + index2label_map = {} + for (i, label) in enumerate(label_list): + index2label_map[i] = label + output_predict_file_label = os.path.join(FLAGS.output_dir, "test_results_label.tsv") + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file_label, "w") as writer_label: + with tf.gfile.GFile(output_predict_file, "w") as writer: + writer_label.write("predict_label" + "\n") + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, prediction) in enumerate(result): + probabilities = prediction["probabilities"] + label_index = probabilities.argmax(0) + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + writer_label.write(str(index2label_map[label_index]) + "\n") + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert/run_classifier_bq.sh b/baselines/models/bert/run_classifier_bq.sh new file mode 100644 index 0000000..9b4bcad --- /dev/null +++ b/baselines/models/bert/run_classifier_bq.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-07 23:38:36 + +TASK_NAME="bq" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/bq.zip + unzip bq.zip + rm bq.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! 
-d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_iflydata.sh b/baselines/models/bert/run_classifier_iflydata.sh new file mode 100644 index 0000000..03fb870 --- /dev/null +++ b/baselines/models/bert/run_classifier_iflydata.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 00:07:39 + +TASK_NAME="iflydata" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
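+# Optional (a sketch, not part of the original script): run_classifier.py
+# also defines --do_predict; assuming training has finished, re-running the
+# command below with the extra flag
+#   --do_predict=true \
+# writes test_results.tsv and test_results_label.tsv to the output directory.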
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_inews.sh b/baselines/models/bert/run_classifier_inews.sh new file mode 100755 index 0000000..9a6ca15 --- /dev/null +++ b/baselines/models/bert/run_classifier_inews.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 00:10:40 + +TASK_NAME="inews" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
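+# Note: for TASK_NAME="inews" the test examples are converted with
+# file_based_convert_examples_to_features_for_inews (see the do_predict
+# branch of run_classifier.py above); the flags below are otherwise the same
+# as for the other tasks.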
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_lcqmc.sh b/baselines/models/bert/run_classifier_lcqmc.sh new file mode 100644 index 0000000..b1b0e89 --- /dev/null +++ b/baselines/models/bert/run_classifier_lcqmc.sh @@ -0,0 +1,71 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 12:34:25 + +TASK_NAME="lcqmc" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
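+# Optional sanity check (a sketch, assuming the wget calls above succeeded):
+# abort early if any LCQMC split is missing or empty.
+for f in train.txt dev.txt test.txt; do
+  if [ ! -s "$GLUE_DATA_DIR/$TASK_NAME/$f" ]; then
+    echo "missing or empty $f; please re-run the download step" && exit 1
+  fi
+done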
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_thucnews.sh b/baselines/models/bert/run_classifier_thucnews.sh new file mode 100644 index 0000000..678d836 --- /dev/null +++ b/baselines/models/bert/run_classifier_thucnews.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 00:12:27 + +TASK_NAME="thucnews" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
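+# Note: every checkpoint saved under ${TASK_NAME}_output/ is evaluated by the
+# checkpoint loop in run_classifier.py above, and the per-checkpoint dev
+# results are written to dev_results_bert.txt in the task's data directory.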
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_tnews.sh b/baselines/models/bert/run_classifier_tnews.sh new file mode 100644 index 0000000..2810f12 --- /dev/null +++ b/baselines/models/bert/run_classifier_tnews.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 00:13:10 + +TASK_NAME="tnews" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_classifier_with_tfhub.py b/baselines/models/bert/run_classifier_with_tfhub.py new file mode 100644 index 0000000..9d2f80f --- /dev/null +++ b/baselines/models/bert/run_classifier_with_tfhub.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner with TF-Hub.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import optimization +import run_classifier +import tokenization +import tensorflow as tf +import tensorflow_hub as hub + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "bert_hub_module_handle", None, + "Handle for the BERT TF-Hub module.") + + +def create_model(is_training, input_ids, input_mask, segment_ids, labels, + num_labels, bert_hub_module_handle): + """Creates a classification model.""" + tags = set() + if is_training: + tags.add("train") + bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) + bert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + bert_outputs = bert_module( + inputs=bert_inputs, + signature="tokens", + as_dict=True) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use + # bert_outputs["sequence_output"] instead. + output_layer = bert_outputs["pooled_output"] + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(num_labels, learning_rate, num_train_steps, + num_warmup_steps, use_tpu, bert_hub_module_handle): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, + bert_hub_module_handle) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + 
loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics) + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions={"probabilities": probabilities}) + else: + raise ValueError( + "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def create_tokenizer_from_hub_module(bert_hub_module_handle): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + bert_module = hub.Module(bert_hub_module_handle) + tokenization_info = bert_module(signature="tokenization_info", as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + return tokenization.FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": run_classifier.ColaProcessor, + "mnli": run_classifier.MnliProcessor, + "mrpc": run_classifier.MrpcProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + num_labels=len(label_list), + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + bert_hub_module_handle=FLAGS.bert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_features = run_classifier.convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = run_classifier.input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_features = run_classifier.convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. + if FLAGS.use_tpu: + # Eval will be slightly WRONG on the TPU because it will truncate + # the last batch. + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = run_classifier.input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + if FLAGS.use_tpu: + # Discard batch remainder if running on TPU + n = len(predict_examples) + predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)] + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + run_classifier.file_based_convert_examples_to_features( + predict_examples, label_list, FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = run_classifier.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=FLAGS.use_tpu) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + tf.logging.info("***** Predict results *****") + for prediction in result: + probabilities = prediction["probabilities"] + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + 
flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("bert_hub_module_handle") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert/run_classifier_xnli.sh b/baselines/models/bert/run_classifier_xnli.sh new file mode 100644 index 0000000..d6684bc --- /dev/null +++ b/baselines/models/bert/run_classifier_xnli.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-08 00:13:55 + +TASK_NAME="xnli" +MODEL_NAME="chinese_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_PRETRAINED_MODELS_DIR ]; then + mkdir -p $BERT_PRETRAINED_MODELS_DIR + echo "makedir $BERT_PRETRAINED_MODELS_DIR" +fi +cd $BERT_PRETRAINED_MODELS_DIR +if [ ! -d $MODEL_NAME ]; then + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip +else + cd $MODEL_NAME + if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + cd .. + rm -rf $MODEL_NAME + wget https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip + unzip chinese_L-12_H-768_A-12.zip + rm chinese_L-12_H-768_A-12.zip + else + echo "model exists" + fi +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_ner.py b/baselines/models/bert/run_ner.py new file mode 100644 index 0000000..8457e92 --- /dev/null +++ b/baselines/models/bert/run_ner.py @@ -0,0 +1,852 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-06 17:40:44 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-07 10:28:29 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import modeling
+import optimization
+import tokenization
+import tensorflow as tf
+from sklearn.metrics import f1_score, precision_score, recall_score
+from tensorflow.python.ops import math_ops
+import tf_metrics
+import pickle
+import codecs
+import sys
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+ "data_dir", None,
+ "The input data dir.",
+)
+
+flags.DEFINE_string(
+ "bert_config_file", None,
+ "The config json file corresponding to the pre-trained BERT model."
+)
+
+flags.DEFINE_string(
+ "task_name", None, "The name of the task to train."
+)
+
+flags.DEFINE_string(
+ "token_name", "full", "The name of the tokenization scheme to use."
+)
+
+flags.DEFINE_string(
+ "output_dir", None,
+ "The output directory where the model checkpoints will be written."
+)
+
+# Other parameters
+flags.DEFINE_string(
+ "init_checkpoint", None,
+ "Initial checkpoint (usually from a pre-trained BERT model)."
+)
+
+flags.DEFINE_bool(
+ "do_lower_case", True,
+ "Whether to lower case the input text."
+)
+
+flags.DEFINE_integer(
+ "max_seq_length", 128,
+ "The maximum total input sequence length after WordPiece tokenization."
+)
+
+flags.DEFINE_bool(
+ "do_train", False,
+ "Whether to run training."
+)
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool("do_predict", False,
+ "Whether to run the model in inference mode on the test set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+ "warmup_proportion", 0.1,
+ "Proportion of training to perform linear learning rate warmup for. "
+ "E.g., 0.1 = 10% of training.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+ "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+ "How many steps to make in each estimator call.")
+
+flags.DEFINE_string("vocab_file", None,
+ "The vocabulary file that the BERT model was trained on.")
+tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
+# TPU location flags referenced by main() below (cf. run_pretraining.py).
+tf.flags.DEFINE_string(
+ "tpu_name", None,
+ "The Cloud TPU to use for training. This should be either the name "
+ "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+ "url.")
+tf.flags.DEFINE_string(
+ "tpu_zone", None,
+ "[Optional] GCE zone where the Cloud TPU is located in. If not "
+ "specified, we will attempt to automatically detect the GCE project from "
+ "metadata.")
+tf.flags.DEFINE_string(
+ "gcp_project", None,
+ "[Optional] Project name for the Cloud TPU-enabled project. If not "
+ "specified, we will attempt to automatically detect the GCE project from "
+ "metadata.")
+flags.DEFINE_integer(
+ "num_tpu_cores", 8,
+ "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+
+class InputExample(object):
+ """A single training/test example for simple sequence classification."""
+
+ def __init__(self, guid, text, label=None):
+ """Constructs an InputExample.
+
+ Args:
+ guid: Unique id for the example.
+ text: string. The untokenized text of the sequence.
For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 
'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + tokens = contends.split() + if len(tokens) == 2: + words.append(tokens[0]) + label = tokens[-1] + if label[0] == 'B': + label = "I" + label[1:] + labels.append(label) + else: + if len(contends) == 0 and len(words) > 0: + label = [] + word = [] + for l, w in zip(labels, words): + if len(l) > 0 and len(w) > 0: + label.append(l) + # self.labels.add(l) + word.append(w) + lines.append([' '.join(label), ' '.join(word)]) + words = [] + labels = [] + continue + if contends.startswith("-DOCSTART-"): + continue + + return lines + + +class MsraNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "train1.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "test") + + def get_labels(self): + return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + chars = [] + labels = [] + len_count = [] + for line in f: + contends = line.strip() + tokens = contends.split() + for token in tokens: + word, label = token.split('/') + + if label == "nr": + chars = chars + list(word) + labels = labels + ['B-PERSON'] + ['I-PERSON'] * (len(word) - 1) + elif label == "ns": + chars = chars + list(word) + labels = labels + ['B-LOCATION'] + ['I-LOCATION'] * (len(word) - 1) + elif label == "nt": + chars = chars + list(word) + labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION'] * (len(word) - 1) + else: + assert label == "o" + chars = chars + list(word) + labels = labels + ["O"] * len(word) + lines.append([' '.join(labels), ' '.join(chars)]) + len_count.append(len(chars)) + chars = [] + labels = [] + return lines + + +def write_tokens(tokens, mode): + if mode == "test": + path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt") + wf = open(path, 'a') + for token in tokens: + if token != "**NULL**": + wf.write(token + '\n') + wf.close() + + +def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode): + label_map = {} + for (i, label) in enumerate(label_list, 1): + label_map[label] = i + + if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')): + with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w: + pickle.dump(label_map, w) 
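+ # Note (illustration): enumerate(label_list, 1) above makes the label ids
+ # 1-based, so id 0 stays free for sequence padding; e.g. a hypothetical
+ # label_list = ["B-PERSON", "I-PERSON", "O"] yields
+ # label_map = {"B-PERSON": 1, "I-PERSON": 2, "O": 3}.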
+ textlist = example.text.split(' ')
+ labellist = example.label.split(' ')
+ tokens = []
+ labels = []
+ label_mask = []
+ for i, word in enumerate(textlist):
+ token = tokenizer.tokenize(word)
+ tokens.extend(token)
+ label_1 = labellist[i]
+ for m in range(len(token)):
+ if m == 0:
+ labels.append(label_1)
+ else:
+ labels.append("X")
+
+ if len(tokens) >= max_seq_length - 1:
+ tokens = tokens[0:(max_seq_length - 2)]
+ labels = labels[0:(max_seq_length - 2)]
+ ntokens = []
+ segment_ids = []
+ label_ids = []
+ ntokens.append("[CLS]")
+ segment_ids.append(0)
+ # "[CLS]" gets its own label id here; appending "O" instead would also work
+ label_ids.append(label_map["[CLS]"])
+ label_mask.append(0) # masked out: not used for training or prediction
+ for i, token in enumerate(tokens):
+ ntokens.append(token)
+ segment_ids.append(0)
+ label_ids.append(label_map[labels[i]])
+ if labels[i] == 'X':
+ label_mask.append(0)
+ else:
+ label_mask.append(1)
+ ntokens.append("[SEP]")
+ segment_ids.append(0)
+ label_mask.append(0)
+ # "[SEP]" gets its own label id here; appending "O" instead would also work
+ label_ids.append(label_map["[SEP]"])
+ input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+ input_mask = [1] * len(input_ids)
+ while len(input_ids) < max_seq_length:
+ input_ids.append(0)
+ input_mask.append(0)
+ segment_ids.append(0)
+ # padding positions get label id 0 and are masked out below
+ label_ids.append(0)
+ ntokens.append("**NULL**")
+ label_mask.append(0)
+ assert len(input_ids) == max_seq_length
+ assert len(input_mask) == max_seq_length
+ assert len(segment_ids) == max_seq_length
+ assert len(label_ids) == max_seq_length
+ assert len(label_mask) == max_seq_length
+
+ if ex_index < 5:
+ tf.logging.info("*** Example ***")
+ tf.logging.info("guid: %s" % (example.guid))
+ tf.logging.info("tokens: %s" % " ".join(
+ [tokenization.printable_text(x) for x in tokens]))
+ tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+ tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+ tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+ tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+ tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+ feature = InputFeatures(
+ input_ids=input_ids,
+ input_mask=input_mask,
+ segment_ids=segment_ids,
+ label_ids=label_ids,
+ label_mask=label_mask
+ )
+ write_tokens(ntokens, mode)
+ return feature
+
+
+def file_based_convert_examples_to_features(
+ examples, label_list, max_seq_length, tokenizer, output_file, output_dir, mode=None):
+ writer = tf.python_io.TFRecordWriter(output_file)
+ for (ex_index, example) in enumerate(examples):
+ if ex_index % 5000 == 0:
+ tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+ feature = convert_single_example(
+ ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode)
+
+ def create_int_feature(values):
+ f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+ return f
+
+ features = collections.OrderedDict()
+ features["input_ids"] = create_int_feature(feature.input_ids)
+ features["input_mask"] = create_int_feature(feature.input_mask)
+ features["segment_ids"] = create_int_feature(feature.segment_ids)
+ features["label_ids"] = create_int_feature(feature.label_ids)
+ features["label_mask"] = create_int_feature(feature.label_mask)
+ tf_example =
tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + + +def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder): + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_mask": tf.FixedLenFeature([seq_length], tf.int64), + } + + def _decode_record(record, name_to_features): + example = tf.parse_single_example(record, name_to_features) + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + return example + + def input_fn(params): + batch_size = params["batch_size"] + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + d = d.apply(tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder + )) + return d + + return input_fn + + +def create_model(bert_config, is_training, input_ids, input_mask, label_mask, + segment_ids, labels, num_labels, use_one_hot_embeddings): + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings + ) + + output_layer = model.get_sequence_output() + + hidden_size = output_layer.shape[-1].value + + output_weight = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02) + ) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer() + ) + with tf.variable_scope("loss"): + if is_training: + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + output_layer = tf.reshape(output_layer, [-1, hidden_size]) + logits = tf.matmul(output_layer, output_weight, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) + # mask = tf.cast(input_mask,tf.float32) + # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) + # return (loss, logits, predict) + ########################################################################## + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + mask = tf.cast(label_mask, tf.float32) + mask_example_loss = per_example_loss * mask + loss = tf.reduce_sum(mask_example_loss) + probabilities = tf.nn.softmax(logits, axis=-1) + predict = tf.argmax(probabilities, axis=-1) + return (loss, mask_example_loss, logits, predict) + ########################################################################## + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + def model_fn(features, labels, mode, params): + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + label_mask = features["label_mask"] + is_training = (mode == 
tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = tf.trainable_variables() + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, + init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + if use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.logging.info("**** Trainable Variables ****") + + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + hook_dict = {} + hook_dict['loss'] = total_loss + hook_dict['global_steps'] = tf.train.get_or_create_global_step() + logging_hook = tf.train.LoggingTensorHook( + hook_dict, every_n_iter=200) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn, + training_hooks=[logging_hook]) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + # def metric_fn(label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + # labels = [] + # for i, x in enumerate() + predict_labels = [] + # for i in range(1, num_labels - 4): + # predict_labels.append(i) + # precision = tf_metrics.precision(label_ids, predictions, num_labels, predict_labels, average="macro") + # recall = tf_metrics.recall(label_ids, predictions, num_labels, predict_labels, average="macro") + # f = tf_metrics.f1(label_ids, predictions, num_labels, predict_labels, average="macro") + + precision = tf_metrics.precision( + label_ids, predictions, num_labels, average="macro") + recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro") + f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro") + + # + return { + "eval_precision": precision, + "eval_recall": recall, + "eval_f": f, + # "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + # eval_metrics = (metric_fn, [label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predicts, scaffold_fn=scaffold_fn + ) + return output_spec + + return model_fn + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + processors = { + "ner": NerProcessor, + "weiboner": WeiboNERProcessor, + "msraner": MsraNERProcessor + } + # if not FLAGS.do_train and not FLAGS.do_eval: + # raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) 
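+ # Worked example (hypothetical numbers): with 40,000 training examples,
+ # train_batch_size=16 and num_train_epochs=5.0, the computation below gives
+ # num_train_steps = int(40000 / 16 * 5.0) = 12500 and, with the default
+ # warmup_proportion of 0.1, num_warmup_steps = 1250.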
+ if not os.path.exists(FLAGS.output_dir): + os.mkdir(FLAGS.output_dir) + task_name = FLAGS.task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + print(num_train_steps) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list) + 1, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + eval_steps = None + if FLAGS.use_tpu: + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as 
writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + + pred_tags = [] + true_tags = [] + + token_path = os.path.join(FLAGS.output_dir, "token_test.txt") + label_file = os.path.join(FLAGS.output_dir, "label2id.pkl") + label_masks = [] + with open(label_file, "rb") as rf: + label2id = pickle.load(rf) + id2label = {value: key for key, value in label2id.items()} + if os.path.exists(token_path): + os.remove(token_path) + predict_examples = processor.get_test_examples(FLAGS.data_dir) + ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt") + with open(ground_truth_file, 'w') as writer: + for ex_index, example in enumerate(predict_examples): + feature = convert_single_example( + ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test") + line = [] + for i, id in enumerate(feature.label_ids): + if feature.label_mask[i] == 1: + line.append(id2label[id]) + true_tags.append(id2label[id]) + # output_line = " ".join(id2label[id] for id in feature.label_ids if id != 0) + "\n" + output_line = " ".join(line) + "\n" + writer.write(output_line) + label_masks.append(feature.label_mask) + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, FLAGS.output_dir, mode="test") + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + if FLAGS.use_tpu: + # Warning: According to tpu_estimator.py Prediction on TPU is an + # experimental feature and hence not supported here + raise ValueError("Prediction in TPU not supported") + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") + + with open(output_predict_file, 'w') as writer: + for i, prediction in enumerate(result): + line = [] + for j, x in enumerate(prediction): + if label_masks[i][j] == 0: + continue + else: + line.append(id2label[x]) + # writer.write(id2label[x] + "\n") + pred_tags.append(id2label[x]) + output_line = " ".join(line) + "\n" + # # output_line = " ".join(id2label[id] for id in prediction if id != 0) + "\n" + writer.write(output_line) + # evaluate(true_tags, pred_tags, verbose=True) + # evaluate(true_tags, pred_tags) + + tmp = codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8') + with codecs.open(ground_truth_file, 'r', 'utf8') as ft, codecs.open(output_predict_file, 'r', 'utf8') as fg: + for lt, lg in zip(ft, fg): + for tl, tg in zip(lt.strip().split(), lg.strip().split()): + print('\t'.join([" ", tl, tg]), file=tmp) + tmp.close() + cmd = "python %s -d '\t' < %s > %s" % \ + (os.path.join(os.getcwd(), "conlleval.py"), + os.path.join(FLAGS.output_dir, "tmp"), + os.path.join(FLAGS.data_dir, "test_results_bert.txt")) + os.system(cmd) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + 
flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert/run_ner_msra.sh b/baselines/models/bert/run_ner_msra.sh new file mode 100644 index 0000000..c839288 --- /dev/null +++ b/baselines/models/bert/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/chinese_L-12_H-768_A-12 +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert/run_pretraining.py b/baselines/models/bert/run_pretraining.py new file mode 100644 index 0000000..b118f62 --- /dev/null +++ b/baselines/models/bert/run_pretraining.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
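+    # The projection below ("transform" plus the tied-weight matmul further
+    # down) maps the gathered tensor of shape
+    # [batch_size * max_predictions_per_seq, hidden_size] onto vocabulary
+    # logits by multiplying against the transposed embedding table, so the
+    # input and output embeddings share weights.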
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "masked_lm_positions":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_ids":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_weights":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
+        "next_sentence_labels":
+            tf.FixedLenFeature([1], tf.int64),
+    }
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
+      d = d.repeat()
+      d = d.shuffle(buffer_size=len(input_files))
+
+      # `cycle_length` is the number of parallel files that get read.
+      cycle_length = min(num_cpu_threads, len(input_files))
+
+      # `sloppy` mode means that the interleaving is not exact. This adds
+      # even more randomness to the training pipeline.
+      d = d.apply(
+          tf.contrib.data.parallel_interleave(
+              tf.data.TFRecordDataset,
+              sloppy=is_training,
+              cycle_length=cycle_length))
+      d = d.shuffle(buffer_size=100)
+    else:
+      d = tf.data.TFRecordDataset(input_files)
+      # Since we evaluate for a fixed number of steps we don't want to
+      # encounter out-of-range exceptions.
+      d = d.repeat()
+
+    # We must `drop_remainder` on training because the TPU requires fixed
+    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
+    # and we *don't* want to drop the remainder, otherwise we won't cover
+    # every sample.
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            num_parallel_batches=num_cpu_threads,
+            drop_remainder=True))
+    return d
+
+  return input_fn
+
+
+def _decode_record(record, name_to_features):
+  """Decodes a record to a TensorFlow example."""
+  example = tf.parse_single_example(record, name_to_features)
+
+  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+  # So cast all int64 to int32.
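+  # The cast is lossless here: these int64 features hold token ids, positions
+  # and 0/1 labels, all far below 2**31, so only the dtype changes.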
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert/run_squad.py b/baselines/models/bert/run_squad.py new file mode 100644 index 0000000..edd4c3e --- /dev/null +++ b/baselines/models/bert/run_squad.py @@ -0,0 +1,1283 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import math +import os +import random +import modeling +import optimization +import tokenization +import six +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. 
This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +flags.DEFINE_float( + "null_score_diff_threshold", 0.0, + "If null_score - best_non_null is greater than the threshold predict null.") + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " 
" or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+    for (doc_span_index, doc_span) in enumerate(doc_spans):
+      tokens = []
+      token_to_orig_map = {}
+      token_is_max_context = {}
+      segment_ids = []
+      tokens.append("[CLS]")
+      segment_ids.append(0)
+      for token in query_tokens:
+        tokens.append(token)
+        segment_ids.append(0)
+      tokens.append("[SEP]")
+      segment_ids.append(0)
+
+      for i in range(doc_span.length):
+        split_token_index = doc_span.start + i
+        token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+
+        is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+                                               split_token_index)
+        token_is_max_context[len(tokens)] = is_max_context
+        tokens.append(all_doc_tokens[split_token_index])
+        segment_ids.append(1)
+      tokens.append("[SEP]")
+      segment_ids.append(1)
+
+      input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+      # The mask has 1 for real tokens and 0 for padding tokens. Only real
+      # tokens are attended to.
+      input_mask = [1] * len(input_ids)
+
+      # Zero-pad up to the sequence length.
+      while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+
+      assert len(input_ids) == max_seq_length
+      assert len(input_mask) == max_seq_length
+      assert len(segment_ids) == max_seq_length
+
+      start_position = None
+      end_position = None
+      if is_training and not example.is_impossible:
+        # For training, if our document chunk does not contain an annotation
+        # we throw it out, since there is nothing to predict.
+        doc_start = doc_span.start
+        doc_end = doc_span.start + doc_span.length - 1
+        out_of_span = False
+        if not (tok_start_position >= doc_start and
+                tok_end_position <= doc_end):
+          out_of_span = True
+        if out_of_span:
+          start_position = 0
+          end_position = 0
+        else:
+          doc_offset = len(query_tokens) + 2
+          start_position = tok_start_position - doc_start + doc_offset
+          end_position = tok_end_position - doc_start + doc_offset
+
+      if is_training and example.is_impossible:
+        start_position = 0
+        end_position = 0
+
+      if example_index < 20:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("unique_id: %s" % (unique_id))
+        tf.logging.info("example_index: %s" % (example_index))
+        tf.logging.info("doc_span_index: %s" % (doc_span_index))
+        tf.logging.info("tokens: %s" % " ".join(
+            [tokenization.printable_text(x) for x in tokens]))
+        tf.logging.info("token_to_orig_map: %s" % " ".join(
+            ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
+        tf.logging.info("token_is_max_context: %s" % " ".join([
+            "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
+        ]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info(
+            "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info(
+            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+        if is_training and example.is_impossible:
+          tf.logging.info("impossible example")
+        if is_training and not example.is_impossible:
+          answer_text = " ".join(tokens[start_position:(end_position + 1)])
+          tf.logging.info("start_position: %d" % (start_position))
+          tf.logging.info("end_position: %d" % (end_position))
+          tf.logging.info(
+              "answer: %s" % (tokenization.printable_text(answer_text)))
+
+      feature = InputFeatures(
+          unique_id=unique_id,
+          example_index=example_index,
+          doc_span_index=doc_span_index,
+          tokens=tokens,
+          token_to_orig_map=token_to_orig_map,
+          token_is_max_context=token_is_max_context,
+          input_ids=input_ids,
+          input_mask=input_mask,
+          segment_ids=segment_ids,
+          start_position=start_position,
+          end_position=end_position,
+          is_impossible=example.is_impossible)
+
+      # Run callback
+      output_fn(feature)
+
+      unique_id += 1
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+  """Returns tokenized answer spans that better match the annotated answer."""
+
+  # The SQuAD annotations are character based. We first project them to
+  # whitespace-tokenized words. But then after WordPiece tokenization, we can
+  # often find a "better match". For example:
+  #
+  #   Question: What year was John Smith born?
+  #   Context: The leader was John Smith (1895-1943).
+  #   Answer: 1895
+  #
+  # The original whitespace-tokenized answer will be "(1895-1943).". However
+  # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+  # the exact answer, 1895.
+  #
+  # However, this is not always possible. Consider the following:
+  #
+  #   Question: What country is the top exporter of electronics?
+  #   Context: The Japanese electronics industry is the largest in the world.
+ # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for 
TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def input_fn_builder(input_file, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports 
tf.int32.
+    # So cast all int64 to int32.
+    for name in list(example.keys()):
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.to_int32(t)
+      example[name] = t
+
+    return example
+
+  def input_fn(params):
+    """The actual input function."""
+    batch_size = params["batch_size"]
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    d = tf.data.TFRecordDataset(input_file)
+    if is_training:
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+RawResult = collections.namedtuple("RawResult",
+                                   ["unique_id", "start_logits", "end_logits"])
+
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file):
+  """Write final predictions to the json file and log-odds of null if needed."""
+  tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
+  tf.logging.info("Writing nbest to: %s" % (output_nbest_file))
+
+  example_index_to_features = collections.defaultdict(list)
+  for feature in all_features:
+    example_index_to_features[feature.example_index].append(feature)
+
+  unique_id_to_result = {}
+  for result in all_results:
+    unique_id_to_result[result.unique_id] = result
+
+  _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+      "PrelimPrediction",
+      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+
+  all_predictions = collections.OrderedDict()
+  all_nbest_json = collections.OrderedDict()
+  scores_diff_json = collections.OrderedDict()
+
+  for (example_index, example) in enumerate(all_examples):
+    features = example_index_to_features[example_index]
+
+    prelim_predictions = []
+    # keep track of the minimum score of null start+end of position 0
+    score_null = 1000000  # large and positive
+    min_null_feature_index = 0  # the paragraph slice with min null score
+    null_start_logit = 0  # the start logit at the slice with min null score
+    null_end_logit = 0  # the end logit at the slice with min null score
+    for (feature_index, feature) in enumerate(features):
+      result = unique_id_to_result[feature.unique_id]
+      start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+      end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+      # if we could have irrelevant answers, get the min score of irrelevant
+      if FLAGS.version_2_with_negative:
+        feature_null_score = result.start_logits[0] + result.end_logits[0]
+        if feature_null_score < score_null:
+          score_null = feature_null_score
+          min_null_feature_index = feature_index
+          null_start_logit = result.start_logits[0]
+          null_end_logit = result.end_logits[0]
+      for start_index in start_indexes:
+        for end_index in end_indexes:
+          # We could hypothetically create invalid predictions, e.g., predict
+          # that the start of the span is in the question. We throw out all
+          # invalid predictions.
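+          # For example, with tokens [CLS] q1 q2 [SEP] d1 d2 [SEP], only
+          # document positions are present in token_to_orig_map, so a
+          # start_index that points into the question fails the checks below
+          # and is skipped.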
+          if start_index >= len(feature.tokens):
+            continue
+          if end_index >= len(feature.tokens):
+            continue
+          if start_index not in feature.token_to_orig_map:
+            continue
+          if end_index not in feature.token_to_orig_map:
+            continue
+          if not feature.token_is_max_context.get(start_index, False):
+            continue
+          if end_index < start_index:
+            continue
+          length = end_index - start_index + 1
+          if length > max_answer_length:
+            continue
+          prelim_predictions.append(
+              _PrelimPrediction(
+                  feature_index=feature_index,
+                  start_index=start_index,
+                  end_index=end_index,
+                  start_logit=result.start_logits[start_index],
+                  end_logit=result.end_logits[end_index]))
+
+    if FLAGS.version_2_with_negative:
+      prelim_predictions.append(
+          _PrelimPrediction(
+              feature_index=min_null_feature_index,
+              start_index=0,
+              end_index=0,
+              start_logit=null_start_logit,
+              end_logit=null_end_logit))
+    prelim_predictions = sorted(
+        prelim_predictions,
+        key=lambda x: (x.start_logit + x.end_logit),
+        reverse=True)
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+    seen_predictions = {}
+    nbest = []
+    for pred in prelim_predictions:
+      if len(nbest) >= n_best_size:
+        break
+      feature = features[pred.feature_index]
+      if pred.start_index > 0:  # this is a non-null prediction
+        tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+        orig_doc_start = feature.token_to_orig_map[pred.start_index]
+        orig_doc_end = feature.token_to_orig_map[pred.end_index]
+        orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+        tok_text = " ".join(tok_tokens)
+
+        # De-tokenize WordPieces that have been split off.
+        tok_text = tok_text.replace(" ##", "")
+        tok_text = tok_text.replace("##", "")
+
+        # Clean whitespace
+        tok_text = tok_text.strip()
+        tok_text = " ".join(tok_text.split())
+        orig_text = " ".join(orig_tokens)
+
+        final_text = get_final_text(tok_text, orig_text, do_lower_case)
+        if final_text in seen_predictions:
+          continue
+
+        seen_predictions[final_text] = True
+      else:
+        final_text = ""
+        seen_predictions[final_text] = True
+
+      nbest.append(
+          _NbestPrediction(
+              text=final_text,
+              start_logit=pred.start_logit,
+              end_logit=pred.end_logit))
+
+    # if we didn't include the empty option in the n-best, include it
+    if FLAGS.version_2_with_negative:
+      if "" not in seen_predictions:
+        nbest.append(
+            _NbestPrediction(
+                text="", start_logit=null_start_logit,
+                end_logit=null_end_logit))
+    # In very rare edge cases we could have no valid predictions. So we
+    # just create a nonce prediction in this case to avoid failure.
+    if not nbest:
+      nbest.append(
+          _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+    assert len(nbest) >= 1
+
+    total_scores = []
+    best_non_null_entry = None
+    for entry in nbest:
+      total_scores.append(entry.start_logit + entry.end_logit)
+      if not best_non_null_entry:
+        if entry.text:
+          best_non_null_entry = entry
+
+    probs = _compute_softmax(total_scores)
+
+    nbest_json = []
+    for (i, entry) in enumerate(nbest):
+      output = collections.OrderedDict()
+      output["text"] = entry.text
+      output["probability"] = probs[i]
+      output["start_logit"] = entry.start_logit
+      output["end_logit"] = entry.end_logit
+      nbest_json.append(output)
+
+    assert len(nbest_json) >= 1
+
+    if not FLAGS.version_2_with_negative:
+      all_predictions[example.qas_id] = nbest_json[0]["text"]
+    else:
+      # predict "" iff the null score - the score of best non-null > threshold
+      score_diff = score_null - best_non_null_entry.start_logit - (
+          best_non_null_entry.end_logit)
+      scores_diff_json[example.qas_id] = score_diff
+      if score_diff > FLAGS.null_score_diff_threshold:
+        all_predictions[example.qas_id] = ""
+      else:
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+    all_nbest_json[example.qas_id] = nbest_json
+
+  with tf.gfile.GFile(output_prediction_file, "w") as writer:
+    writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+  with tf.gfile.GFile(output_nbest_file, "w") as writer:
+    writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+  if FLAGS.version_2_with_negative:
+    with tf.gfile.GFile(output_null_log_odds_file, "w") as writer:
+      writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def get_final_text(pred_text, orig_text, do_lower_case):
+  """Project the tokenized prediction back to the original text."""
+
+  # When we created the data, we kept track of the alignment between original
+  # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+  # now `orig_text` contains the span of our original text corresponding to the
+  # span that we predicted.
+  #
+  # However, `orig_text` may contain extra characters that we don't want in
+  # our prediction.
+  #
+  # For example, let's say:
+  #   pred_text = steve smith
+  #   orig_text = Steve Smith's
+  #
+  # We don't want to return `orig_text` because it contains the extra "'s".
+  #
+  # We don't want to return `pred_text` because it's already been normalized
+  # (the SQuAD eval script also does punctuation stripping/lower casing but
+  # our tokenizer does additional normalization like stripping accent
+  # characters).
+  #
+  # What we really want to return is "Steve Smith".
+  #
+  # Therefore, we have to apply a semi-complicated alignment heuristic between
+  # `pred_text` and `orig_text` to get a character-to-character alignment. This
+  # can fail in certain cases in which case we just return `orig_text`.
+
+  def _strip_spaces(text):
+    ns_chars = []
+    ns_to_s_map = collections.OrderedDict()
+    for (i, c) in enumerate(text):
+      if c == " ":
+        continue
+      ns_to_s_map[len(ns_chars)] = i
+      ns_chars.append(c)
+    ns_text = "".join(ns_chars)
+    return (ns_text, ns_to_s_map)
+
+  # We first tokenize `orig_text`, strip whitespace from the result
+  # and `pred_text`, and check if they are the same length. If they are
+  # NOT the same length, the heuristic has failed. If they are the same
+  # length, we assume the characters are one-to-one aligned.
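+  # As a concrete trace, _strip_spaces("a  b") returns ("ab", {0: 0, 1: 3}):
+  # position 1 of the stripped string maps back to position 3 of the
+  # original, which is how the predicted character offsets are projected
+  # back onto `orig_text` below.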
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if FLAGS.verbose_logging: + tf.logging.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if FLAGS.verbose_logging: + tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + self._writer = tf.python_io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 
0
+      if feature.is_impossible:
+        impossible = 1
+      features["is_impossible"] = create_int_feature([impossible])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    self._writer.write(tf_example.SerializeToString())
+
+  def close(self):
+    self._writer.close()
+
+
+def validate_flags_or_throw(bert_config):
+  """Validate the input FLAGS or throw an exception."""
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
+  if not FLAGS.do_train and not FLAGS.do_predict:
+    raise ValueError("At least one of `do_train` or `do_predict` must be True.")
+
+  if FLAGS.do_train:
+    if not FLAGS.train_file:
+      raise ValueError(
+          "If `do_train` is True, then `train_file` must be specified.")
+  if FLAGS.do_predict:
+    if not FLAGS.predict_file:
+      raise ValueError(
+          "If `do_predict` is True, then `predict_file` must be specified.")
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+  if FLAGS.max_seq_length <= FLAGS.max_query_length + 3:
+    raise ValueError(
+        "The max_seq_length (%d) must be greater than max_query_length "
+        "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length))
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  validate_flags_or_throw(bert_config)
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  if FLAGS.do_train:
+    train_examples = read_squad_examples(
+        input_file=FLAGS.train_file, is_training=True)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+    # Pre-shuffle the input to avoid having to make a very large shuffle
+    # buffer in the `input_fn`.
+    rng = random.Random(12345)
+    rng.shuffle(train_examples)
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    # We write to a temporary file to avoid storing very large constant tensors
+    # in memory.
+    train_writer = FeatureWriter(
+        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
+        is_training=True)
+    convert_examples_to_features(
+        examples=train_examples,
+        tokenizer=tokenizer,
+        max_seq_length=FLAGS.max_seq_length,
+        doc_stride=FLAGS.doc_stride,
+        max_query_length=FLAGS.max_query_length,
+        is_training=True,
+        output_fn=train_writer.process_feature)
+    train_writer.close()
+
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num orig examples = %d", len(train_examples))
+    tf.logging.info("  Num split examples = %d", train_writer.num_features)
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    del train_examples
+
+    train_input_fn = input_fn_builder(
+        input_file=train_writer.filename,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+  if FLAGS.do_predict:
+    eval_examples = read_squad_examples(
+        input_file=FLAGS.predict_file, is_training=False)
+
+    eval_writer = FeatureWriter(
+        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
+        is_training=False)
+    eval_features = []
+
+    def append_feature(feature):
+      eval_features.append(feature)
+      eval_writer.process_feature(feature)
+
+    convert_examples_to_features(
+        examples=eval_examples,
+        tokenizer=tokenizer,
+        max_seq_length=FLAGS.max_seq_length,
+        doc_stride=FLAGS.doc_stride,
+        max_query_length=FLAGS.max_query_length,
+        is_training=False,
+        output_fn=append_feature)
+    eval_writer.close()
+
+    tf.logging.info("***** Running predictions *****")
+    tf.logging.info("  Num orig examples = %d", len(eval_examples))
+    tf.logging.info("  Num split examples = %d", len(eval_features))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    all_results = []
+
+    predict_input_fn = input_fn_builder(
+        input_file=eval_writer.filename,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=False)
+
+    # If running eval on the TPU, you will need to specify the number of
+    # steps.
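+    # Note: with `drop_remainder=False` on CPU/GPU the final partial batch is
+    # still evaluated, whereas TPU prediction runs over fixed-size batches
+    # (hence the comment above). Each result below carries the "unique_ids"
+    # feature written into the TF record, which lets `write_predictions` join
+    # the raw logits back to the corresponding entries in `eval_features`.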
+    all_results = []
+    for result in estimator.predict(
+        predict_input_fn, yield_single_examples=True):
+      if len(all_results) % 1000 == 0:
+        tf.logging.info("Processing example: %d" % (len(all_results)))
+      unique_id = int(result["unique_ids"])
+      start_logits = [float(x) for x in result["start_logits"].flat]
+      end_logits = [float(x) for x in result["end_logits"].flat]
+      all_results.append(
+          RawResult(
+              unique_id=unique_id,
+              start_logits=start_logits,
+              end_logits=end_logits))
+
+    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
+    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
+    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")
+
+    write_predictions(eval_examples, eval_features, all_results,
+                      FLAGS.n_best_size, FLAGS.max_answer_length,
+                      FLAGS.do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/bert/sample_text.txt b/baselines/models/bert/sample_text.txt
new file mode 100644
index 0000000..a428120
--- /dev/null
+++ b/baselines/models/bert/sample_text.txt
@@ -0,0 +1,33 @@
+This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
+Text should be one-sentence-per-line, with empty lines between documents.
+This sample text is public domain and was randomly selected from Project Gutenberg.
+
+The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
+Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
+Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
+"Cass" Beard had risen early that morning, but not with a view to discovery.
+A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
+The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
+This was nearly opposite.
+Mr. Cassius crossed the highway, and stopped suddenly.
+Something glittered in the nearest red pool before him.
+Gold, surely!
+But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
+Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
+Like most of his fellow gold-seekers, Cass was superstitious.
+
+The fountain of classic wisdom, Hypatia herself.
+As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
+From my youth I felt in me a soul above the matter-entangled herd.
+She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. +His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. +At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/baselines/models/bert/tf_metrics.py b/baselines/models/bert/tf_metrics.py new file mode 100644 index 0000000..7ccacd4 --- /dev/null +++ b/baselines/models/bert/tf_metrics.py @@ -0,0 +1,215 @@ +""" +Multiclass +from: +https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py + +""" + +__author__ = "Guillaume Genthial" + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix + + +def precision(labels, predictions, num_classes, pos_indices=None, + weights=None, average='micro'): + """Multi-class precision metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. 
+ 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + pr, _, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + op, _, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (pr, op) + + +def recall(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + """Multi-class recall metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, re, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + _, op, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (re, op) + + +def f1(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + return fbeta(labels, predictions, num_classes, pos_indices, weights, + average) + + +def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro', beta=1): + """Multi-class fbeta metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/bert/tokenization.py b/baselines/models/bert/tokenization.py new file mode 100644 index 0000000..0ee1359 --- /dev/null +++ b/baselines/models/bert/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python 2 and Python 3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
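+  # For illustration: under Python 3 both printable_text(b"caf\xc3\xa9") and
+  # printable_text(u"caf\u00e9") return the str "café", while under Python 2
+  # a `unicode` input is encoded back to a utf-8 byte string.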
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia).
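+    # e.g. u"ah博推zz" is padded to u"ah 博  推 zz", so the tokenizer yields
+    # [u"ah", u"博", u"推", u"zz"] (see `test_chinese` in tokenization_test.py).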
+    text = self._tokenize_chinese_chars(text)
+
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ["".join(x) for x in output]
+
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and are handled
+    # like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenization."""
+
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens.
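+
+    As a walk-through of the greedy loop: for "unaffable" the matcher first
+    tries the whole string, then shrinks `end` until "un" is found in the
+    vocab; it then restarts from `start = 2` with the "##" prefix, matching
+    "##aff" and then "##able". If no prefix of the remaining characters is in
+    the vocab, the entire token maps to `unk_token`.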
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models/bert/tokenization_test.py b/baselines/models/bert/tokenization_test.py new file mode 100644 index 0000000..0afaedd --- /dev/null +++ b/baselines/models/bert/tokenization_test.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile +import tokenization +import six +import tensorflow as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + vocab_writer.write("".join( + [x + "\n" for x in vocab_tokens]).encode("utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + 
self.assertTrue(tokenization._is_punctuation(u".")) + + self.assertFalse(tokenization._is_punctuation(u"A")) + self.assertFalse(tokenization._is_punctuation(u" ")) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/bert/tpu/run_classifier_inews.sh b/baselines/models/bert/tpu/run_classifier_inews.sh new file mode 100755 index 0000000..e18ba63 --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_inews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="inews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/al/bert-base/chinese_L-12_H-768_A-12/ +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/bert/tpu/run_classifier_jdcomment.sh b/baselines/models/bert/tpu/run_classifier_jdcomment.sh new file mode 100755 index 0000000..9aa0866 --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_jdcomment.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="jdcomment" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-base/chinese_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME} +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME +echo $DATA_DIR +python3 $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470 diff --git a/baselines/models/bert/tpu/run_classifier_lcqmc.sh b/baselines/models/bert/tpu/run_classifier_lcqmc.sh new file mode 100755 index 0000000..0edf35e --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_lcqmc.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="lcqmc" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-base/chinese_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + 
--init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.20.0.2:8470 diff --git a/baselines/models/bert/tpu/run_classifier_thucnews.sh b/baselines/models/bert/tpu/run_classifier_thucnews.sh new file mode 100755 index 0000000..633665d --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_thucnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="thucnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/al/bert-base/chinese_L-12_H-768_A-12/ +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/bert/tpu/run_classifier_tnews.sh b/baselines/models/bert/tpu/run_classifier_tnews.sh new file mode 100755 index 0000000..d1c68c5 --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_tnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-base/chinese_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1 +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://192.168.0.2:8470 diff --git a/baselines/models/bert/tpu/run_classifier_xnli.sh b/baselines/models/bert/tpu/run_classifier_xnli.sh new file mode 100755 index 0000000..0146683 --- /dev/null +++ b/baselines/models/bert/tpu/run_classifier_xnli.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/al/bert-base/chinese_L-12_H-768_A-12/ +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-base/chinese_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + 
--init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/bert_wwm_ext/.gitignore b/baselines/models/bert_wwm_ext/.gitignore
new file mode 100644
index 0000000..df9efad
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/.gitignore
@@ -0,0 +1,116 @@
+# Initially taken from Github's Python gitignore file
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/baselines/models/bert_wwm_ext/CONTRIBUTING.md b/baselines/models/bert_wwm_ext/CONTRIBUTING.md
new file mode 100644
index 0000000..124b4b3
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/baselines/models/bert_wwm_ext/LICENSE b/baselines/models/bert_wwm_ext/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/baselines/models/bert_wwm_ext/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/baselines/models/bert_wwm_ext/__init__.py b/baselines/models/bert_wwm_ext/__init__.py new file mode 100644 index 0000000..effb57b --- /dev/null +++ b/baselines/models/bert_wwm_ext/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/baselines/models/bert_wwm_ext/conlleval.py b/baselines/models/bert_wwm_ext/conlleval.py new file mode 100644 index 0000000..8a8a75d --- /dev/null +++ b/baselines/models/bert_wwm_ext/conlleval.py @@ -0,0 +1,300 @@ +# Python version of the evaluation script from CoNLL'00- +# Originates from: https://github.com/spyysalo/conlleval.py + + +# Intentional differences: +# - accept any space as delimiter by default +# - optional file argument (default STDIN) +# - option to set boundary (-b argument) +# - LaTeX output (-l argument) not supported +# - raw tags (-r argument) not supported + +# add function :evaluate(predicted_label, ori_label): which will not read from file + +import sys +import re +import codecs +from collections import defaultdict, namedtuple + +ANY_SPACE = '' + + +class FormatError(Exception): + pass + +Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') + + +class EvalCounts(object): + def __init__(self): + self.correct_chunk = 0 # number of correctly identified chunks + self.correct_tags = 0 # number of correct chunk tags + self.found_correct = 0 # number of chunks in corpus + self.found_guessed = 0 # number of identified chunks + self.token_counter = 0 # token counter (ignores sentence breaks) + + # counts by type + self.t_correct_chunk = defaultdict(int) + self.t_found_correct = defaultdict(int) + self.t_found_guessed = defaultdict(int) + + +def parse_args(argv): + import argparse + parser = argparse.ArgumentParser( + description='evaluate tagging results using CoNLL criteria', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + arg = parser.add_argument + arg('-b', '--boundary', metavar='STR', default='-X-', + help='sentence boundary') + arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE, + help='character delimiting items in input') + arg('-o', '--otag', metavar='CHAR', default='O', + help='alternative outside tag') + arg('file', nargs='?', default=None) + return parser.parse_args(argv) + + +def parse_tag(t): + m = re.match(r'^([^-]*)-(.*)$', t) + return m.groups() if m else (t, '') + + +def evaluate(iterable, options=None): + if options is None: + options = parse_args([]) # use defaults + + counts = EvalCounts() + num_features = None # number of features per line + in_correct = False # currently 
processed chunks is correct until now + last_correct = 'O' # previous chunk tag in corpus + last_correct_type = '' # type of previously identified chunk tag + last_guessed = 'O' # previously identified chunk tag + last_guessed_type = '' # type of previous chunk tag in corpus + + for line in iterable: + line = line.rstrip('\r\n') + + if options.delimiter == ANY_SPACE: + features = line.split() + else: + features = line.split(options.delimiter) + + if num_features is None: + num_features = len(features) + elif num_features != len(features) and len(features) != 0: + raise FormatError('unexpected number of features: %d (%d)' % + (len(features), num_features)) + + if len(features) == 0 or features[0] == options.boundary: + features = [options.boundary, 'O', 'O'] + if len(features) < 3: + raise FormatError('unexpected number of features in line %s' % line) + + guessed, guessed_type = parse_tag(features.pop()) + correct, correct_type = parse_tag(features.pop()) + first_item = features.pop(0) + + if first_item == options.boundary: + guessed = 'O' + + end_correct = end_of_chunk(last_correct, correct, + last_correct_type, correct_type) + end_guessed = end_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + start_correct = start_of_chunk(last_correct, correct, + last_correct_type, correct_type) + start_guessed = start_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + + if in_correct: + if (end_correct and end_guessed and + last_guessed_type == last_correct_type): + in_correct = False + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + elif (end_correct != end_guessed or guessed_type != correct_type): + in_correct = False + + if start_correct and start_guessed and guessed_type == correct_type: + in_correct = True + + if start_correct: + counts.found_correct += 1 + counts.t_found_correct[correct_type] += 1 + if start_guessed: + counts.found_guessed += 1 + counts.t_found_guessed[guessed_type] += 1 + if first_item != options.boundary: + if correct == guessed and guessed_type == correct_type: + counts.correct_tags += 1 + counts.token_counter += 1 + + last_guessed = guessed + last_correct = correct + last_guessed_type = guessed_type + last_correct_type = correct_type + + if in_correct: + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % 
(100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' 
and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/bert_wwm_ext/create_pretraining_data.py b/baselines/models/bert_wwm_ext/create_pretraining_data.py new file mode 100644 index 0000000..5340d96 --- /dev/null +++ b/baselines/models/bert_wwm_ext/create_pretraining_data.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + 
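For orientation, a record like the `tf_example` just built can be decoded back with a feature spec that mirrors this layout. A minimal sketch (the function name is illustrative, and the two length arguments must match the values used at write time):

    import tensorflow as tf

    def decode_pretraining_record(serialized_record,
                                  max_seq_length=128,
                                  max_predictions_per_seq=20):
      # Spec mirrors the OrderedDict of features assembled above.
      name_to_features = {
          "input_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
          "input_mask": tf.FixedLenFeature([max_seq_length], tf.int64),
          "segment_ids": tf.FixedLenFeature([max_seq_length], tf.int64),
          "masked_lm_positions": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
          "masked_lm_ids": tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
          "masked_lm_weights": tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
          "next_sentence_labels": tf.FixedLenFeature([1], tf.int64),
      }
      return tf.parse_single_example(serialized_record, name_to_features)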
writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. 
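Concretely, with the default flags above (max_seq_length=128, hence max_num_tokens=125, and short_seq_prob=0.1), the two branches below mean roughly 90% of instances target the full 125 tokens, while about 10% draw a shorter target uniformly from [2, 125]:

    # Illustrative, with rng = random.Random(FLAGS.random_seed):
    #   rng.random() >= 0.1  ->  target_seq_length = 125
    #   rng.random() <  0.1  ->  target_seq_length = rng.randint(2, 125)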
+ target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. + num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + # Whole Word Masking means that if we mask all of the wordpieces + # corresponding to an original word. 
When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
+    if len(masked_lms) + len(index_set) > num_to_predict:
+      continue
+    is_any_index_covered = False
+    for index in index_set:
+      if index in covered_indexes:
+        is_any_index_covered = True
+        break
+    if is_any_index_covered:
+      continue
+    for index in index_set:
+      covered_indexes.add(index)
+
+      masked_token = None
+      # 80% of the time, replace with [MASK]
+      if rng.random() < 0.8:
+        masked_token = "[MASK]"
+      else:
+        # 10% of the time, keep original
+        if rng.random() < 0.5:
+          masked_token = tokens[index]
+        # 10% of the time, replace with random word
+        else:
+          masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)]
+
+      output_tokens[index] = masked_token
+
+      masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
+  assert len(masked_lms) <= num_to_predict
+  masked_lms = sorted(masked_lms, key=lambda x: x.index)
+
+  masked_lm_positions = []
+  masked_lm_labels = []
+  for p in masked_lms:
+    masked_lm_positions.append(p.index)
+    masked_lm_labels.append(p.label)
+
+  return (output_tokens, masked_lm_positions, masked_lm_labels)
+
+
+def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng):
+  """Truncates a pair of sequences to a maximum sequence length."""
+  while True:
+    total_length = len(tokens_a) + len(tokens_b)
+    if total_length <= max_num_tokens:
+      break
+
+    trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+    assert len(trunc_tokens) >= 1
+
+    # We want to sometimes truncate from the front and sometimes from the
+    # back to add more randomness and avoid biases.
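For example (illustrative lengths): with len(tokens_a)=70, len(tokens_b)=60 and max_num_tokens=125, five tokens are removed one at a time, each from whichever list is currently longer (here all five come from tokens_a), randomly from its front or back:

    a, b = list(range(70)), list(range(60))
    truncate_seq_pair(a, b, 125, random.Random(0))
    assert len(a) + len(b) == 125  # a shrinks to 65, b stays at 60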
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/baselines/models/bert_wwm_ext/extract_features.py b/baselines/models/bert_wwm_ext/extract_features.py new file mode 100644 index 0000000..60e3830 --- /dev/null +++ b/baselines/models/bert_wwm_ext/extract_features.py @@ -0,0 +1,419 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract pre-computed feature vectors from BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import codecs +import collections +import json +import re + +import modeling +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "") + +flags.DEFINE_string("output_file", None, "") + +flags.DEFINE_string("layers", "-1,-2,-3,-4", "") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("master", None, + "If using a TPU, the address of the master.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_one_hot_embeddings", False, + "If True, tf.one_hot will be used for embedding lookups, otherwise " + "tf.nn.embedding_lookup will be used. On TPUs, this should be True " + "since it is much faster.") + + +class InputExample(object): + + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids + + +def input_fn_builder(features, seq_length): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] + + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
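The TFRecord-based route recommended in the comment would look roughly like this (a sketch; the filename and the `name_to_features` parsing spec are illustrative, not part of this script):

    d = tf.data.TFRecordDataset("features.tfrecord")
    d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
    d = d.batch(batch_size)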
+ d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) + + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d + + return input_fn + + +def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] + + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, + initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + all_layers = model.get_all_encoder_layers() + + predictions = { + "unique_id": unique_ids, + } + + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +def convert_examples_to_features(examples, seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. 
The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
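A short trace of the loop below (illustrative lengths): with max_length=13, len(tokens_a)=10 and len(tokens_b)=8, tokens are popped from whichever list is currently longer:

    # (10, 8) -> (9, 8) -> (8, 8) -> (8, 7) -> (7, 7) -> (7, 6); 7 + 6 == 13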
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def read_examples(input_file): + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + examples = read_examples(FLAGS.input_file) + + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
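As context for `read_examples` above: each input line is either a single sentence or a sentence pair separated by " ||| ", matching the regex in that function. Sample lines (invented for illustration):

    # who was jim henson ?
    # the man went to the store . ||| he bought a gallon of milk .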
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + predict_batch_size=FLAGS.batch_size) + + input_fn = input_fn_builder( + features=features, seq_length=FLAGS.max_seq_length) + + with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, + "w")) as writer: + for result in estimator.predict(input_fn, yield_single_examples=True): + unique_id = int(result["unique_id"]) + feature = unique_id_to_feature[unique_id] + output_json = collections.OrderedDict() + output_json["linex_index"] = unique_id + all_features = [] + for (i, token) in enumerate(feature.tokens): + all_layers = [] + for (j, layer_index) in enumerate(layer_indexes): + layer_output = result["layer_output_%d" % j] + layers = collections.OrderedDict() + layers["index"] = layer_index + layers["values"] = [ + round(float(x), 6) for x in layer_output[i:(i + 1)].flat + ] + all_layers.append(layers) + features = collections.OrderedDict() + features["token"] = token + features["layers"] = all_layers + all_features.append(features) + output_json["features"] = all_features + writer.write(json.dumps(output_json) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("init_checkpoint") + flags.mark_flag_as_required("output_file") + tf.app.run() diff --git a/baselines/models/bert_wwm_ext/modeling.py b/baselines/models/bert_wwm_ext/modeling.py new file mode 100644 index 0000000..fed5259 --- /dev/null +++ b/baselines/models/bert_wwm_ext/modeling.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. 
+ hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. 
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. 
+ """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
+ initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
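Numerically (illustrative values): for one query over four keys where the last key is padding, mask [1, 1, 1, 0] gives adder [0, 0, 0, -10000.0], so the padded position ends up with essentially zero attention probability after the softmax below:

    # scores         = [2.0, 1.0, 0.5,     3.0]
    # scores + adder = [2.0, 1.0, 0.5, -9997.0]
    # softmax        ~ [0.63, 0.23, 0.14, 0.00]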
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, an exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if name is None:
+    name = tensor.name
+
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def reshape_to_matrix(input_tensor):
+  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+  ndims = input_tensor.shape.ndims
+  if ndims < 2:
+    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                     (input_tensor.shape))
+  if ndims == 2:
+    return input_tensor
+
+  width = input_tensor.shape[-1]
+  output_tensor = tf.reshape(input_tensor, [-1, width])
+  return output_tensor
+
+
+def reshape_from_matrix(output_tensor, orig_shape_list):
+  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+  if len(orig_shape_list) == 2:
+    return output_tensor
+
+  output_shape = get_shape_list(output_tensor)
+
+  orig_dims = orig_shape_list[0:-1]
+  width = output_shape[-1]
+
+  return tf.reshape(output_tensor, orig_dims + [width])
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  if name is None:
+    name = tensor.name
+
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    scope_name = tf.get_variable_scope().name
+    raise ValueError(
+        "For the tensor `%s` in scope `%s`, the actual rank "
+        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
diff --git a/baselines/models/bert_wwm_ext/modeling_test.py b/baselines/models/bert_wwm_ext/modeling_test.py
new file mode 100644
index 0000000..817ad2d
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/modeling_test.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +import modeling +import six +import tensorflow as tf + + +class BertModelTest(tf.test.TestCase): + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) + + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def run_tester(self, tester): + with 
self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) + + self.assert_all_tensors_reachable(sess, [init_op, ops]) + + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph + + ignore_strings = [ + "^.*/assert_less_equal/.*$", + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] + + ignore_regexes = [re.compile(x) for x in ignore_strings] + + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(x.name) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable + + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) + + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) + + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) + + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) + + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True + + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) + + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) + + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) + + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops + + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif 
isinstance(item, tuple):
+      output.extend(list(item))
+    elif isinstance(item, dict):
+      for (_, v) in six.iteritems(item):
+        output.append(v)
+    else:
+      return [item]
+
+    flat_output = []
+    for x in output:
+      flat_output.extend(cls.flatten_recursive(x))
+    return flat_output
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/bert_wwm_ext/multilingual.md b/baselines/models/bert_wwm_ext/multilingual.md
new file mode 100644
index 0000000..3b38379
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/multilingual.md
@@ -0,0 +1,303 @@
+## Models
+
+There are two multilingual models currently available. We do not plan to release
+more single-language models, but we may release `BERT-Large` versions of these
+two in the future:
+
+*   **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
+    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
+    Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
+    parameters
+
+**The `Multilingual Cased (New)` model also fixes normalization issues in many
+languages, so it is recommended in languages with non-Latin alphabets (and is
+often better for most languages with Latin alphabets). When using this model,
+make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
+scripts.**
+
+See the [list of languages](#list-of-languages) that the Multilingual model
+supports. The Multilingual model does include Chinese (and English), but if your
+fine-tuning data is Chinese-only, then the Chinese model will likely produce
+better results.
+
+## Results
+
+To evaluate these systems, we use the
+[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a version of
+[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the dev and
+test sets have been translated (by humans) into 15 languages. Note that
+the training set was *machine* translated (we used the translations provided by
+XNLI, not Google NMT). For clarity, we only report on 6 languages below:
+
+
+| System                            | English  | Chinese  | Spanish  | German   | Arabic   | Urdu     |
+| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
+| XNLI Baseline - Translate Train   | 73.7     | 67.0     | 68.8     | 66.5     | 65.8     | 56.6     |
+| XNLI Baseline - Translate Test    | 73.7     | 68.3     | 70.7     | 68.7     | 66.8     | 59.3     |
+| BERT - Translate Train Cased      | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6     |
+| BERT - Translate Train Uncased    | 81.4     | 74.2     | 77.3     | 75.2     | 70.5     | 61.7     |
+| BERT - Translate Test Uncased     | 81.4     | 70.1     | 74.9     | 74.4     | 70.4     | **62.1** |
+| BERT - Zero Shot Uncased          | 81.4     | 63.8     | 74.3     | 70.5     | 62.1     | 58.3     |
+
+
+The first two rows are baselines from the XNLI paper and the last four rows are
+our results with BERT.
+
+**Translate Train** means that the MultiNLI training set was machine translated
+from English into the foreign language. So training and evaluation were both
+done in the foreign language. Unfortunately, training was done on
+machine-translated data, so it is impossible to quantify how much of the lower
+accuracy (compared to English) is due to the quality of the machine translation
+vs. the quality of the pre-trained model.
+
+**Translate Test** means that the XNLI test set was machine translated from the
+foreign language into English. So training and evaluation were both done on
+English. However, test evaluation was done on machine-translated English, so the
+accuracy depends on the quality of the machine translation system.
+
+**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
+MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
+machine translation was not involved at all in either the pre-training or
+fine-tuning.
+
+Note that the English result is worse than the 84.2 MultiNLI baseline because
+this training used Multilingual BERT rather than English-only BERT. This implies
+that for high-resource languages, the Multilingual model is somewhat worse than
+a single-language model. However, it is not feasible for us to train and
+maintain dozens of single-language models. Therefore, if your goal is to maximize
+performance with a language other than English or Chinese, you might find it
+beneficial to run pre-training for additional steps starting from our
+Multilingual model on data from your language of interest.
+
+Here is a comparison of training Chinese models with the Multilingual
+`BERT-Base` and Chinese-only `BERT-Base`:
+
+System                  | Chinese
+----------------------- | -------
+XNLI Baseline           | 67.0
+BERT Multilingual Model | 74.2
+BERT Chinese-only Model | 77.2
+
+Similar to English, the single-language model does 3% better than the
+Multilingual model.
+
+## Fine-tuning Example
+
+The multilingual model does **not** require any special consideration or API
+changes. We did update the implementation of `BasicTokenizer` in
+`tokenization.py` to support Chinese character tokenization, so please update if
+you forked it. However, we did not change the tokenization API.
+
+To test the new models, we did modify `run_classifier.py` to add support for the
+[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language
+version of MultiNLI where the dev/test sets have been human-translated, and the
+training set has been machine-translated.
+
+To run the fine-tuning code, please download the
+[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
+[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
+and then unpack both .zip files into some directory `$XNLI_DIR`.
+
+To run fine-tuning on XNLI, note that the language is hard-coded into
+`run_classifier.py` (Chinese by default), so please modify `XnliProcessor` if
+you want to run on another language.
+
+This is a large dataset, so training will take a few hours on a GPU (or about
+30 minutes on a Cloud TPU). To run an experiment quickly for debugging, just
+set `num_train_epochs` to a small value like `0.1`.
+
+```shell
+export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
+export XNLI_DIR=/path/to/xnli
+
+python run_classifier.py \
+  --task_name=XNLI \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$XNLI_DIR \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=5e-5 \
+  --num_train_epochs=2.0 \
+  --output_dir=/tmp/xnli_output/
+```
+
+With the Chinese-only model, the results should look something like this:
+
+```
+ ***** Eval results *****
+eval_accuracy = 0.774116
+eval_loss = 0.83554
+global_step = 24543
+loss = 0.74603
+```
+
+## Details
+
+### Data Source and Sampling
+
+The languages chosen were the
+[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
+The entire Wikipedia dump for each language (excluding user and talk pages) was
+taken as the training data.
+
+However, the size of the Wikipedia for a given language varies greatly, and
+therefore low-resource languages may be "under-represented" in terms of the
+neural network model (under the assumption that languages are "competing" for
+limited model capacity to some extent). At the same time, we also don't want
+to overfit the model by performing thousands of epochs over a tiny Wikipedia
+for a particular language.
+
+To balance these two factors, we performed exponentially smoothed weighting of
+the data during pre-training data creation (and WordPiece vocab creation). In
+other words, let's say that the probability of a language is *P(L)*, e.g.,
+*P(English) = 0.21* means that after concatenating all of the Wikipedias
+together, 21% of our data is English. We exponentiate each probability by some
+factor *S* and then re-normalize, and sample from that distribution. In our case
+we use *S=0.7*. So, high-resource languages like English will be under-sampled,
+and low-resource languages like Icelandic will be over-sampled. E.g., in the
+original distribution English would be sampled 1000x more than Icelandic, but
+after smoothing it's only sampled 100x more.
+
+### Tokenization
+
+For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are
+weighted the same way as the data, so low-resource languages are upweighted by
+some factor. We intentionally do *not* use any marker to denote the input
+language (so that zero-shot training can work).
+
+Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
+characters, we add spaces around every character in the
+[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
+before applying WordPiece. This means that Chinese is effectively
+character-tokenized. Note that the CJK Unicode block only includes
+Chinese-origin characters and does *not* include Hangul Korean or
+Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
+all other languages.
+
+For all other languages, we apply the
+[same recipe as English](https://github.com/google-research/bert#tokenization):
+(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
+tokenization. We understand that accent markers have substantial meaning in some
+languages, but felt that the benefits of reducing the effective vocabulary make
+up for this.
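+As a rough illustration, a minimal sketch of the lower-casing and
+accent-removal step might look like this (illustrative only; the actual
+implementation lives in this repository's `tokenization.py`):
+
+```python
+import unicodedata
+
+def lower_and_strip_accents(text):
+  """Sketch of the lower-casing + accent-removal recipe (not the real code)."""
+  text = unicodedata.normalize("NFD", text.lower())
+  # Drop combining marks (Unicode category "Mn"), e.g. "résumé" -> "resume".
+  return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
+
+print(lower_and_strip_accents(u"Über naïve Résumé"))  # -> uber naive resume
+```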
+Generally, the strong contextual models of BERT should make up for any
+ambiguity introduced by stripping accent markers.
+
+### List of Languages
+
+The multilingual model supports the following languages. These languages were
+chosen because they are the top 100 languages with the largest Wikipedias:
+
+*   Afrikaans
+*   Albanian
+*   Arabic
+*   Aragonese
+*   Armenian
+*   Asturian
+*   Azerbaijani
+*   Bashkir
+*   Basque
+*   Bavarian
+*   Belarusian
+*   Bengali
+*   Bishnupriya Manipuri
+*   Bosnian
+*   Breton
+*   Bulgarian
+*   Burmese
+*   Catalan
+*   Cebuano
+*   Chechen
+*   Chinese (Simplified)
+*   Chinese (Traditional)
+*   Chuvash
+*   Croatian
+*   Czech
+*   Danish
+*   Dutch
+*   English
+*   Estonian
+*   Finnish
+*   French
+*   Galician
+*   Georgian
+*   German
+*   Greek
+*   Gujarati
+*   Haitian
+*   Hebrew
+*   Hindi
+*   Hungarian
+*   Icelandic
+*   Ido
+*   Indonesian
+*   Irish
+*   Italian
+*   Japanese
+*   Javanese
+*   Kannada
+*   Kazakh
+*   Kirghiz
+*   Korean
+*   Latin
+*   Latvian
+*   Lithuanian
+*   Lombard
+*   Low Saxon
+*   Luxembourgish
+*   Macedonian
+*   Malagasy
+*   Malay
+*   Malayalam
+*   Marathi
+*   Minangkabau
+*   Nepali
+*   Newar
+*   Norwegian (Bokmal)
+*   Norwegian (Nynorsk)
+*   Occitan
+*   Persian (Farsi)
+*   Piedmontese
+*   Polish
+*   Portuguese
+*   Punjabi
+*   Romanian
+*   Russian
+*   Scots
+*   Serbian
+*   Serbo-Croatian
+*   Sicilian
+*   Slovak
+*   Slovenian
+*   South Azerbaijani
+*   Spanish
+*   Sundanese
+*   Swahili
+*   Swedish
+*   Tagalog
+*   Tajik
+*   Tamil
+*   Tatar
+*   Telugu
+*   Turkish
+*   Ukrainian
+*   Urdu
+*   Uzbek
+*   Vietnamese
+*   Volapük
+*   Waray-Waray
+*   Welsh
+*   West Frisian
+*   Western Punjabi
+*   Yoruba
+
+The **Multilingual Cased (New)** release additionally contains **Thai** and
+**Mongolian**, which were not included in the original release.
diff --git a/baselines/models/bert_wwm_ext/optimization.py b/baselines/models/bert_wwm_ext/optimization.py
new file mode 100644
index 0000000..d33dabd
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/optimization.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions and classes related to optimization (weight updates)."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import re
+import tensorflow as tf
+
+
+def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu):
+  """Creates an optimizer training op."""
+  global_step = tf.train.get_or_create_global_step()
+
+  learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32)
+
+  # Implements linear decay of the learning rate.
+  learning_rate = tf.train.polynomial_decay(
+      learning_rate,
+      global_step,
+      num_train_steps,
+      end_learning_rate=0.0,
+      power=1.0,
+      cycle=False)
+
+  # Implements linear warmup. I.e., if global_step < num_warmup_steps, the
+  # learning rate will be `global_step/num_warmup_steps * init_lr`.
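+  # For example, with init_lr=5e-5 and num_warmup_steps=1000, step 100 trains
+  # with a learning rate of (100/1000) * 5e-5 = 5e-6; from step 1000 onward the
+  # polynomial decay schedule above takes over.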
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
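+      # Concretely, the full update applied below is
+      #   param <- param - learning_rate * (update + weight_decay_rate * param)
+      # i.e. decoupled weight decay in the style of AdamW
+      # (Loshchilov & Hutter, 2017).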
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/bert_wwm_ext/optimization_test.py b/baselines/models/bert_wwm_ext/optimization_test.py new file mode 100644 index 0000000..4f2dcf1 --- /dev/null +++ b/baselines/models/bert_wwm_ext/optimization_test.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import optimization +import tensorflow as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/bert_wwm_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb b/baselines/models/bert_wwm_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb new file mode 100644 index 0000000..466857f --- /dev/null +++ b/baselines/models/bert_wwm_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb @@ -0,0 +1,1231 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "j0a4mTk9o1Qg", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Copyright 2019 Google Inc.\n", + "\n", + "# 
Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+        "# you may not use this file except in compliance with the License.\n",
+        "# You may obtain a copy of the License at\n",
+        "\n",
+        "# http://www.apache.org/licenses/LICENSE-2.0\n",
+        "\n",
+        "# Unless required by applicable law or agreed to in writing, software\n",
+        "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+        "# See the License for the specific language governing permissions and\n",
+        "# limitations under the License."
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "dCpvgG0vwXAZ",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "#Predicting Movie Review Sentiment with BERT on TF Hub"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "xiYrZKaHwV81",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n",
+        "\n",
+        "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMo and GloVe. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n",
+        "\n",
+        "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "hsZvic2YxnTz",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "from sklearn.model_selection import train_test_split\n",
+        "import pandas as pd\n",
+        "import tensorflow as tf\n",
+        "import tensorflow_hub as hub\n",
+        "from datetime import datetime"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "cp5wfXDx5SPH",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "In addition to the standard libraries we imported above, we'll need to install BERT's python package."
+ ] + }, + { + "metadata": { + "id": "jviywGyWyKsA", + "colab_type": "code", + "outputId": "166f3005-d219-404f-b201-2a0b75480360", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + } + }, + "cell_type": "code", + "source": [ + "!pip install bert-tensorflow" + ], + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.6/dist-packages (1.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.11.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "hhbGEfwgdEtw", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import bert\n", + "from bert import run_classifier\n", + "from bert import optimization\n", + "from bert import tokenization" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "KVB3eOcjxxm1", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n", + "\n", + "Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n", + "\n", + "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)." + ] + }, + { + "metadata": { + "id": "US_EAnICvP7f", + "colab_type": "code", + "outputId": "7780a032-31d4-4794-e6aa-664a5d2ae7dd", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "# Set the output directory for saving model file\n", + "# Optionally, set a GCP bucket location\n", + "\n", + "OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:\"string\"}\n", + "#@markdown Whether or not to clear/delete the directory and create a new one\n", + "DO_DELETE = False #@param {type:\"boolean\"}\n", + "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n", + "USE_BUCKET = True #@param {type:\"boolean\"}\n", + "BUCKET = 'BUCKET_NAME' #@param {type:\"string\"}\n", + "\n", + "if USE_BUCKET:\n", + " OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + "\n", + "if DO_DELETE:\n", + " try:\n", + " tf.gfile.DeleteRecursively(OUTPUT_DIR)\n", + " except:\n", + " # Doesn't matter if the directory didn't exist\n", + " pass\n", + "tf.gfile.MakeDirs(OUTPUT_DIR)\n", + "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "***** Model output directory: gs://bert-tfhub/aclImdb_v1 *****\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "pmFYvkylMwXn", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Data" + ] + }, + { + "metadata": { + "id": "MC_w8SRqN0fr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "First, let's download the dataset, hosted by Stanford. 
The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)." + ] + }, + { + "metadata": { + "id": "fom_ff20gyy6", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from tensorflow import keras\n", + "import os\n", + "import re\n", + "\n", + "# Load all files from a directory in a DataFrame.\n", + "def load_directory_data(directory):\n", + " data = {}\n", + " data[\"sentence\"] = []\n", + " data[\"sentiment\"] = []\n", + " for file_path in os.listdir(directory):\n", + " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", + " data[\"sentence\"].append(f.read())\n", + " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", + " return pd.DataFrame.from_dict(data)\n", + "\n", + "# Merge positive and negative examples, add a polarity column and shuffle.\n", + "def load_dataset(directory):\n", + " pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n", + " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", + " pos_df[\"polarity\"] = 1\n", + " neg_df[\"polarity\"] = 0\n", + " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", + "\n", + "# Download and process the dataset files.\n", + "def download_and_load_datasets(force_download=False):\n", + " dataset = tf.keras.utils.get_file(\n", + " fname=\"aclImdb.tar.gz\", \n", + " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", + " extract=True)\n", + " \n", + " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"train\"))\n", + " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"test\"))\n", + " \n", + " return train_df, test_df\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "2abfwdn-g135", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "train, test = download_and_load_datasets()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "XA8WHJgzhIZf", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "To keep training fast, we'll take a sample of 5000 train and test examples, respectively." 
+      ]
+    },
+    {
+      "metadata": {
+        "id": "lw_F488eixTV",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "train = train.sample(5000)\n",
+        "test = test.sample(5000)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "prRQM8pDi8xI",
+        "colab_type": "code",
+        "outputId": "34445cb8-2be0-4379-fdbc-7794091f6049",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "train.columns"
+      ],
+      "execution_count": 44,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "Index(['sentence', 'sentiment', 'polarity'], dtype='object')"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 44
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sfRnHSz3iSXz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively)."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IuMOGwFui4it",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "DATA_COLUMN = 'sentence'\n",
+        "LABEL_COLUMN = 'polarity'\n",
+        "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
+        "label_list = [0, 1]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "V399W0rqNJ-Z",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "#Data Preprocessing\n",
+        "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n",
+        "\n",
+        "- `text_a` is the text we want to classify, which in this case is the 'sentence' column in our DataFrame. \n",
+        "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n",
+        "- `label` is the label for our example, i.e. True, False"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "p9gEt5SmM6i6",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
+        "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
+        "\n",
+        "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "SCZWZtKxObjh",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n",
+        "\n",
+        "\n",
+        "1. Lowercase our text (if we're using a BERT lowercase model)\n",
+        "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
+        "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
+        "4. Map our words to indexes using a vocab file that BERT provides\n",
+        "5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n",
+        "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n",
+        "\n",
+        "Happily, we don't have to worry about most of these details.\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qMWiDtpyQSoU",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IhJSe0QHNG7U",
+        "colab_type": "code",
+        "outputId": "20b28cc7-3cb3-4ce6-bfff-a7847ce3bbaa",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# This is a path to an uncased (all lowercase) version of BERT\n",
+        "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
+        "\n",
+        "def create_tokenizer_from_hub_module():\n",
+        "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
+        "  with tf.Graph().as_default():\n",
+        "    bert_module = hub.Module(BERT_MODEL_HUB)\n",
+        "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
+        "    with tf.Session() as sess:\n",
+        "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
+        "                                            tokenization_info[\"do_lower_case\"]])\n",
+        "      \n",
+        "  return bert.tokenization.FullTokenizer(\n",
+        "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+        "\n",
+        "tokenizer = create_tokenizer_from_hub_module()"
+      ],
+      "execution_count": 47,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "z4oFkhpZBDKm",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Great--we just learned that the BERT model we're using expects lowercase data (that's what's stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "dsBo6RCtQmwx",
+        "colab_type": "code",
+        "outputId": "9af8c917-90ec-4fe9-897b-79dc89ca88e1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 221
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")"
+      ],
+      "execution_count": 48,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "['this',\n",
+              " 'here',\n",
+              " \"'\",\n",
+              " 's',\n",
+              " 'an',\n",
+              " 'example',\n",
+              " 'of',\n",
+              " 'using',\n",
+              " 'the',\n",
+              " 'bert',\n",
+              " 'token',\n",
+              " '##izer']"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 48
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0OEzfFIt6GIc",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands."
+ ] + }, + { + "metadata": { + "id": "LL5W8gEGRTAf", + "colab_type": "code", + "outputId": "65001dda-155b-48fc-b5fc-1e4cabc8dfbf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1261 + } + }, + "cell_type": "code", + "source": [ + "# We'll set sequences to be at most 128 tokens long.\n", + "MAX_SEQ_LENGTH = 128\n", + "# Convert our train and test features to InputFeatures that BERT understands.\n", + "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" + ], + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i ' m watching this on the sci - fi channel right now . it ' s so horrible i can ' t stop watching it ! i ' m a video ##grapher and this movie makes me sad . i feel bad for anyone associated with this movie . some of the camera work is good . most is very questionable . there are a few decent actors in the flick . too bad they ' re surrounded by what must have been the director ' s relatives . that ' s the only way they could have been qualified to be in a movie ! music was a little better than the acting . if you get around to watching this i hope it [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 1005 1049 3666 2023 2006 1996 16596 1011 10882 3149 2157 2085 1012 2009 1005 1055 2061 9202 1045 2064 1005 1056 2644 3666 2009 999 1045 1005 1049 1037 2678 18657 1998 2023 3185 3084 2033 6517 1012 1045 2514 2919 2005 3087 3378 2007 2023 3185 1012 2070 1997 1996 4950 2147 2003 2204 1012 2087 2003 2200 21068 1012 2045 2024 1037 2261 11519 5889 1999 1996 17312 1012 2205 2919 2027 1005 2128 5129 2011 2054 2442 2031 2042 1996 2472 1005 1055 9064 1012 2008 1005 1055 1996 2069 2126 2027 2071 2031 2042 4591 2000 2022 1999 1037 3185 999 2189 2001 1037 2210 2488 2084 1996 3772 1012 2065 2017 2131 2105 2000 3666 2023 1045 3246 2009 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i have been a fan of pushing dai ##sies since the very beginning . it is wonderful ##ly thought up , and bryan fuller has the most remarkable ideas for this show . < br / > < br / > it is unbelievable on how much tv has been needing a creative , original show like pushing dai ##sies . it is a huge relief to see a show , that is unlike the rest , where as , if you compared it to some of the newer shows , such as scrub ##s and house , you would see the similarities , and it does get ted ##ious at moments to see shows so close in identity . 
< br / > < br [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2031 2042 1037 5470 1997 6183 18765 14625 2144 1996 2200 2927 1012 2009 2003 6919 2135 2245 2039 1010 1998 8527 12548 2038 1996 2087 9487 4784 2005 2023 2265 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 2003 23653 2006 2129 2172 2694 2038 2042 11303 1037 5541 1010 2434 2265 2066 6183 18765 14625 1012 2009 2003 1037 4121 4335 2000 2156 1037 2265 1010 2008 2003 4406 1996 2717 1010 2073 2004 1010 2065 2017 4102 2009 2000 2070 1997 1996 10947 3065 1010 2107 2004 18157 2015 1998 2160 1010 2017 2052 2156 1996 12319 1010 1998 2009 2515 2131 6945 6313 2012 5312 2000 2156 3065 2061 2485 1999 4767 1012 1026 7987 1013 1028 1026 7987 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] this movie starts out promising ##ly , with an early scene in which frank morgan advises against gary cooper ' s marriage to his daughter , anita louise . frank morgan , playing an una ##bas ##hed gold - digger , loudly complain ##s to cooper about his perceived pen ##ury at the hands of his family - including his daughter , anita louise . i am a fan of all 3 actors . frank morgan is ( to my mind ) a hollywood treasure , cooper a legend , and louise a very lovely , versatile and under - appreciated actress seldom seen in the leading role . i also have nothing against teresa wright , and while not blessed with great range , she [SEP]\n", + "INFO:tensorflow:input_ids: 101 2023 3185 4627 2041 10015 2135 1010 2007 2019 2220 3496 1999 2029 3581 5253 25453 2114 5639 6201 1005 1055 3510 2000 2010 2684 1010 12918 8227 1012 3581 5253 1010 2652 2019 14477 22083 9072 2751 1011 28661 1010 9928 17612 2015 2000 6201 2055 2010 8690 7279 13098 2012 1996 2398 1997 2010 2155 1011 2164 2010 2684 1010 12918 8227 1012 1045 2572 1037 5470 1997 2035 1017 5889 1012 3581 5253 2003 1006 2000 2026 2568 1007 1037 5365 8813 1010 6201 1037 5722 1010 1998 8227 1037 2200 8403 1010 22979 1998 2104 1011 12315 3883 15839 2464 1999 1996 2877 2535 1012 1045 2036 2031 2498 2114 12409 6119 1010 1998 2096 2025 10190 2007 2307 2846 1010 2016 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i was over ##taken by the emotion . 
un ##for ##get ##table rendering of a wartime story which is unknown to most people . the performances were fault ##less and outstanding . [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2001 2058 25310 2011 1996 7603 1012 4895 29278 18150 10880 14259 1997 1037 12498 2466 2029 2003 4242 2000 2087 2111 1012 1996 4616 2020 6346 3238 1998 5151 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] soldier blue is a movie with pre ##tension ##s : pre ##tension ##s to be some sort of profound statement on man ' s inhuman ##ity to man , on the white man ' s exploitation of and brutality towards indigenous peoples ; a biting , un ##fl ##in ##ching and sar ##don ##ic commentary on the horrors of vietnam . well , sorry , but it fails mis ##era ##bly to be any of those things . what soldier blue actually is is per ##nic ##ious , tri ##te , badly made , dish ##ones ##t rubbish . < br / > < br / > another reviewer here hit the nail on the head in saying that it appears to be a hybrid of [SEP]\n", + "INFO:tensorflow:input_ids: 101 5268 2630 2003 1037 3185 2007 3653 29048 2015 1024 3653 29048 2015 2000 2022 2070 4066 1997 13769 4861 2006 2158 1005 1055 29582 3012 2000 2158 1010 2006 1996 2317 2158 1005 1055 14427 1997 1998 24083 2875 6284 7243 1025 1037 12344 1010 4895 10258 2378 8450 1998 18906 5280 2594 8570 2006 1996 22812 1997 5148 1012 2092 1010 3374 1010 2021 2009 11896 28616 6906 6321 2000 2022 2151 1997 2216 2477 1012 2054 5268 2630 2941 2003 2003 2566 8713 6313 1010 13012 2618 1010 6649 2081 1010 9841 21821 2102 29132 1012 1026 7987 1013 1028 1026 7987 1013 1028 2178 12027 2182 2718 1996 13774 2006 1996 2132 1999 3038 2008 2009 3544 2000 2022 1037 8893 1997 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i just watched this today on tv . it was on abc ' s sunday afternoon movie . < br / > < br / > this wasn ' t a very good movie , but for a low budget independent film like this , it was okay . 
there is some suspense in it , but there are so many bad qualities that really bring the movie down . the script is pretty lame , and the plot elements aren ' t very realistic , such as the way a 911 operator would laugh and hang up when someone is reporting a murder . i don ' t know what the writer was thinking when they came up with that idea , but it isn [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2074 3427 2023 2651 2006 2694 1012 2009 2001 2006 5925 1005 1055 4465 5027 3185 1012 1026 7987 1013 1028 1026 7987 1013 1028 2023 2347 1005 1056 1037 2200 2204 3185 1010 2021 2005 1037 2659 5166 2981 2143 2066 2023 1010 2009 2001 3100 1012 2045 2003 2070 23873 1999 2009 1010 2021 2045 2024 2061 2116 2919 11647 2008 2428 3288 1996 3185 2091 1012 1996 5896 2003 3492 20342 1010 1998 1996 5436 3787 4995 1005 1056 2200 12689 1010 2107 2004 1996 2126 1037 19989 6872 2052 4756 1998 6865 2039 2043 2619 2003 7316 1037 4028 1012 1045 2123 1005 1056 2113 2054 1996 3213 2001 3241 2043 2027 2234 2039 2007 2008 2801 1010 2021 2009 3475 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] from hardly alien sounding lasers , to an elementary school style shuttle crash , \" night ##be ##ast \" is better classified as a far ##cic ##al mix of fake blood and bare chest . the almost pornographic style of the film seems to be a failed attempt to recover from a lack of co ##hesive or effective story . the acting however is not nearly as beast ##ly , many of the young , aspiring , actors ad ##mir ##ably showcase a hidden talent . particularly don lei ##fer ##t and jamie ze ##mare ##l , who shed a well needed sha ##rd of light on this otherwise terrible film . 
night ##be ##ast would have never shown up on set had he known the [SEP]\n", + "INFO:tensorflow:input_ids: 101 2013 6684 7344 9391 23965 1010 2000 2019 4732 2082 2806 10382 5823 1010 1000 2305 4783 14083 1000 2003 2488 6219 2004 1037 2521 19053 2389 4666 1997 8275 2668 1998 6436 3108 1012 1996 2471 26932 2806 1997 1996 2143 3849 2000 2022 1037 3478 3535 2000 8980 2013 1037 3768 1997 2522 21579 2030 4621 2466 1012 1996 3772 2174 2003 2025 3053 2004 6841 2135 1010 2116 1997 1996 2402 1010 22344 1010 5889 4748 14503 8231 13398 1037 5023 5848 1012 3391 2123 26947 7512 2102 1998 6175 27838 24376 2140 1010 2040 8328 1037 2092 2734 21146 4103 1997 2422 2006 2023 4728 6659 2143 1012 2305 4783 14083 2052 2031 2196 3491 2039 2006 2275 2018 2002 2124 1996 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] here we have the in ##imi ##table charlie chaplin for ##sa ##king his slap ##stick past to tackle the serious subject of anti - semi ##tism , and into ##ler ##ance in general . he portrays two characters - the sweet , innocent jewish barber - a war veteran , and the ravi ##ng and ruthless dictator , aden ##oid h ##yn ##kel . the jewish ghetto in this country is not safe for long , due to the w ##him ##s of h ##yn ##kel and his armed thugs , who routinely rough up its residents , or leave them alone , dependent upon his mood that day or week . 
the barber is among them , but is befriended by his former commanding officer [SEP]\n", + "INFO:tensorflow:input_ids: 101 2182 2057 2031 1996 1999 27605 10880 4918 23331 2005 3736 6834 2010 14308 21354 2627 2000 11147 1996 3809 3395 1997 3424 1011 4100 17456 1010 1998 2046 3917 6651 1999 2236 1012 2002 17509 2048 3494 1011 1996 4086 1010 7036 3644 13362 1011 1037 2162 8003 1010 1998 1996 16806 3070 1998 18101 21237 1010 16298 9314 1044 6038 11705 1012 1996 3644 17276 1999 2023 2406 2003 2025 3647 2005 2146 1010 2349 2000 1996 1059 14341 2015 1997 1044 6038 11705 1998 2010 4273 24106 1010 2040 19974 5931 2039 2049 3901 1010 2030 2681 2068 2894 1010 7790 2588 2010 6888 2008 2154 2030 2733 1012 1996 13362 2003 2426 2068 1010 2021 2003 23386 2011 2010 2280 7991 2961 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i really hated this movie and it ' s the first movie written by stephen king that i didn ' t finish . i was truly disappointed , it was the worst crap i ' ve ever seen . what were you thinking making three hours out of it ? it may have a quite good story , but actors ? no . suspense ? no . romance ? no . horror ? no . it didn ' t have anything . < br / > < br / > it ' s got this strange , crazy science man with einstein - hair , the classic thing . not real at all . and a man keep getting younger all the time . 
it seems [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2428 6283 2023 3185 1998 2009 1005 1055 1996 2034 3185 2517 2011 4459 2332 2008 1045 2134 1005 1056 3926 1012 1045 2001 5621 9364 1010 2009 2001 1996 5409 10231 1045 1005 2310 2412 2464 1012 2054 2020 2017 3241 2437 2093 2847 2041 1997 2009 1029 2009 2089 2031 1037 3243 2204 2466 1010 2021 5889 1029 2053 1012 23873 1029 2053 1012 7472 1029 2053 1012 5469 1029 2053 1012 2009 2134 1005 1056 2031 2505 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 1005 1055 2288 2023 4326 1010 4689 2671 2158 2007 15313 1011 2606 1010 1996 4438 2518 1012 2025 2613 2012 2035 1012 1998 1037 2158 2562 2893 3920 2035 1996 2051 1012 2009 3849 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] story chinese tall story tells the story of righteous monk trip ##ita ##ka , who , along with his guardians monkey , sandy and pigs ##y make their journey west on a quest to recover ancient sutra ##s , finally , they reach the final leg of their journey in sha ##che city but all is not as it seems when the city is attacked by evil tree demons . monkey tries his best to battle them but is overwhelmed , knowing his master is in grave danger , he uses his trust ##y golden staff to thrust trip ##ita ##ka to safety . 
< br / > < br / > the monk ends up being knocked out when he land and when he wakes [SEP]\n", + "INFO:tensorflow:input_ids: 101 2466 2822 4206 2466 4136 1996 2466 1997 19556 8284 4440 6590 2912 1010 2040 1010 2247 2007 2010 14240 10608 1010 7525 1998 14695 2100 2191 2037 4990 2225 2006 1037 8795 2000 8980 3418 26567 2015 1010 2633 1010 2027 3362 1996 2345 4190 1997 2037 4990 1999 21146 5403 2103 2021 2035 2003 2025 2004 2009 3849 2043 1996 2103 2003 4457 2011 4763 3392 7942 1012 10608 5363 2010 2190 2000 2645 2068 2021 2003 13394 1010 4209 2010 3040 2003 1999 6542 5473 1010 2002 3594 2010 3404 2100 3585 3095 2000 7400 4440 6590 2912 2000 3808 1012 1026 7987 1013 1028 1026 7987 1013 1028 1996 8284 4515 2039 2108 6573 2041 2043 2002 2455 1998 2043 2002 17507 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "ccp5trMwRtmr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Creating a model\n", + "\n", + "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)." 
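In essence, the new layer is tiny: a single dense projection from BERT's pooled `[CLS]` vector down to one score per class. A minimal sketch of just that head (names and shapes assumed from the cell below, which additionally adds a bias term, dropout, and the loss):

    # pooled_output: [batch_size, hidden_size] summary of each sequence from BERT
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    logits = tf.matmul(pooled_output, output_weights, transpose_b=True)  # [batch_size, num_labels]
    log_probs = tf.nn.log_softmax(logits, axis=-1)  # per-class log-probabilities

The rest of the network starts from the pre-trained weights and is only nudged by backpropagation, which is why fine-tuning needs far less data and compute than training from scratch.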
+ ] + }, + { + "metadata": { + "id": "6o2a5ZIvRcJq", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n", + " num_labels):\n", + " \"\"\"Creates a classification model.\"\"\"\n", + "\n", + " bert_module = hub.Module(\n", + " BERT_MODEL_HUB,\n", + " trainable=True)\n", + " bert_inputs = dict(\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " segment_ids=segment_ids)\n", + " bert_outputs = bert_module(\n", + " inputs=bert_inputs,\n", + " signature=\"tokens\",\n", + " as_dict=True)\n", + "\n", + " # Use \"pooled_output\" for classification tasks on an entire sentence.\n", + " # Use \"sequence_output\" for token-level output.\n", + " output_layer = bert_outputs[\"pooled_output\"]\n", + "\n", + " hidden_size = output_layer.shape[-1].value\n", + "\n", + " # Create our own layer to tune for the sentiment data.\n", + " output_weights = tf.get_variable(\n", + " \"output_weights\", [num_labels, hidden_size],\n", + " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", + "\n", + " output_bias = tf.get_variable(\n", + " \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n", + "\n", + " with tf.variable_scope(\"loss\"):\n", + "\n", + " # Dropout helps prevent overfitting\n", + " output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n", + "\n", + " logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n", + " logits = tf.nn.bias_add(logits, output_bias)\n", + " log_probs = tf.nn.log_softmax(logits, axis=-1)\n", + "\n", + " # Convert labels into one-hot encoding\n", + " one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n", + "\n", + " predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n", + " # If we're predicting, we want predicted labels and the probabilities.\n", + " if is_predicting:\n", + " return (predicted_labels, log_probs)\n", + "\n", + " # If we're training or evaluating, compute loss between predicted and actual label\n", + " per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n", + " loss = tf.reduce_mean(per_example_loss)\n", + " return (loss, predicted_labels, log_probs)\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "qpE0ZIDOCQzE", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction."
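Before reading the full cell, it may help to see the bare Estimator contract it implements: one `model_fn` that branches on `mode` and returns an `EstimatorSpec` for each of the three modes. A stripped-down sketch (here `loss`, `train_op`, `metrics`, and `predictions` stand in for the real computations in the cell below):

    def model_fn(features, labels, mode, params):
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Training needs the loss and an op that updates the weights.
            return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
        elif mode == tf.estimator.ModeKeys.EVAL:
            # Evaluation needs the loss and the metrics to report.
            return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
        else:  # tf.estimator.ModeKeys.PREDICT
            # Prediction only needs the model outputs.
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)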
+ ] + }, + { + "metadata": { + "id": "FnH-AnOQ9KKW", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# model_fn_builder actually creates our model function\n", + "# using the passed parameters for num_labels, learning_rate, etc.\n", + "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", + " num_warmup_steps):\n", + " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", + " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", + " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", + "\n", + " input_ids = features[\"input_ids\"]\n", + " input_mask = features[\"input_mask\"]\n", + " segment_ids = features[\"segment_ids\"]\n", + " label_ids = features[\"label_ids\"]\n", + "\n", + " is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n", + " \n", + " # TRAIN and EVAL\n", + " if not is_predicting:\n", + "\n", + " (loss, predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " train_op = bert.optimization.create_optimizer(\n", + " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", + "\n", + " # Calculate evaluation metrics. \n", + " def metric_fn(label_ids, predicted_labels):\n", + " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", + " f1_score = tf.contrib.metrics.f1_score(\n", + " label_ids,\n", + " predicted_labels)\n", + " auc = tf.metrics.auc(\n", + " label_ids,\n", + " predicted_labels)\n", + " recall = tf.metrics.recall(\n", + " label_ids,\n", + " predicted_labels)\n", + " precision = tf.metrics.precision(\n", + " label_ids,\n", + " predicted_labels) \n", + " true_pos = tf.metrics.true_positives(\n", + " label_ids,\n", + " predicted_labels)\n", + " true_neg = tf.metrics.true_negatives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_pos = tf.metrics.false_positives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_neg = tf.metrics.false_negatives(\n", + " label_ids,\n", + " predicted_labels)\n", + " return {\n", + " \"eval_accuracy\": accuracy,\n", + " \"f1_score\": f1_score,\n", + " \"auc\": auc,\n", + " \"precision\": precision,\n", + " \"recall\": recall,\n", + " \"true_positives\": true_pos,\n", + " \"true_negatives\": true_neg,\n", + " \"false_positives\": false_pos,\n", + " \"false_negatives\": false_neg\n", + " }\n", + "\n", + " eval_metrics = metric_fn(label_ids, predicted_labels)\n", + "\n", + " if mode == tf.estimator.ModeKeys.TRAIN:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " train_op=train_op)\n", + " else:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " eval_metric_ops=eval_metrics)\n", + " else:\n", + " (predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " predictions = {\n", + " 'probabilities': log_probs,\n", + " 'labels': predicted_labels\n", + " }\n", + " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", + "\n", + " # Return the actual model function in the closure\n", + " return model_fn\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OjwJ4bTeWXD8", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute train and warmup steps from batch size\n", + "# These hyperparameters are copied from this colab notebook 
(https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n", + "BATCH_SIZE = 32\n", + "LEARNING_RATE = 2e-5\n", + "NUM_TRAIN_EPOCHS = 3.0\n", + "# Warmup is a period of time where the learning rate \n", + "# is small and gradually increases--usually helps training.\n", + "WARMUP_PROPORTION = 0.1\n", + "# Model configs\n", + "SAVE_CHECKPOINTS_STEPS = 500\n", + "SAVE_SUMMARY_STEPS = 100" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "emHf9GhfWBZ_", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute the number of train and warmup steps from the batch size\n", + "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n", + "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "oEJldMr3WYZa", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Specify output directory and number of checkpoint steps to save\n", + "run_config = tf.estimator.RunConfig(\n", + " model_dir=OUTPUT_DIR,\n", + " save_summary_steps=SAVE_SUMMARY_STEPS,\n", + " save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "q_WebpS1X97v", + "colab_type": "code", + "outputId": "1648932a-7391-49d3-8af7-52d514e226e8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + } + }, + "cell_type": "code", + "source": [ + "model_fn = model_fn_builder(\n", + " num_labels=len(label_list),\n", + " learning_rate=LEARNING_RATE,\n", + " num_train_steps=num_train_steps,\n", + " num_warmup_steps=num_warmup_steps)\n", + "\n", + "estimator = tf.estimator.Estimator(\n", + " model_fn=model_fn,\n", + " config=run_config,\n", + " params={\"batch_size\": BATCH_SIZE})\n" + ], + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Using config: {'_model_dir': 'gs://bert-tfhub/aclImdb_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", + "graph_options {\n", + " rewrite_options {\n", + " meta_optimizer_iterations: ONE\n", + " }\n", + "}\n", + ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "NOO3RfG1DYLo", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we create an input builder function that takes our training feature set (`train_features`) and produces an input function for the Estimator. This is a pretty standard design pattern for working with TensorFlow [Estimators](https://www.tensorflow.org/guide/estimators)." + ] + }, + { + "metadata": { + "id": "1Pv2bAlOX_-K", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Create an input function for training.
drop_remainder = True for using TPUs.\n", + "train_input_fn = bert.run_classifier.input_fn_builder(\n", + " features=train_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=True,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "t6Nukby2EB6-", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes." + ] + }, + { + "metadata": { + "id": "nucD4gluYJmK", + "colab_type": "code", + "outputId": "5d728e72-4631-42bf-c48d-3f51d4b968ce", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "cell_type": "code", + "source": [ + "print(f'Beginning Training!')\n", + "current_time = datetime.now()\n", + "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", + "print(\"Training took time \", datetime.now() - current_time)" + ], + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Beginning Training!\n", + "INFO:tensorflow:Skipping training since max_steps has already saved.\n", + "Training took time 0:00:00.759709\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "CmbLTVniARy3", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's use our test data to see how well our model did:" + ] + }, + { + "metadata": { + "id": "JIhejfpyJ8Bx", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "test_input_fn = run_classifier.input_fn_builder(\n", + " features=test_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=False,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "PPVEXhNjYXC-", + "colab_type": "code", + "outputId": "dd5482cd-c558-465f-c854-ec11a0175316", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 445 + } + }, + "cell_type": "code", + "source": [ + "estimator.evaluate(input_fn=test_input_fn, steps=None)" + ], + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Starting evaluation at 2019-02-12T21:04:20Z\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n", + "INFO:tensorflow:Finished evaluation at 2019-02-12-21:06:05\n", + "INFO:tensorflow:Saving dict for global step 468: auc = 0.86659324, eval_accuracy = 0.8664, f1_score = 0.8659711, false_negatives = 375.0, false_positives = 293.0, global_step = 468, loss = 0.51870537, precision = 0.880457, recall = 0.8519542, true_negatives = 2174.0, true_positives = 2158.0\n", + "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'auc': 0.86659324,\n", + " 'eval_accuracy': 0.8664,\n", + " 'f1_score': 0.8659711,\n", + " 'false_negatives': 375.0,\n", + " 'false_positives': 293.0,\n", + " 'global_step': 468,\n", + " 'loss': 0.51870537,\n", + " 'precision': 0.880457,\n", + " 'recall': 0.8519542,\n", + " 'true_negatives': 2174.0,\n", + " 'true_positives': 2158.0}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 59 + } + ] + }, + { + "metadata": { + "id": "ueKsULteiz1B", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's write code to make predictions on new sentences:" + ] + }, + { + "metadata": { + "id": "OsrbTD2EJTVl", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def getPrediction(in_sentences):\n", + " labels = [\"Negative\", \"Positive\"]\n", + " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", + " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", + " predictions = estimator.predict(predict_input_fn)\n", + " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "-thbodgih_VJ", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "pred_sentences = [\n", + " \"That movie was absolutely awful\",\n", + " \"The acting was a bit lacking\",\n", + " \"The film was creative and surprising\",\n", + " \"Absolutely fantastic!\"\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QrZmvZySKQTm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 649 + }, + "outputId": "3891fafb-a460-4eb8-fa6c-335a5bbc10e5" + }, + "cell_type": "code", + "source": [ + "predictions = getPrediction(pred_sentences)" + ], + "execution_count": 72, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 4\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] that movie was absolutely awful [SEP]\n", + "INFO:tensorflow:input_ids: 101 2008 3185 2001 7078 9643 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the acting was a bit lacking [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 3772 2001 1037 2978 11158 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the film was creative and surprising [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 2143 2001 5541 1998 11341 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] absolutely fantastic ! 
[SEP]\n", + "INFO:tensorflow:input_ids: 101 7078 10392 999 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "MXkRiEBUqN3n", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Voila! We have a sentiment classifier!" + ] + }, + { + "metadata": { + "id": "ERkTE8-7oQLZ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "outputId": "26c33224-dc2c-4b3d-f7b4-ac3ef0a58b27" + }, + "cell_type": "code", + "source": [ + "predictions" + ], + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('That movie was absolutely awful',\n", + " array([-4.9142293e-03, -5.3180690e+00], dtype=float32),\n", + " 'Negative'),\n", + " ('The acting was a bit lacking',\n", + " array([-0.03325794, -3.4200459 ], dtype=float32),\n", + " 'Negative'),\n", + " ('The film was creative and surprising',\n", + " array([-5.3589125e+00, -4.7171740e-03], dtype=float32),\n", + " 'Positive'),\n", + " ('Absolutely fantastic!',\n", + " array([-5.0434084 , -0.00647258], dtype=float32),\n", + " 'Positive')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 73 + } + ] + } + ] +} \ No newline at end of file diff --git a/baselines/models/bert_wwm_ext/requirements.txt b/baselines/models/bert_wwm_ext/requirements.txt new file mode 100644 index 0000000..357b5ea --- /dev/null +++ b/baselines/models/bert_wwm_ext/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 1.11.0 # CPU Version of TensorFlow. +# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. diff --git a/baselines/models/bert_wwm_ext/run_classifier.py b/baselines/models/bert_wwm_ext/run_classifier.py new file mode 100644 index 0000000..5ef3479 --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier.py @@ -0,0 +1,1591 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-09 21:10:23 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +# Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +# Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs an InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Must be specified only for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the number of input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + batches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file): + """Reads a text file whose fields are separated by "_!_".""" + with tf.gfile.Open(input_file, "r") as f: + reader = f.readlines() + lines = [] + for line in reader: + lines.append(line.strip().split("_!_")) + return lines + + +class THUCNewsProcessor(DataProcessor): + """Processor for the THUCNews data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return
self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(14): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 3: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class iFLYTEKDataProcessor(DataProcessor): + """Processor for the iFLYTEK data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 2: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class InewsProcessor(DataProcessor): + """Processor for the iNews data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = ["0", "1", "2"] + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length.
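+ # Illustrative walk-through (numbers assumed, not from the original code): with + # max_seq_length = 10, tokens_a of length 6 and tokens_b of length 5 must shrink + # until len(tokens_a) + len(tokens_b) <= 10 - 3 = 7; _truncate_seq_pair pops one + # token at a time from whichever list is currently longer, leaving 4 + 3 here.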
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) - 1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num + 1) * extra_len, 
len(tokens_b)) + tokens_b_sub = tokens_b[num * extra_len: max_len] + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + else: + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + return feature_list + + +def file_based_convert_examples_to_features_for_inews( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + num_example = 0 + for (ex_index, example) in enumerate(examples): + if ex_index % 1000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature_list = convert_example_list_for_inews(ex_index, example, label_list, + max_seq_length, tokenizer) + num_example += len(feature_list) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + for feature in feature_list: + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + tf.logging.info("feature num: %s", num_example) + writer.close() + + +class TnewsProcessor(DataProcessor): + """Processor for the TNEWS data set.""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(17): + if i == 5 or i == 11: + continue + labels.append(str(100 + i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + if set_type == "test": + #label = "0" + label = tokenization.convert_to_unicode(line[1]) + else: + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "train.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[2]) + if label == tokenization.convert_to_unicode("contradictory"): + label = tokenization.convert_to_unicode("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "test.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "test-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class LCQMCProcessor(DataProcessor): + """Processor for the internal data set.
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class BQProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % 
(set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
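+    # Illustrative sketch (not part of the original code): with
+    # max_seq_length=8, tokens_a=["he", "likes", "tea", "##s"] and
+    # tokens_b=["yes", "it", "does"], the pair must shrink to 8 - 3 = 5
+    # tokens, so the longer side loses one token at a time until
+    # tokens_a=["he", "likes", "tea"] and tokens_b=["yes", "it"].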
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = 
create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
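+  # Shape note (an editorial aside, with 768 assuming the BERT-Base config):
+  # get_pooled_output() returns [batch_size, hidden_size], e.g. [32, 768],
+  # while get_sequence_output() returns [batch_size, seq_length, hidden_size].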
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, 
label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
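+#
+# A minimal usage sketch of this in-memory path (variable names here are
+# illustrative, not part of this file):
+#
+#   feats = convert_examples_to_features(examples, label_list, 128, tokenizer)
+#   input_fn = input_fn_builder(feats, seq_length=128,
+#                               is_training=False, drop_remainder=False)
+#   preds = estimator.predict(input_fn=input_fn)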
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer):
+  """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+  features = []
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 10000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenizer)
+
+    features.append(feature)
+  return features
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  processors = {
+      "cola": ColaProcessor,
+      "mnli": MnliProcessor,
+      "mrpc": MrpcProcessor,
+      "xnli": XnliProcessor,
+      "tnews": TnewsProcessor,
+      "inews": InewsProcessor,
+      "lcqmc": LCQMCProcessor,
+      "thucnews": THUCNewsProcessor,
+      "bq": BQProcessor,
+      "iflydata": iFLYTEKDataProcessor
+  }
+
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
+  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+
+  label_list = processor.get_labels()
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      num_labels=len(label_list),
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
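+  # (A TPUEstimator constructed with use_tpu=False runs the same model_fn
+  # through the standard Estimator code path on CPU/GPU.)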
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + # dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
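+    # With the PaddingInputExample loop above, len(eval_examples) is now an
+    # exact multiple of eval_batch_size, e.g. 1000 real + 24 padding examples
+    # at batch size 64 give eval_steps = 1024 // 64 = 16.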
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints; you can use the checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_bert_wwm_ext.txt")
+    tf.logging.info("output_eval_file: " + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    # Match the dev branch above: inews needs its own feature conversion.
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
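+    # Same TPU constraint as for the dev set: the assert below only holds
+    # because the padding loop above topped the test set up to a multiple
+    # of eval_batch_size.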
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints on the test set; results for each checkpoint
+    # are written out below.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_bert_wwm_ext.txt")
+    tf.logging.info("output_eval_file: " + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
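+      # The padded examples are discarded again at write-out time: the
+      # predict loop below stops once i >= num_actual_predict_examples.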
+      while len(predict_examples) % FLAGS.predict_batch_size != 0:
+        predict_examples.append(PaddingInputExample())
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(predict_examples, label_list,
+                                                        FLAGS.max_seq_length, tokenizer,
+                                                        predict_file)
+    else:
+      file_based_convert_examples_to_features(predict_examples, label_list,
+                                              FLAGS.max_seq_length, tokenizer,
+                                              predict_file)
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(predict_examples), num_actual_predict_examples,
+                    len(predict_examples) - num_actual_predict_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_drop_remainder = True if FLAGS.use_tpu else False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      num_written_lines = 0
+      tf.logging.info("***** Predict results *****")
+      for (i, prediction) in enumerate(result):
+        probabilities = prediction["probabilities"]
+        if i >= num_actual_predict_examples:
+          break
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+        num_written_lines += 1
+  assert num_written_lines == num_actual_predict_examples
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/bert_wwm_ext/run_classifier_bq.sh b/baselines/models/bert_wwm_ext/run_classifier_bq.sh
new file mode 100644
index 0000000..4d41f2f
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/run_classifier_bq.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date:   2019-11-04 09:56:36
+# @Last Modified by:   bo.shi
+# @Last Modified time: 2019-11-11 09:47:28
+
+TASK_NAME="bq"
+MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets

+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/bq.zip
+  unzip bq.zip
+  rm bq.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $BERT_WWM_BASE_DIR ]; then
+  mkdir -p $BERT_WWM_BASE_DIR
+  echo "makedir $BERT_WWM_BASE_DIR"
+fi
+cd $BERT_WWM_BASE_DIR
+if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ !
-f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_iflydata.sh b/baselines/models/bert_wwm_ext/run_classifier_iflydata.sh new file mode 100644 index 0000000..bd43d19 --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_iflydata.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:26 + +TASK_NAME="iflydata" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_inews.sh b/baselines/models/bert_wwm_ext/run_classifier_inews.sh new file mode 100644 index 0000000..c694bcc --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_inews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:23 + +TASK_NAME="inews" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_lcqmc.sh b/baselines/models/bert_wwm_ext/run_classifier_lcqmc.sh new file mode 100644 index 0000000..92e2d5e --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_lcqmc.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:21 + +TASK_NAME="lcqmc" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_thucnews.sh b/baselines/models/bert_wwm_ext/run_classifier_thucnews.sh new file mode 100644 index 0000000..a11240a --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_thucnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:18 + +TASK_NAME="thucnews" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_tnews.sh b/baselines/models/bert_wwm_ext/run_classifier_tnews.sh new file mode 100644 index 0000000..be5c88a --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_tnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:16 + +TASK_NAME="tnews" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_classifier_with_tfhub.py b/baselines/models/bert_wwm_ext/run_classifier_with_tfhub.py new file mode 100644 index 0000000..9d2f80f --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_with_tfhub.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner with TF-Hub.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import optimization +import run_classifier +import tokenization +import tensorflow as tf +import tensorflow_hub as hub + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "bert_hub_module_handle", None, + "Handle for the BERT TF-Hub module.") + + +def create_model(is_training, input_ids, input_mask, segment_ids, labels, + num_labels, bert_hub_module_handle): + """Creates a classification model.""" + tags = set() + if is_training: + tags.add("train") + bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) + bert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + bert_outputs = bert_module( + inputs=bert_inputs, + signature="tokens", + as_dict=True) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use + # bert_outputs["sequence_output"] instead. + output_layer = bert_outputs["pooled_output"] + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(num_labels, learning_rate, num_train_steps, + num_warmup_steps, use_tpu, bert_hub_module_handle): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, + bert_hub_module_handle) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + 
loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics) + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions={"probabilities": probabilities}) + else: + raise ValueError( + "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def create_tokenizer_from_hub_module(bert_hub_module_handle): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + bert_module = hub.Module(bert_hub_module_handle) + tokenization_info = bert_module(signature="tokenization_info", as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + return tokenization.FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": run_classifier.ColaProcessor, + "mnli": run_classifier.MnliProcessor, + "mrpc": run_classifier.MrpcProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + num_labels=len(label_list), + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + bert_hub_module_handle=FLAGS.bert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
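+  # (Unlike run_classifier.py, no init_checkpoint is needed here: the TF-Hub
+  # module already carries the pre-trained weights and loads them when
+  # hub.Module is instantiated inside create_model.)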
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      eval_batch_size=FLAGS.eval_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    train_features = run_classifier.convert_examples_to_features(
+        train_examples, label_list, FLAGS.max_seq_length, tokenizer)
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num examples = %d", len(train_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    train_input_fn = run_classifier.input_fn_builder(
+        features=train_features,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+  if FLAGS.do_eval:
+    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+    eval_features = run_classifier.convert_examples_to_features(
+        eval_examples, label_list, FLAGS.max_seq_length, tokenizer)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d", len(eval_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      # Eval will be slightly WRONG on the TPU because it will truncate
+      # the last batch.
+      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = run_classifier.input_fn_builder(
+        features=eval_features,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    if FLAGS.use_tpu:
+      # Discard batch remainder if running on TPU
+      n = len(predict_examples)
+      predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)]
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    run_classifier.file_based_convert_examples_to_features(
+        predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
+        predict_file)
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d", len(predict_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_input_fn = run_classifier.file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=FLAGS.use_tpu)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      tf.logging.info("***** Predict results *****")
+      for prediction in result:
+        probabilities = prediction["probabilities"]
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+
flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("bert_hub_module_handle") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert_wwm_ext/run_classifier_xnli.sh b/baselines/models/bert_wwm_ext/run_classifier_xnli.sh new file mode 100644 index 0000000..d1da247 --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_classifier_xnli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:47:12 + +TASK_NAME="xnli" +MODEL_NAME="chinese_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export BERT_WWM_BASE_DIR=$BERT_PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $BERT_WWM_BASE_DIR ]; then + mkdir -p $BERT_WWM_BASE_DIR + echo "makedir $BERT_WWM_BASE_DIR" +fi +cd $BERT_WWM_BASE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_wwm_ext_L-12_H-768_A-12.zip + rm chinese_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$BERT_WWM_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_WWM_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_WWM_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_ner.py b/baselines/models/bert_wwm_ext/run_ner.py new file mode 100644 index 0000000..fc9abd5 --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_ner.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import modeling +import optimization +import tokenization +import tensorflow as tf +from sklearn.metrics import f1_score, precision_score, recall_score +from tensorflow.python.ops import math_ops +import tf_metrics +import pickle +import codecs +import sys + +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "data_dir", None, + "The input datadir.", +) + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model." +) + +flags.DEFINE_string( + "task_name", None, "The name of the task to train." +) + +flags.DEFINE_string( + "token_name", "full", "The name of the task to train." +) + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written." +) + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model)." +) + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text." +) + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization." +) + +flags.DEFINE_bool( + "do_train", False, + "Whether to run training." +) +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % 
+            text = tokenization.convert_to_unicode(line[1])
+            label = tokenization.convert_to_unicode(line[0])
+            examples.append(InputExample(guid=guid, text=text, label=label))
+        return examples
+
+    def _read_raw(self, input_file):
+        with codecs.open(input_file, 'r', encoding='utf-8') as f:
+            lines = []
+            words = []
+            labels = []
+            for line in f:
+                contends = line.strip()
+                tokens = contends.split()
+                if len(tokens) == 2:
+                    words.append(tokens[0])
+                    label = tokens[-1]
+                    # Collapse B- prefixes into I- so the tags match the
+                    # reduced label set returned by get_labels().
+                    if label[0] == 'B':
+                        label = "I" + label[1:]
+                    labels.append(label)
+                else:
+                    if len(contends) == 0 and len(words) > 0:
+                        label = []
+                        word = []
+                        for l, w in zip(labels, words):
+                            if len(l) > 0 and len(w) > 0:
+                                label.append(l)
+                                word.append(w)
+                        lines.append([' '.join(label), ' '.join(word)])
+                        words = []
+                        labels = []
+                        continue
+                if contends.startswith("-DOCSTART-"):
+                    continue
+
+            return lines
+
+
+class MsraNERProcessor(DataProcessor):
+    def __init__(self):
+        self.labels = set()
+
+    def get_train_examples(self, data_dir):
+        return self._create_example(
+            self._read_raw(os.path.join(data_dir, "train1.txt")), "train"
+        )
+
+    def get_dev_examples(self, data_dir):
+        return self._create_example(
+            self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev"
+        )
+
+    def get_test_examples(self, data_dir):
+        return self._create_example(
+            self._read_raw(os.path.join(data_dir, "testright1.txt")), "test")
+
+    def get_labels(self):
+        return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"]
+
+    def _create_example(self, lines, set_type):
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text = tokenization.convert_to_unicode(line[1])
+            label = tokenization.convert_to_unicode(line[0])
+            examples.append(InputExample(guid=guid, text=text, label=label))
+        return examples
+
+    def _read_raw(self, input_file):
+        # Each MSRA token looks like "word/tag" where tag is one of
+        # nr (person), ns (location), nt (organization) or o; words are
+        # expanded to character-level BIO tags.
+        with codecs.open(input_file, 'r', encoding='utf-8') as f:
+            lines = []
+            chars = []
+            labels = []
+            len_count = []
+            for line in f:
+                contends = line.strip()
+                tokens = contends.split()
+                for token in tokens:
+                    word, label = token.split('/')
+
+                    if label == "nr":
+                        chars = chars + list(word)
+                        labels = labels + ['B-PERSON'] + ['I-PERSON'] * (len(word) - 1)
+                    elif label == "ns":
+                        chars = chars + list(word)
+                        labels = labels + ['B-LOCATION'] + ['I-LOCATION'] * (len(word) - 1)
+                    elif label == "nt":
+                        chars = chars + list(word)
+                        labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION'] * (len(word) - 1)
+                    else:
+                        assert label == "o"
+                        chars = chars + list(word)
+                        labels = labels + ["O"] * len(word)
+                lines.append([' '.join(labels), ' '.join(chars)])
+                len_count.append(len(chars))
+                chars = []
+                labels = []
+            return lines
+
+
+def write_tokens(tokens, mode):
+    if mode == "test":
+        path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
+        # Append with an explicit utf-8 encoding so Chinese tokens round-trip.
+        wf = codecs.open(path, 'a', encoding='utf-8')
+        for token in tokens:
+            if token != "**NULL**":
+                wf.write(token + '\n')
+        wf.close()
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode):
+    # Label ids start at 1; id 0 is reserved for padding positions.
+    label_map = {}
+    for (i, label) in enumerate(label_list, 1):
+        label_map[label] = i
+
+    if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
+        with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
+            pickle.dump(label_map, w)
+    textlist = example.text.split(' ')
+    labellist = example.label.split(' ')
+    tokens = []
+    labels = []
+    label_mask = []
+    for i, word in enumerate(textlist):
+        token = tokenizer.tokenize(word)
+        tokens.extend(token)
+        label_1 = labellist[i]
+        for m in range(len(token)):
+            if m == 0:
+                labels.append(label_1)
+            else:
+                # Sub-word pieces after the first get the dummy label "X".
+                labels.append("X")
+
+    if len(tokens) >= max_seq_length - 1:
+        tokens = tokens[0:(max_seq_length - 2)]
+        labels = labels[0:(max_seq_length - 2)]
+    ntokens = []
+    segment_ids = []
+    label_ids = []
+    ntokens.append("[CLS]")
+    segment_ids.append(0)
+    # "[CLS]" is labeled with its own id; "O" would also work, since this
+    # position is excluded from the loss by label_mask anyway.
+    label_ids.append(label_map["[CLS]"])
+    label_mask.append(0)  # masked out: neither trained on nor predicted
+    for i, token in enumerate(tokens):
+        ntokens.append(token)
+        segment_ids.append(0)
+        label_ids.append(label_map[labels[i]])
+        if labels[i] == 'X':
+            label_mask.append(0)
+        else:
+            label_mask.append(1)
+    ntokens.append("[SEP]")
+    segment_ids.append(0)
+    label_mask.append(0)
+    # As with "[CLS]" above, the "[SEP]" label is masked out of the loss.
+    label_ids.append(label_map["[SEP]"])
+    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+    input_mask = [1] * len(input_ids)
+    while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+        # Padding positions are masked out, so the label id here is irrelevant.
+        label_ids.append(0)
+        ntokens.append("**NULL**")
+        label_mask.append(0)
+    assert len(input_ids) == max_seq_length
+    assert len(input_mask) == max_seq_length
+    assert len(segment_ids) == max_seq_length
+    assert len(label_ids) == max_seq_length
+    assert len(label_mask) == max_seq_length
+
+    if ex_index < 5:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("guid: %s" % (example.guid))
+        tf.logging.info("tokens: %s" % " ".join(
+            [tokenization.printable_text(x) for x in tokens]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+        tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+        tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+    feature = InputFeatures(
+        input_ids=input_ids,
+        input_mask=input_mask,
+        segment_ids=segment_ids,
+        label_ids=label_ids,
+        label_mask=label_mask
+    )
+    write_tokens(ntokens, mode)
+    return feature
+
+
+def file_based_convert_examples_to_features(
+        examples, label_list, max_seq_length, tokenizer, output_file, output_dir, mode=None
+):
+    writer = tf.python_io.TFRecordWriter(output_file)
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 5000 == 0:
+            tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+        feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode)
+
+        def create_int_feature(values):
+            f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+            return f
+
+        features = collections.OrderedDict()
+        features["input_ids"] = create_int_feature(feature.input_ids)
+        features["input_mask"] = create_int_feature(feature.input_mask)
+        features["segment_ids"] = create_int_feature(feature.segment_ids)
+        features["label_ids"] = create_int_feature(feature.label_ids)
+        features["label_mask"] = create_int_feature(feature.label_mask)
+        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+        writer.write(tf_example.SerializeToString())
+    writer.close()
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
+    name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_mask": tf.FixedLenFeature([seq_length], tf.int64), + } + + def _decode_record(record, name_to_features): + example = tf.parse_single_example(record, name_to_features) + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + return example + + def input_fn(params): + batch_size = params["batch_size"] + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + d = d.apply(tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder + )) + return d + + return input_fn + + +def create_model(bert_config, is_training, input_ids, input_mask, label_mask, + segment_ids, labels, num_labels, use_one_hot_embeddings): + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings + ) + + output_layer = model.get_sequence_output() + + hidden_size = output_layer.shape[-1].value + + output_weight = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02) + ) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer() + ) + with tf.variable_scope("loss"): + if is_training: + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + output_layer = tf.reshape(output_layer, [-1, hidden_size]) + logits = tf.matmul(output_layer, output_weight, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) + # mask = tf.cast(input_mask,tf.float32) + # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) + # return (loss, logits, predict) + ########################################################################## + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + mask = tf.cast(label_mask, tf.float32) + mask_example_loss = per_example_loss * mask + loss = tf.reduce_sum(mask_example_loss) + probabilities = tf.nn.softmax(logits, axis=-1) + predict = tf.argmax(probabilities, axis=-1) + return (loss, mask_example_loss, logits, predict) + ########################################################################## + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + def model_fn(features, labels, mode, params): + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + label_mask = features["label_mask"] + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = 
+        initialized_variable_names = {}
+        scaffold_fn = None
+        if init_checkpoint:
+            (assignment_map, initialized_variable_names
+             ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+            if use_tpu:
+                def tpu_scaffold():
+                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+                    return tf.train.Scaffold()
+
+                scaffold_fn = tpu_scaffold
+            else:
+                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        tf.logging.info("**** Trainable Variables ****")
+
+        for var in tvars:
+            init_string = ""
+            if var.name in initialized_variable_names:
+                init_string = ", *INIT_FROM_CKPT*"
+            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                            init_string)
+        output_spec = None
+        if mode == tf.estimator.ModeKeys.TRAIN:
+            train_op = optimization.create_optimizer(
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
+            hook_dict = {}
+            hook_dict['loss'] = total_loss
+            hook_dict['global_steps'] = tf.train.get_or_create_global_step()
+            logging_hook = tf.train.LoggingTensorHook(
+                hook_dict, every_n_iter=200)
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                train_op=train_op,
+                scaffold_fn=scaffold_fn,
+                training_hooks=[logging_hook])
+        elif mode == tf.estimator.ModeKeys.EVAL:
+
+            def metric_fn(per_example_loss, label_ids, logits):
+                # Token-level macro-averaged metrics; entity-level scores are
+                # computed separately with conlleval.py after prediction.
+                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+                precision = tf_metrics.precision(label_ids, predictions, num_labels, average="macro")
+                recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro")
+                f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro")
+
+                return {
+                    "eval_precision": precision,
+                    "eval_recall": recall,
+                    "eval_f": f,
+                }
+
+            eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                eval_metrics=eval_metrics,
+                scaffold_fn=scaffold_fn)
+        else:
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode, predictions=predicts, scaffold_fn=scaffold_fn
+            )
+        return output_spec
+
+    return model_fn
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+    processors = {
+        "ner": NerProcessor,
+        "weiboner": WeiboNERProcessor,
+        "msraner": MsraNERProcessor
+    }
+    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+        raise ValueError(
+            "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+        raise ValueError(
+            "Cannot use sequence length %d because the BERT model "
+            "was only trained up to sequence length %d" %
+            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+    task_name = FLAGS.task_name.lower()
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+    processor = processors[task_name]()
+
+    label_list = processor.get_labels()
+
+    tokenizer = tokenization.FullTokenizer(
+        vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+    tpu_cluster_resolver = None
+    if FLAGS.use_tpu and FLAGS.tpu_name:
+        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+
+    run_config = tf.contrib.tpu.RunConfig(
+        cluster=tpu_cluster_resolver,
+        master=FLAGS.master,
+        model_dir=FLAGS.output_dir,
+        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+        tpu_config=tf.contrib.tpu.TPUConfig(
+            iterations_per_loop=FLAGS.iterations_per_loop,
+            num_shards=FLAGS.num_tpu_cores,
+            per_host_input_for_training=is_per_host))
+
+    train_examples = None
+    num_train_steps = None
+    num_warmup_steps = None
+
+    if FLAGS.do_train:
+        train_examples = processor.get_train_examples(FLAGS.data_dir)
+        num_train_steps = int(
+            len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+        tf.logging.info("Total training steps = %d", num_train_steps)
+        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+    model_fn = model_fn_builder(
+        bert_config=bert_config,
+        num_labels=len(label_list) + 1,
+        init_checkpoint=FLAGS.init_checkpoint,
+        learning_rate=FLAGS.learning_rate,
+        num_train_steps=num_train_steps,
+        num_warmup_steps=num_warmup_steps,
+        use_tpu=FLAGS.use_tpu,
+        use_one_hot_embeddings=FLAGS.use_tpu)
+
+    estimator = tf.contrib.tpu.TPUEstimator(
+        use_tpu=FLAGS.use_tpu,
+        model_fn=model_fn,
+        config=run_config,
+        train_batch_size=FLAGS.train_batch_size,
+        eval_batch_size=FLAGS.eval_batch_size,
+        predict_batch_size=FLAGS.predict_batch_size)
+
+    if FLAGS.do_train:
+        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
+        file_based_convert_examples_to_features(
+            train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir)
+        tf.logging.info("***** Running training *****")
+        tf.logging.info("  Num examples = %d", len(train_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+        tf.logging.info("  Num steps = %d", num_train_steps)
+        train_input_fn = file_based_input_fn_builder(
+            input_file=train_file,
+            seq_length=FLAGS.max_seq_length,
+            is_training=True,
+            drop_remainder=True)
+        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+    if FLAGS.do_eval:
+        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
+        file_based_convert_examples_to_features(
+            eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir)
+
+        tf.logging.info("***** Running evaluation *****")
+        tf.logging.info("  Num examples = %d", len(eval_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+        eval_steps = None
+        if FLAGS.use_tpu:
+            # On TPU the number of eval steps must be fixed in advance; this
+            # truncates the last partial batch.
+            eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
+        eval_drop_remainder = True if FLAGS.use_tpu else False
+        eval_input_fn = file_based_input_fn_builder(
+            input_file=eval_file,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=eval_drop_remainder)
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+        with tf.gfile.GFile(output_eval_file, "w") as writer:
+            tf.logging.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                tf.logging.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+    if FLAGS.do_predict:
+        pred_tags = []
+        true_tags = []
+
+        token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
+        label_file = os.path.join(FLAGS.output_dir, "label2id.pkl")
+        label_masks = []
+        with open(label_file, "rb") as rf:
+            label2id = pickle.load(rf)
+            id2label = {value: key for key, value in label2id.items()}
+        if os.path.exists(token_path):
+            os.remove(token_path)
+        predict_examples = processor.get_test_examples(FLAGS.data_dir)
+        ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt")
+        with open(ground_truth_file, 'w') as writer:
+            for ex_index, example in enumerate(predict_examples):
+                feature = convert_single_example(ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test")
+                line = []
+                for i, id in enumerate(feature.label_ids):
+                    if feature.label_mask[i] == 1:
+                        line.append(id2label[id])
+                        true_tags.append(id2label[id])
+                output_line = " ".join(line) + "\n"
+                writer.write(output_line)
+                label_masks.append(feature.label_mask)
+        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+        file_based_convert_examples_to_features(predict_examples, label_list,
+                                                FLAGS.max_seq_length, tokenizer,
+                                                predict_file, FLAGS.output_dir, mode="test")
+
+        tf.logging.info("***** Running prediction *****")
+        tf.logging.info("  Num examples = %d", len(predict_examples))
+        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+        if FLAGS.use_tpu:
+            # Warning: according to tpu_estimator.py, prediction on TPU is an
+            # experimental feature and hence not supported here.
+            raise ValueError("Prediction in TPU not supported")
+        predict_drop_remainder = True if FLAGS.use_tpu else False
+        predict_input_fn = file_based_input_fn_builder(
+            input_file=predict_file,
+            seq_length=FLAGS.max_seq_length,
+            is_training=False,
+            drop_remainder=predict_drop_remainder)
+
+        result = estimator.predict(input_fn=predict_input_fn)
+        output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
+
+        with open(output_predict_file, 'w') as writer:
+            for i, prediction in enumerate(result):
+                line = []
+                for j, x in enumerate(prediction):
+                    if label_masks[i][j] == 0:
+                        continue
+                    else:
+                        line.append(id2label[x])
+                        pred_tags.append(id2label[x])
+                output_line = " ".join(line) + "\n"
+                writer.write(output_line)
+
+        # Interleave gold and predicted tags token by token, then score them
+        # with the CoNLL evaluation script (entity-level precision/recall/F1).
+        tmp = codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8')
+        with codecs.open(ground_truth_file, 'r', 'utf8') as ft, codecs.open(output_predict_file, 'r', 'utf8') as fg:
+            for lt, lg in zip(ft, fg):
+                for tl, tg in zip(lt.strip().split(), lg.strip().split()):
+                    print('\t'.join([" ", tl, tg]), file=tmp)
+        tmp.close()
+        cmd = "python %s -d '\t' < %s > %s" % \
+              (os.path.join(os.getcwd(), "conlleval.py"),
+               os.path.join(FLAGS.output_dir, "tmp"),
+               os.path.join(FLAGS.data_dir, "test_results_bert_wwm_ext.txt"))
+        os.system(cmd)
+
+
+if __name__ == "__main__":
+    flags.mark_flag_as_required("data_dir")
+    flags.mark_flag_as_required("task_name")
+    flags.mark_flag_as_required("vocab_file")
+    flags.mark_flag_as_required("bert_config_file")
+    flags.mark_flag_as_required("output_dir")
+    tf.app.run()
diff --git a/baselines/models/bert_wwm_ext/run_ner_msra.sh b/baselines/models/bert_wwm_ext/run_ner_msra.sh
new file mode 100644
index 0000000..4f10902
--- 
/dev/null +++ b/baselines/models/bert_wwm_ext/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/chinese_wwm_ext_L-12_H-768_A-12 +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/bert_wwm_ext/run_pretraining.py b/baselines/models/bert_wwm_ext/run_pretraining.py new file mode 100644 index 0000000..b118f62 --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_pretraining.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
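+    # Shape sketch (illustrative): after gather_indexes, `input_tensor` is
+    # [batch_size * max_predictions_per_seq, hidden_size]; the matmul below
+    # against the tied embedding table ([vocab_size, hidden_size], with
+    # transpose_b=True) then yields logits of shape [..., vocab_size].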
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
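+  # For example, "input_ids" is stored as an int64 vector of length
+  # max_seq_length in the TFRecord and comes out of this loop as int32.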
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert_wwm_ext/run_squad.py b/baselines/models/bert_wwm_ext/run_squad.py new file mode 100644 index 0000000..edd4c3e --- /dev/null +++ b/baselines/models/bert_wwm_ext/run_squad.py @@ -0,0 +1,1283 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import math +import os +import random +import modeling +import optimization +import tokenization +import six +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. 
This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +flags.DEFINE_float( + "null_score_diff_threshold", 0.0, + "If null_score - best_non_null is greater than the threshold predict null.") + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " 
" or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if example_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (unique_id)) + tf.logging.info("example_index: %s" % (example_index)) + tf.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("token_to_orig_map: %s" % " ".join( + ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) + tf.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + tf.logging.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + tf.logging.info("start_position: %d" % (start_position)) + tf.logging.info("end_position: %d" % (end_position)) + tf.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + + # Run callback + output_fn(feature) + + unique_id += 1 + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electronics? + # Context: The Japanese electronics industry is the largest in the world.
+ # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for 
TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def input_fn_builder(input_file, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports 
tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file): + """Write final predictions to the json file and log-odds of null if needed.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if FLAGS.version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions.
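+ # For instance (illustrative indexes): with n_best_size=2, + # start_indexes=[17, 0] and end_indexes=[19, 0] yield four candidate pairs, + # but only (17, 19) survives the checks below: position 0 is [CLS], so it + # has no entry in token_to_orig_map, and pairs with end < start or + # length > max_answer_length are likewise skipped.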
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + if FLAGS.version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't include the empty option in the n-best, include it + if FLAGS.version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure.
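+ # (nbest can only be empty here if every candidate span was filtered out + # above and version_2_with_negative is False, so no null entry was appended + # either.)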
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not FLAGS.version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > FLAGS.null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if FLAGS.version_2_with_negative: + with tf.gfile.GFile(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned.
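+ # A worked example of the mapping below (strings are illustrative): for + # orig_text = "John F. Kennedy", _strip_spaces returns "JohnF.Kennedy" with + # ns_to_s_map = {0: 0, 1: 1, 2: 2, 3: 3, 4: 5, ...}, i.e. stripped index 4 + # ('F') maps back to original index 5, skipping the space at index 4.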
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if FLAGS.verbose_logging: + tf.logging.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if FLAGS.verbose_logging: + tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + self._writer = tf.python_io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 
0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def validate_flags_or_throw(bert_config): + """Validate the input FLAGS or throw an exception.""" + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + validate_flags_or_throw(bert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in in the `input_fn`. + rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory. 
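+ # Each feature becomes one tf.train.Example whose int64 lists hold + # unique_ids, input_ids, input_mask and segment_ids (plus start/end + # positions and is_impossible when training); input_fn_builder parses the + # same keys back, except is_impossible, which is written but never read here.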
+ train_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "train.tf_record"), + is_training=True) + convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = input_fn_builder( + input_file=train_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + eval_examples = read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + eval_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "eval.tf_record"), + is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature) + eval_writer.close() + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + all_results = [] + + predict_input_fn = input_fn_builder( + input_file=eval_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False) + + # If running eval on the TPU, you will need to specify the number of + # steps. 
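+ # With yield_single_examples=True, each result below is a single feature + # (one doc span) keyed by unique_id; write_predictions later regroups the + # spans belonging to each example via example_index.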
+ for result in estimator.predict( + predict_input_fn, yield_single_examples=True): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_logits = [float(x) for x in result["start_logits"].flat] + end_logits = [float(x) for x in result["end_logits"].flat] + all_results.append( + RawResult( + unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") + + write_predictions(eval_examples, eval_features, all_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + FLAGS.do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file) + + +if __name__ == "__main__": + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/bert_wwm_ext/sample_text.txt b/baselines/models/bert_wwm_ext/sample_text.txt new file mode 100644 index 0000000..a428120 --- /dev/null +++ b/baselines/models/bert_wwm_ext/sample_text.txt @@ -0,0 +1,33 @@ +This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত +Text should be one-sentence-per-line, with empty lines between documents. +This sample text is public domain and was randomly selected from Project Gutenberg. + +The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors. +Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity. +Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them. +"Cass" Beard had risen early that morning, but not with a view to discovery. +A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets. +The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency. +This was nearly opposite. +Mr. Cassius crossed the highway, and stopped suddenly. +Something glittered in the nearest red pool before him. +Gold, surely! +But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring. +Looking at it more attentively, he saw that it bore the inscription, "May to Cass." +Like most of his fellow gold-seekers, Cass was superstitious. + +The fountain of classic wisdom, Hypatia herself. +As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge. +From my youth I felt in me a soul above the matter-entangled herd.
+She revealed to me the glorious fact, that I am a spark of Divinity itself. +A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's. +There is a philosophic pleasure in opening one's treasures to the modest young. +Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street. +Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide; +but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind. +Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now. +His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert; +while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts. +At last they reached the quay at the opposite end of the street; +and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers. +He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him. diff --git a/baselines/models/bert_wwm_ext/tf_metrics.py b/baselines/models/bert_wwm_ext/tf_metrics.py new file mode 100644 index 0000000..7ccacd4 --- /dev/null +++ b/baselines/models/bert_wwm_ext/tf_metrics.py @@ -0,0 +1,215 @@ +""" +Multiclass +from: +https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py + +""" + +__author__ = "Guillaume Genthial" + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix + + +def precision(labels, predictions, num_classes, pos_indices=None, + weights=None, average='micro'): + """Multi-class precision metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. 
Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + pr, _, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + op, _, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (pr, op) + + +def recall(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + """Multi-class recall metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, re, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + _, op, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (re, op) + + +def f1(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + return fbeta(labels, predictions, num_classes, pos_indices, weights, + average) + + +def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro', beta=1): + """Multi-class fbeta metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/bert_wwm_ext/tokenization.py b/baselines/models/bert_wwm_ext/tokenization.py new file mode 100644 index 0000000..0ee1359 --- /dev/null +++ b/baselines/models/bert_wwm_ext/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six +import tensorflow as tf + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. 
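+ # For example (illustrative): on Python 3, printable_text(u"中文") returns + # the str "中文" unchanged, while on Python 2 the same call returns its + # UTF-8 encoded byte string, since str there means bytes.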
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
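+ # For example, u"ah博推zz" becomes u"ah 博 推 zz" after this step, so the + # whitespace pass below yields ["ah", "博", "推", "zz"] (exactly what + # test_chinese in tokenization_test.py asserts).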
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens.
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models/bert_wwm_ext/tokenization_test.py b/baselines/models/bert_wwm_ext/tokenization_test.py new file mode 100644 index 0000000..0afaedd --- /dev/null +++ b/baselines/models/bert_wwm_ext/tokenization_test.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile +import tokenization +import six +import tensorflow as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + vocab_writer.write("".join( + [x + "\n" for x in vocab_tokens]).encode("utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + 
    self.assertTrue(tokenization._is_punctuation(u"."))
+
+    self.assertFalse(tokenization._is_punctuation(u"A"))
+    self.assertFalse(tokenization._is_punctuation(u" "))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/bert_wwm_ext/tpu/run_classifier_inews.sh b/baselines/models/bert_wwm_ext/tpu/run_classifier_inews.sh
new file mode 100755
index 0000000..45f59f5
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/tpu/run_classifier_inews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="inews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/bert_wwm_ext/tpu/run_classifier_lcqmc.sh b/baselines/models/bert_wwm_ext/tpu/run_classifier_lcqmc.sh
new file mode 100755
index 0000000..457af8b
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/tpu/run_classifier_lcqmc.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="lcqmc"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://192.168.0.2:8470
diff --git a/baselines/models/bert_wwm_ext/tpu/run_classifier_thucnews.sh b/baselines/models/bert_wwm_ext/tpu/run_classifier_thucnews.sh
new file mode 100755
index 0000000..45f59f5
--- /dev/null
+++ b/baselines/models/bert_wwm_ext/tpu/run_classifier_thucnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="thucnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+
--vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/bert_wwm_ext/tpu/run_classifier_tnews.sh b/baselines/models/bert_wwm_ext/tpu/run_classifier_tnews.sh new file mode 100755 index 0000000..45c0320 --- /dev/null +++ b/baselines/models/bert_wwm_ext/tpu/run_classifier_tnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1 +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.230.1.2:8470 diff --git a/baselines/models/bert_wwm_ext/tpu/run_classifier_xnli.sh b/baselines/models/bert_wwm_ext/tpu/run_classifier_xnli.sh new file mode 100755 index 0000000..92a88c1 --- /dev/null +++ b/baselines/models/bert_wwm_ext/tpu/run_classifier_xnli.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/bert-wwm-ext-base/chinese_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/ernie/.gitignore b/baselines/models/ernie/.gitignore new file mode 100644 index 0000000..df9efad --- /dev/null +++ b/baselines/models/ernie/.gitignore @@ -0,0 +1,116 @@ +# Initially taken from Github's Python gitignore file + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/baselines/models/ernie/CONTRIBUTING.md b/baselines/models/ernie/CONTRIBUTING.md
new file mode 100644
index 0000000..124b4b3
--- /dev/null
+++ b/baselines/models/ernie/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/baselines/models/ernie/LICENSE b/baselines/models/ernie/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/baselines/models/ernie/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/baselines/models/ernie/__init__.py b/baselines/models/ernie/__init__.py
new file mode 100644
index 0000000..effb57b
--- /dev/null
+++ b/baselines/models/ernie/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/baselines/models/ernie/conlleval.py b/baselines/models/ernie/conlleval.py
new file mode 100644
index 0000000..8a8a75d
--- /dev/null
+++ b/baselines/models/ernie/conlleval.py
@@ -0,0 +1,300 @@
+# Python version of the evaluation script from CoNLL'00-
+# Originates from: https://github.com/spyysalo/conlleval.py
+
+
+# Intentional differences:
+# - accept any space as delimiter by default
+# - optional file argument (default STDIN)
+# - option to set boundary (-b argument)
+# - LaTeX output (-l argument) not supported
+# - raw tags (-r argument) not supported
+
+# added function evaluate(predicted_label, ori_label), which does not read from a file
+
+import sys
+import re
+import codecs
+from collections import defaultdict, namedtuple
+
+ANY_SPACE = '<SPACE>'
+
+
+class FormatError(Exception):
+    pass
+
+Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
+
+
+class EvalCounts(object):
+    def __init__(self):
+        self.correct_chunk = 0    # number of correctly identified chunks
+        self.correct_tags = 0     # number of correct chunk tags
+        self.found_correct = 0    # number of chunks in corpus
+        self.found_guessed = 0    # number of identified chunks
+        self.token_counter = 0    # token counter (ignores sentence breaks)
+
+        # counts by type
+        self.t_correct_chunk = defaultdict(int)
+        self.t_found_correct = defaultdict(int)
+        self.t_found_guessed = defaultdict(int)
+
+
+def parse_args(argv):
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='evaluate tagging results using CoNLL criteria',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    arg = parser.add_argument
+    arg('-b', '--boundary', metavar='STR', default='-X-',
+        help='sentence boundary')
+    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
+        help='character delimiting items in input')
+    arg('-o', '--otag', metavar='CHAR', default='O',
+        help='alternative outside tag')
+    arg('file', nargs='?', default=None)
+    return parser.parse_args(argv)
+
+
+def parse_tag(t):
+    m = re.match(r'^([^-]*)-(.*)$', t)
+    return m.groups() if m else (t, '')
+
+
+def evaluate(iterable, options=None):
+    if options is None:
+        options = parse_args([])    # use defaults
+
+    counts = EvalCounts()
+    num_features = None       # number of features per line
+    in_correct = False        # whether the currently processed chunk is correct so far
+    last_correct = 'O'        # previous chunk tag in corpus
+    last_correct_type = ''    # type of previous chunk tag in corpus
+    last_guessed = 'O'        # previously identified chunk tag
+    last_guessed_type = ''    # type of previously identified chunk tag
+
+    for line in iterable:
+        line = line.rstrip('\r\n')
+
+        if options.delimiter == ANY_SPACE:
+            features = line.split()
+        else:
+            features = line.split(options.delimiter)
+
+        if
num_features is None: + num_features = len(features) + elif num_features != len(features) and len(features) != 0: + raise FormatError('unexpected number of features: %d (%d)' % + (len(features), num_features)) + + if len(features) == 0 or features[0] == options.boundary: + features = [options.boundary, 'O', 'O'] + if len(features) < 3: + raise FormatError('unexpected number of features in line %s' % line) + + guessed, guessed_type = parse_tag(features.pop()) + correct, correct_type = parse_tag(features.pop()) + first_item = features.pop(0) + + if first_item == options.boundary: + guessed = 'O' + + end_correct = end_of_chunk(last_correct, correct, + last_correct_type, correct_type) + end_guessed = end_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + start_correct = start_of_chunk(last_correct, correct, + last_correct_type, correct_type) + start_guessed = start_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + + if in_correct: + if (end_correct and end_guessed and + last_guessed_type == last_correct_type): + in_correct = False + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + elif (end_correct != end_guessed or guessed_type != correct_type): + in_correct = False + + if start_correct and start_guessed and guessed_type == correct_type: + in_correct = True + + if start_correct: + counts.found_correct += 1 + counts.t_found_correct[correct_type] += 1 + if start_guessed: + counts.found_guessed += 1 + counts.t_found_guessed[guessed_type] += 1 + if first_item != options.boundary: + if correct == guessed and guessed_type == correct_type: + counts.correct_tags += 1 + counts.token_counter += 1 + + last_guessed = guessed + last_correct = correct + last_guessed_type = guessed_type + last_correct_type = correct_type + + if in_correct: + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % (100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, 
by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/ernie/create_pretraining_data.py b/baselines/models/ernie/create_pretraining_data.py new file mode 100644 index 0000000..5340d96 --- /dev/null +++ b/baselines/models/ernie/create_pretraining_data.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == 
max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
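+  #
+  # An illustrative example input (hypothetical, not shipped with this patch):
+  #
+  #   The cat sat on the mat.
+  #   It then fell asleep in the sun.
+  #
+  #   A second document starts after the blank line above.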
+ for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
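+          # ("Putting them back" just rewinds the cursor `i`; the unused
+          # segments are revisited on a later iteration of the enclosing
+          # while loop.)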
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
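+    # (For example: with num_to_predict=20 and 19 predictions already made,
+    # a word split into 3 wordpieces is skipped entirely rather than
+    # partially masked.)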
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/baselines/models/ernie/extract_features.py b/baselines/models/ernie/extract_features.py new file mode 100644 index 0000000..60e3830 --- /dev/null +++ b/baselines/models/ernie/extract_features.py @@ -0,0 +1,419 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract pre-computed feature vectors from BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import codecs +import collections +import json +import re + +import modeling +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "") + +flags.DEFINE_string("output_file", None, "") + +flags.DEFINE_string("layers", "-1,-2,-3,-4", "") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("master", None, + "If using a TPU, the address of the master.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_one_hot_embeddings", False, + "If True, tf.one_hot will be used for embedding lookups, otherwise " + "tf.nn.embedding_lookup will be used. On TPUs, this should be True " + "since it is much faster.") + + +class InputExample(object): + + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids + + +def input_fn_builder(features, seq_length): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] + + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
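+    # (Here every feature is materialized as a tf.constant in the graph, so
+    # memory grows with the number of examples; a TFRecord-based pipeline
+    # would stream from disk instead.)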
+ d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) + + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d + + return input_fn + + +def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] + + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, + initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + all_layers = model.get_all_encoder_layers() + + predictions = { + "unique_id": unique_ids, + } + + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +def convert_examples_to_features(examples, seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. 
The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
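+  # (Worked example: with max_length=8, lengths (7, 3) become (5, 3) --
+  # only the longer side loses tokens, one at a time.)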
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def read_examples(input_file): + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + examples = read_examples(FLAGS.input_file) + + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
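+  # (With use_tpu=False, TPUEstimator behaves like a plain Estimator and, as
+  # far as this script is concerned, params["batch_size"] is taken from
+  # predict_batch_size below.)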
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      predict_batch_size=FLAGS.batch_size)
+
+  input_fn = input_fn_builder(
+      features=features, seq_length=FLAGS.max_seq_length)
+
+  with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file,
+                                               "w")) as writer:
+    for result in estimator.predict(input_fn, yield_single_examples=True):
+      unique_id = int(result["unique_id"])
+      feature = unique_id_to_feature[unique_id]
+      output_json = collections.OrderedDict()
+      output_json["linex_index"] = unique_id
+      all_features = []
+      for (i, token) in enumerate(feature.tokens):
+        all_layers = []
+        for (j, layer_index) in enumerate(layer_indexes):
+          layer_output = result["layer_output_%d" % j]
+          layers = collections.OrderedDict()
+          layers["index"] = layer_index
+          layers["values"] = [
+              round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+          ]
+          all_layers.append(layers)
+        features = collections.OrderedDict()
+        features["token"] = token
+        features["layers"] = all_layers
+        all_features.append(features)
+      output_json["features"] = all_features
+      writer.write(json.dumps(output_json) + "\n")
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("input_file")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("init_checkpoint")
+  flags.mark_flag_as_required("output_file")
+  tf.app.run()
diff --git a/baselines/models/ernie/modeling.py b/baselines/models/ernie/modeling.py
new file mode 100644
index 0000000..fed5259
--- /dev/null
+++ b/baselines/models/ernie/modeling.py
@@ -0,0 +1,986 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""The main BERT model and related functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import copy
+import json
+import math
+import re
+import numpy as np
+import six
+import tensorflow as tf
+
+
+class BertConfig(object):
+  """Configuration for `BertModel`."""
+
+  def __init__(self,
+               vocab_size,
+               hidden_size=768,
+               num_hidden_layers=12,
+               num_attention_heads=12,
+               intermediate_size=3072,
+               hidden_act="gelu",
+               hidden_dropout_prob=0.1,
+               attention_probs_dropout_prob=0.1,
+               max_position_embeddings=512,
+               type_vocab_size=16,
+               initializer_range=0.02):
+    """Constructs BertConfig.
+
+    Args:
+      vocab_size: Vocabulary size of `input_ids` in `BertModel`.
+      hidden_size: Size of the encoder layers and the pooler layer.
+      num_hidden_layers: Number of hidden layers in the Transformer encoder.
+      num_attention_heads: Number of attention heads for each attention layer in
+        the Transformer encoder.
+      intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+        layer in the Transformer encoder.
+      hidden_act: The non-linear activation function (function or string) in the
+        encoder and pooler.
+ hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. 
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. 
+ """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
+    initializer_range: float. Range of the weight initialization.
+    max_position_embeddings: int. Maximum sequence length that might ever be
+      used with this model. This can be longer than the sequence length of
+      input_tensor, but cannot be shorter.
+    dropout_prob: float. Dropout probability applied to the final output tensor.
+
+  Returns:
+    float tensor with same shape as `input_tensor`.
+
+  Raises:
+    ValueError: One of the tensor shapes or input values is invalid.
+  """
+  input_shape = get_shape_list(input_tensor, expected_rank=3)
+  batch_size = input_shape[0]
+  seq_length = input_shape[1]
+  width = input_shape[2]
+
+  output = input_tensor
+
+  if use_token_type:
+    if token_type_ids is None:
+      raise ValueError("`token_type_ids` must be specified if "
+                       "`use_token_type` is True.")
+    token_type_table = tf.get_variable(
+        name=token_type_embedding_name,
+        shape=[token_type_vocab_size, width],
+        initializer=create_initializer(initializer_range))
+    # This vocab will be small so we always do one-hot here, since it is always
+    # faster for a small vocabulary.
+    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
+    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
+    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
+    token_type_embeddings = tf.reshape(token_type_embeddings,
+                                       [batch_size, seq_length, width])
+    output += token_type_embeddings
+
+  if use_position_embeddings:
+    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
+    with tf.control_dependencies([assert_op]):
+      full_position_embeddings = tf.get_variable(
+          name=position_embedding_name,
+          shape=[max_position_embeddings, width],
+          initializer=create_initializer(initializer_range))
+      # Since the position embedding table is a learned variable, we create it
+      # using a (long) sequence length `max_position_embeddings`. The actual
+      # sequence length might be shorter than this, for faster training of
+      # tasks that do not have long sequences.
+      #
+      # So `full_position_embeddings` is effectively an embedding table
+      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+      # perform a slice.
+      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
+                                     [seq_length, -1])
+      num_dims = len(output.shape.as_list())
+
+      # Only the last two dimensions are relevant (`seq_length` and `width`), so
+      # we broadcast among the first dimensions, which is typically just
+      # the batch size.
+      position_broadcast_shape = []
+      for _ in range(num_dims - 2):
+        position_broadcast_shape.append(1)
+      position_broadcast_shape.extend([seq_length, width])
+      position_embeddings = tf.reshape(position_embeddings,
+                                       position_broadcast_shape)
+      output += position_embeddings
+
+  output = layer_norm_and_dropout(output, dropout_prob)
+  return output
+
+
+def create_attention_mask_from_input_mask(from_tensor, to_mask):
+  """Create 3D attention mask from a 2D tensor mask.
+
+  Args:
+    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
+    to_mask: int32 Tensor of shape [batch_size, to_seq_length].
+
+  Returns:
+    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
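+
+  For example (illustrative values): `to_mask = [[1, 1, 0]]` with a
+  `from_seq_length` of 3 yields
+  `[[[1., 1., 0.], [1., 1., 0.], [1., 1., 0.]]]`.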
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
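+    # Illustrative effect: a masked position's logit becomes roughly
+    # (score - 10000.0), so after the softmax its attention probability is
+    # effectively zero, while unmasked positions are unchanged.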
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
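+
+  Example usage (an illustrative sketch; the input tensor names are assumed,
+  and the input width must already equal `hidden_size`):
+
+  ```python
+  # `embedding_output` = [batch_size, seq_length, 768]
+  all_layers = transformer_model(
+      input_tensor=embedding_output,
+      attention_mask=attention_mask,
+      hidden_size=768,
+      num_hidden_layers=12,
+      num_attention_heads=12,
+      intermediate_size=3072,
+      do_return_all_layers=True)
+  sequence_output = all_layers[-1]
+  ```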
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, an exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if name is None:
+    name = tensor.name
+
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def reshape_to_matrix(input_tensor):
+  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+  ndims = input_tensor.shape.ndims
+  if ndims < 2:
+    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                     (input_tensor.shape))
+  if ndims == 2:
+    return input_tensor
+
+  width = input_tensor.shape[-1]
+  output_tensor = tf.reshape(input_tensor, [-1, width])
+  return output_tensor
+
+
+def reshape_from_matrix(output_tensor, orig_shape_list):
+  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+  if len(orig_shape_list) == 2:
+    return output_tensor
+
+  output_shape = get_shape_list(output_tensor)
+
+  orig_dims = orig_shape_list[0:-1]
+  width = output_shape[-1]
+
+  return tf.reshape(output_tensor, orig_dims + [width])
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  if name is None:
+    name = tensor.name
+
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    scope_name = tf.get_variable_scope().name
+    raise ValueError(
+        "For the tensor `%s` in scope `%s`, the actual rank "
+        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
diff --git a/baselines/models/ernie/modeling_test.py b/baselines/models/ernie/modeling_test.py
new file mode 100644
index 0000000..817ad2d
--- /dev/null
+++ b/baselines/models/ernie/modeling_test.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
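+"""Tests for modeling.py (BertModel and its helper functions)."""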
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +import modeling +import six +import tensorflow as tf + + +class BertModelTest(tf.test.TestCase): + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) + + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def run_tester(self, tester): + with 
self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) + + self.assert_all_tensors_reachable(sess, [init_op, ops]) + + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph + + ignore_strings = [ + "^.*/assert_less_equal/.*$", + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] + + ignore_regexes = [re.compile(x) for x in ignore_strings] + + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(x.name) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable + + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) + + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) + + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) + + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) + + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True + + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) + + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) + + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) + + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops + + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif 
isinstance(item, tuple):
+      output.extend(list(item))
+    elif isinstance(item, dict):
+      for (_, v) in six.iteritems(item):
+        output.append(v)
+    else:
+      return [item]
+
+    flat_output = []
+    for x in output:
+      flat_output.extend(cls.flatten_recursive(x))
+    return flat_output
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/ernie/multilingual.md b/baselines/models/ernie/multilingual.md
new file mode 100644
index 0000000..3b38379
--- /dev/null
+++ b/baselines/models/ernie/multilingual.md
@@ -0,0 +1,303 @@
+## Models
+
+There are two multilingual models currently available. We do not plan to release
+more single-language models, but we may release `BERT-Large` versions of these
+two in the future:
+
+*   **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
+    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
+    Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
+    parameters
+
+**The `Multilingual Cased (New)` model also fixes normalization issues in many
+languages, so it is recommended in languages with non-Latin alphabets (and is
+often better for most languages with Latin alphabets). When using this model,
+make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
+scripts.**
+
+See the [list of languages](#list-of-languages) that the Multilingual model
+supports. The Multilingual model does include Chinese (and English), but if your
+fine-tuning data is Chinese-only, then the Chinese model will likely produce
+better results.
+
+## Results
+
+To evaluate these systems, we use the
+[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a
+version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the
+dev and test sets have been translated (by humans) into 15 languages. Note that
+the training set was *machine* translated (we used the translations provided by
+XNLI, not Google NMT). For clarity, we only report on 6 languages below:
+
+| System                          | English  | Chinese  | Spanish  | German   | Arabic   | Urdu     |
+| ------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
+| XNLI Baseline - Translate Train | 73.7     | 67.0     | 68.8     | 66.5     | 65.8     | 56.6     |
+| XNLI Baseline - Translate Test  | 73.7     | 68.3     | 70.7     | 68.7     | 66.8     | 59.3     |
+| BERT - Translate Train Cased    | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6     |
+| BERT - Translate Train Uncased  | 81.4     | 74.2     | 77.3     | 75.2     | 70.5     | 61.7     |
+| BERT - Translate Test Uncased   | 81.4     | 70.1     | 74.9     | 74.4     | 70.4     | **62.1** |
+| BERT - Zero Shot Uncased        | 81.4     | 63.8     | 74.3     | 70.5     | 62.1     | 58.3     |
+
+The first two rows are baselines from the XNLI paper and the last four rows are
+our results with BERT.
+
+**Translate Train** means that the MultiNLI training set was machine translated
+from English into the foreign language. So training and evaluation were both
+done in the foreign language. Unfortunately, training was done on
+machine-translated data, so it is impossible to quantify how much of the lower
+accuracy (compared to English) is due to the quality of the machine translation
+vs. the quality of the pre-trained model.
+
+**Translate Test** means that the XNLI test set was machine translated from the
+foreign language into English. So training and evaluation were both done on
+English. However, test evaluation was done on machine-translated English, so the
+accuracy depends on the quality of the machine translation system.
+
+**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
+MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
+machine translation was not involved at all in either the pre-training or
+fine-tuning.
+
+Note that the English result is worse than the 84.2 MultiNLI baseline because
+this training used Multilingual BERT rather than English-only BERT. This implies
+that for high-resource languages, the Multilingual model is somewhat worse than
+a single-language model. However, it is not feasible for us to train and
+maintain dozens of single-language models. Therefore, if your goal is to maximize
+performance with a language other than English or Chinese, you might find it
+beneficial to run pre-training for additional steps starting from our
+Multilingual model on data from your language of interest.
+
+Here is a comparison of training Chinese models with the Multilingual
+`BERT-Base` and Chinese-only `BERT-Base`:
+
+System                  | Chinese
+----------------------- | -------
+XNLI Baseline           | 67.0
+BERT Multilingual Model | 74.2
+BERT Chinese-only Model | 77.2
+
+Similar to English, the single-language model does 3% better than the
+Multilingual model.
+
+## Fine-tuning Example
+
+The multilingual model does **not** require any special consideration or API
+changes. We did update the implementation of `BasicTokenizer` in
+`tokenization.py` to support Chinese character tokenization, so please update if
+you forked it. However, we did not change the tokenization API.
+
+To test the new models, we did modify `run_classifier.py` to add support for the
+[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language
+version of MultiNLI where the dev/test sets have been human-translated, and the
+training set has been machine-translated.
+
+To run the fine-tuning code, please download the
+[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
+[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
+and then unpack both .zip files into some directory `$XNLI_DIR`.
+
+To run fine-tuning on XNLI, use the command below. The language is hard-coded
+into `run_classifier.py` (Chinese by default), so please modify `XnliProcessor`
+if you want to run on another language.
+
+This is a large dataset, so this training will take a few hours on a GPU
+(or about 30 minutes on a Cloud TPU). To run an experiment quickly for
+debugging, just set `num_train_epochs` to a small value like `0.1`.
+
+```shell
+export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
+export XNLI_DIR=/path/to/xnli
+
+python run_classifier.py \
+  --task_name=XNLI \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$XNLI_DIR \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=5e-5 \
+  --num_train_epochs=2.0 \
+  --output_dir=/tmp/xnli_output/
+```
+
+With the Chinese-only model, the results should look something like this:
+
+```
+ ***** Eval results *****
+eval_accuracy = 0.774116
+eval_loss = 0.83554
+global_step = 24543
+loss = 0.74603
+```
+
+## Details
+
+### Data Source and Sampling
+
+The languages chosen were the
+[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
+The entire Wikipedia dump for each language (excluding user and talk pages) was
+taken as the training data for each language.
+
+However, the size of the Wikipedia for a given language varies greatly, and
+therefore low-resource languages may be "under-represented" in terms of the
+neural network model (under the assumption that languages are "competing" for
+limited model capacity to some extent). At the same time, we also don't want
+to overfit the model by performing thousands of epochs over a tiny Wikipedia
+for a particular language.
+
+To balance these two factors, we performed exponentially smoothed weighting of
+the data during pre-training data creation (and WordPiece vocab creation). In
+other words, let's say that the probability of a language is *P(L)*, e.g.,
+*P(English) = 0.21* means that after concatenating all of the Wikipedias
+together, 21% of our data is English. We exponentiate each probability by some
+factor *S* and then re-normalize, and sample from that distribution. In our case
+we use *S=0.7*. So, high-resource languages like English will be under-sampled,
+and low-resource languages like Icelandic will be over-sampled. E.g., in the
+original distribution English would be sampled 1000x more than Icelandic, but
+after smoothing it's only sampled 100x more.
+
+### Tokenization
+
+For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are
+weighted the same way as the data, so low-resource languages are upweighted by
+some factor. We intentionally do *not* use any marker to denote the input
+language (so that zero-shot training can work).
+
+Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
+characters, we add spaces around every character in the
+[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
+before applying WordPiece. This means that Chinese is effectively
+character-tokenized. Note that the CJK Unicode block only includes
+Chinese-origin characters and does *not* include Hangul Korean or
+Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
+all other languages.
+
+For all other languages, we apply the
+[same recipe as English](https://github.com/google-research/bert#tokenization):
+(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
+tokenization. We understand that accent markers have substantial meaning in some
+languages, but felt that the benefits of reducing the effective vocabulary make
+up for this.
Generally the strong contextual models of BERT should make up for +any ambiguity introduced by stripping accent markers. + +### List of Languages + +The multilingual model supports the following languages. These languages were +chosen because they are the top 100 languages with the largest Wikipedias: + +* Afrikaans +* Albanian +* Arabic +* Aragonese +* Armenian +* Asturian +* Azerbaijani +* Bashkir +* Basque +* Bavarian +* Belarusian +* Bengali +* Bishnupriya Manipuri +* Bosnian +* Breton +* Bulgarian +* Burmese +* Catalan +* Cebuano +* Chechen +* Chinese (Simplified) +* Chinese (Traditional) +* Chuvash +* Croatian +* Czech +* Danish +* Dutch +* English +* Estonian +* Finnish +* French +* Galician +* Georgian +* German +* Greek +* Gujarati +* Haitian +* Hebrew +* Hindi +* Hungarian +* Icelandic +* Ido +* Indonesian +* Irish +* Italian +* Japanese +* Javanese +* Kannada +* Kazakh +* Kirghiz +* Korean +* Latin +* Latvian +* Lithuanian +* Lombard +* Low Saxon +* Luxembourgish +* Macedonian +* Malagasy +* Malay +* Malayalam +* Marathi +* Minangkabau +* Nepali +* Newar +* Norwegian (Bokmal) +* Norwegian (Nynorsk) +* Occitan +* Persian (Farsi) +* Piedmontese +* Polish +* Portuguese +* Punjabi +* Romanian +* Russian +* Scots +* Serbian +* Serbo-Croatian +* Sicilian +* Slovak +* Slovenian +* South Azerbaijani +* Spanish +* Sundanese +* Swahili +* Swedish +* Tagalog +* Tajik +* Tamil +* Tatar +* Telugu +* Turkish +* Ukrainian +* Urdu +* Uzbek +* Vietnamese +* Volapük +* Waray-Waray +* Welsh +* West Frisian +* Western Punjabi +* Yoruba + +The **Multilingual Cased (New)** release contains additionally **Thai** and +**Mongolian**, which were not included in the original release. diff --git a/baselines/models/ernie/optimization.py b/baselines/models/ernie/optimization.py new file mode 100644 index 0000000..d33dabd --- /dev/null +++ b/baselines/models/ernie/optimization.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
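+  # Illustrative numbers (not from this repo's configs): with init_lr=5e-5 and
+  # num_warmup_steps=1000, the effective rate at step 100 is 5e-6, rising
+  # linearly toward 5e-5 by step 1000, after which the linear decay above
+  # takes over.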
+
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
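+      #
+      # This "decoupled" decay, applied directly to the update rather than
+      # folded into the gradient, is the same idea popularized as AdamW
+      # (Loshchilov & Hutter, "Decoupled Weight Decay Regularization").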
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/ernie/optimization_test.py b/baselines/models/ernie/optimization_test.py new file mode 100644 index 0000000..4f2dcf1 --- /dev/null +++ b/baselines/models/ernie/optimization_test.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import optimization +import tensorflow as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/ernie/predicting_movie_reviews_with_bert_on_tf_hub.ipynb b/baselines/models/ernie/predicting_movie_reviews_with_bert_on_tf_hub.ipynb new file mode 100644 index 0000000..466857f --- /dev/null +++ b/baselines/models/ernie/predicting_movie_reviews_with_bert_on_tf_hub.ipynb @@ -0,0 +1,1231 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "j0a4mTk9o1Qg", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Copyright 2019 Google Inc.\n", + "\n", + "# Licensed under the Apache License, Version 2.0 
(the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dCpvgG0vwXAZ", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Predicting Movie Review Sentiment with BERT on TF Hub" + ] + }, + { + "metadata": { + "id": "xiYrZKaHwV81", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n", + "\n", + "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n", + "\n", + "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!" + ] + }, + { + "metadata": { + "id": "hsZvic2YxnTz", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "from datetime import datetime" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "cp5wfXDx5SPH", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "In addition to the standard libraries we imported above, we'll need to install BERT's python package." 
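+        ,
+        "\n",
+        "\n",
+        "The `bert-tensorflow` package provides the original TensorFlow BERT code, including the `run_classifier`, `optimization`, and `tokenization` modules imported in the next cell."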
+ ] + }, + { + "metadata": { + "id": "jviywGyWyKsA", + "colab_type": "code", + "outputId": "166f3005-d219-404f-b201-2a0b75480360", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + } + }, + "cell_type": "code", + "source": [ + "!pip install bert-tensorflow" + ], + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.6/dist-packages (1.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.11.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "hhbGEfwgdEtw", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import bert\n", + "from bert import run_classifier\n", + "from bert import optimization\n", + "from bert import tokenization" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "KVB3eOcjxxm1", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n", + "\n", + "Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n", + "\n", + "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)." + ] + }, + { + "metadata": { + "id": "US_EAnICvP7f", + "colab_type": "code", + "outputId": "7780a032-31d4-4794-e6aa-664a5d2ae7dd", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "# Set the output directory for saving model file\n", + "# Optionally, set a GCP bucket location\n", + "\n", + "OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:\"string\"}\n", + "#@markdown Whether or not to clear/delete the directory and create a new one\n", + "DO_DELETE = False #@param {type:\"boolean\"}\n", + "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n", + "USE_BUCKET = True #@param {type:\"boolean\"}\n", + "BUCKET = 'BUCKET_NAME' #@param {type:\"string\"}\n", + "\n", + "if USE_BUCKET:\n", + " OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + "\n", + "if DO_DELETE:\n", + " try:\n", + " tf.gfile.DeleteRecursively(OUTPUT_DIR)\n", + " except:\n", + " # Doesn't matter if the directory didn't exist\n", + " pass\n", + "tf.gfile.MakeDirs(OUTPUT_DIR)\n", + "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "***** Model output directory: gs://bert-tfhub/aclImdb_v1 *****\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "pmFYvkylMwXn", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Data" + ] + }, + { + "metadata": { + "id": "MC_w8SRqN0fr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "First, let's download the dataset, hosted by Stanford. 
The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)." + ] + }, + { + "metadata": { + "id": "fom_ff20gyy6", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from tensorflow import keras\n", + "import os\n", + "import re\n", + "\n", + "# Load all files from a directory in a DataFrame.\n", + "def load_directory_data(directory):\n", + " data = {}\n", + " data[\"sentence\"] = []\n", + " data[\"sentiment\"] = []\n", + " for file_path in os.listdir(directory):\n", + " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", + " data[\"sentence\"].append(f.read())\n", + " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", + " return pd.DataFrame.from_dict(data)\n", + "\n", + "# Merge positive and negative examples, add a polarity column and shuffle.\n", + "def load_dataset(directory):\n", + " pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n", + " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", + " pos_df[\"polarity\"] = 1\n", + " neg_df[\"polarity\"] = 0\n", + " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", + "\n", + "# Download and process the dataset files.\n", + "def download_and_load_datasets(force_download=False):\n", + " dataset = tf.keras.utils.get_file(\n", + " fname=\"aclImdb.tar.gz\", \n", + " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", + " extract=True)\n", + " \n", + " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"train\"))\n", + " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"test\"))\n", + " \n", + " return train_df, test_df\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "2abfwdn-g135", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "train, test = download_and_load_datasets()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "XA8WHJgzhIZf", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "To keep training fast, we'll take a sample of 5000 train and test examples, respectively." 
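+        ,
+        "\n",
+        "\n",
+        "(The full IMDB set has 25,000 labeled reviews per split, so training on everything would take roughly five times longer and typically yields somewhat higher accuracy.)"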
+ ] + }, + { + "metadata": { + "id": "lw_F488eixTV", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "train = train.sample(5000)\n", + "test = test.sample(5000)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "prRQM8pDi8xI", + "colab_type": "code", + "outputId": "34445cb8-2be0-4379-fdbc-7794091f6049", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "train.columns" + ], + "execution_count": 44, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Index(['sentence', 'sentiment', 'polarity'], dtype='object')" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 44 + } + ] + }, + { + "metadata": { + "id": "sfRnHSz3iSXz", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respecitvely)" + ] + }, + { + "metadata": { + "id": "IuMOGwFui4it", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "DATA_COLUMN = 'sentence'\n", + "LABEL_COLUMN = 'polarity'\n", + "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n", + "label_list = [0, 1]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "V399W0rqNJ-Z", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Data Preprocessing\n", + "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n", + "\n", + "- `text_a` is the text we want to classify, which in this case, is the `Request` field in our Dataframe. \n", + "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n", + "- `label` is the label for our example, i.e. True, False" + ] + }, + { + "metadata": { + "id": "p9gEt5SmM6i6", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n", + "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n", + " text_a = x[DATA_COLUMN], \n", + " text_b = None, \n", + " label = x[LABEL_COLUMN]), axis = 1)\n", + "\n", + "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n", + " text_a = x[DATA_COLUMN], \n", + " text_b = None, \n", + " label = x[LABEL_COLUMN]), axis = 1)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "SCZWZtKxObjh", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n", + "\n", + "\n", + "1. Lowercase our text (if we're using a BERT lowercase model)\n", + "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n", + "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n", + "4. Map our words to indexes using a vocab file that BERT provides\n", + "5. 
Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n", + "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n", + "\n", + "Happily, we don't have to worry about most of these details.\n", + "\n", + "\n" + ] + }, + { + "metadata": { + "id": "qMWiDtpyQSoU", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:" + ] + }, + { + "metadata": { + "id": "IhJSe0QHNG7U", + "colab_type": "code", + "outputId": "20b28cc7-3cb3-4ce6-bfff-a7847ce3bbaa", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "# This is a path to an uncased (all lowercase) version of BERT\n", + "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n", + "\n", + "def create_tokenizer_from_hub_module():\n", + " \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n", + " with tf.Graph().as_default():\n", + " bert_module = hub.Module(BERT_MODEL_HUB)\n", + " tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n", + " with tf.Session() as sess:\n", + " vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n", + " tokenization_info[\"do_lower_case\"]])\n", + " \n", + " return bert.tokenization.FullTokenizer(\n", + " vocab_file=vocab_file, do_lower_case=do_lower_case)\n", + "\n", + "tokenizer = create_tokenizer_from_hub_module()" + ], + "execution_count": 47, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "z4oFkhpZBDKm", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Great--we just learned that the BERT model we're using expects lowercase data (that's what stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:" + ] + }, + { + "metadata": { + "id": "dsBo6RCtQmwx", + "colab_type": "code", + "outputId": "9af8c917-90ec-4fe9-897b-79dc89ca88e1", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + } + }, + "cell_type": "code", + "source": [ + "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")" + ], + "execution_count": 48, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "['this',\n", + " 'here',\n", + " \"'\",\n", + " 's',\n", + " 'an',\n", + " 'example',\n", + " 'of',\n", + " 'using',\n", + " 'the',\n", + " 'bert',\n", + " 'token',\n", + " '##izer']" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 48 + } + ] + }, + { + "metadata": { + "id": "0OEzfFIt6GIc", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands." 
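+        ,
+        "\n",
+        "\n",
+        "Each resulting `InputFeatures` holds four aligned lists, all padded or truncated to `MAX_SEQ_LENGTH`: `input_ids` (WordPiece vocabulary indexes, with `[CLS]` and `[SEP]` added), `input_mask` (1 for real tokens, 0 for padding), `segment_ids` (all 0 here, since each example has a single text segment), and the numeric label. You can see each field in the logging output below."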
+ ] + }, + { + "metadata": { + "id": "LL5W8gEGRTAf", + "colab_type": "code", + "outputId": "65001dda-155b-48fc-b5fc-1e4cabc8dfbf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1261 + } + }, + "cell_type": "code", + "source": [ + "# We'll set sequences to be at most 128 tokens long.\n", + "MAX_SEQ_LENGTH = 128\n", + "# Convert our train and test features to InputFeatures that BERT understands.\n", + "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" + ], + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i ' m watching this on the sci - fi channel right now . it ' s so horrible i can ' t stop watching it ! i ' m a video ##grapher and this movie makes me sad . i feel bad for anyone associated with this movie . some of the camera work is good . most is very questionable . there are a few decent actors in the flick . too bad they ' re surrounded by what must have been the director ' s relatives . that ' s the only way they could have been qualified to be in a movie ! music was a little better than the acting . if you get around to watching this i hope it [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 1005 1049 3666 2023 2006 1996 16596 1011 10882 3149 2157 2085 1012 2009 1005 1055 2061 9202 1045 2064 1005 1056 2644 3666 2009 999 1045 1005 1049 1037 2678 18657 1998 2023 3185 3084 2033 6517 1012 1045 2514 2919 2005 3087 3378 2007 2023 3185 1012 2070 1997 1996 4950 2147 2003 2204 1012 2087 2003 2200 21068 1012 2045 2024 1037 2261 11519 5889 1999 1996 17312 1012 2205 2919 2027 1005 2128 5129 2011 2054 2442 2031 2042 1996 2472 1005 1055 9064 1012 2008 1005 1055 1996 2069 2126 2027 2071 2031 2042 4591 2000 2022 1999 1037 3185 999 2189 2001 1037 2210 2488 2084 1996 3772 1012 2065 2017 2131 2105 2000 3666 2023 1045 3246 2009 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i have been a fan of pushing dai ##sies since the very beginning . it is wonderful ##ly thought up , and bryan fuller has the most remarkable ideas for this show . < br / > < br / > it is unbelievable on how much tv has been needing a creative , original show like pushing dai ##sies . it is a huge relief to see a show , that is unlike the rest , where as , if you compared it to some of the newer shows , such as scrub ##s and house , you would see the similarities , and it does get ted ##ious at moments to see shows so close in identity . 
< br / > < br [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2031 2042 1037 5470 1997 6183 18765 14625 2144 1996 2200 2927 1012 2009 2003 6919 2135 2245 2039 1010 1998 8527 12548 2038 1996 2087 9487 4784 2005 2023 2265 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 2003 23653 2006 2129 2172 2694 2038 2042 11303 1037 5541 1010 2434 2265 2066 6183 18765 14625 1012 2009 2003 1037 4121 4335 2000 2156 1037 2265 1010 2008 2003 4406 1996 2717 1010 2073 2004 1010 2065 2017 4102 2009 2000 2070 1997 1996 10947 3065 1010 2107 2004 18157 2015 1998 2160 1010 2017 2052 2156 1996 12319 1010 1998 2009 2515 2131 6945 6313 2012 5312 2000 2156 3065 2061 2485 1999 4767 1012 1026 7987 1013 1028 1026 7987 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] this movie starts out promising ##ly , with an early scene in which frank morgan advises against gary cooper ' s marriage to his daughter , anita louise . frank morgan , playing an una ##bas ##hed gold - digger , loudly complain ##s to cooper about his perceived pen ##ury at the hands of his family - including his daughter , anita louise . i am a fan of all 3 actors . frank morgan is ( to my mind ) a hollywood treasure , cooper a legend , and louise a very lovely , versatile and under - appreciated actress seldom seen in the leading role . i also have nothing against teresa wright , and while not blessed with great range , she [SEP]\n", + "INFO:tensorflow:input_ids: 101 2023 3185 4627 2041 10015 2135 1010 2007 2019 2220 3496 1999 2029 3581 5253 25453 2114 5639 6201 1005 1055 3510 2000 2010 2684 1010 12918 8227 1012 3581 5253 1010 2652 2019 14477 22083 9072 2751 1011 28661 1010 9928 17612 2015 2000 6201 2055 2010 8690 7279 13098 2012 1996 2398 1997 2010 2155 1011 2164 2010 2684 1010 12918 8227 1012 1045 2572 1037 5470 1997 2035 1017 5889 1012 3581 5253 2003 1006 2000 2026 2568 1007 1037 5365 8813 1010 6201 1037 5722 1010 1998 8227 1037 2200 8403 1010 22979 1998 2104 1011 12315 3883 15839 2464 1999 1996 2877 2535 1012 1045 2036 2031 2498 2114 12409 6119 1010 1998 2096 2025 10190 2007 2307 2846 1010 2016 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i was over ##taken by the emotion . 
un ##for ##get ##table rendering of a wartime story which is unknown to most people . the performances were fault ##less and outstanding . [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2001 2058 25310 2011 1996 7603 1012 4895 29278 18150 10880 14259 1997 1037 12498 2466 2029 2003 4242 2000 2087 2111 1012 1996 4616 2020 6346 3238 1998 5151 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] soldier blue is a movie with pre ##tension ##s : pre ##tension ##s to be some sort of profound statement on man ' s inhuman ##ity to man , on the white man ' s exploitation of and brutality towards indigenous peoples ; a biting , un ##fl ##in ##ching and sar ##don ##ic commentary on the horrors of vietnam . well , sorry , but it fails mis ##era ##bly to be any of those things . what soldier blue actually is is per ##nic ##ious , tri ##te , badly made , dish ##ones ##t rubbish . < br / > < br / > another reviewer here hit the nail on the head in saying that it appears to be a hybrid of [SEP]\n", + "INFO:tensorflow:input_ids: 101 5268 2630 2003 1037 3185 2007 3653 29048 2015 1024 3653 29048 2015 2000 2022 2070 4066 1997 13769 4861 2006 2158 1005 1055 29582 3012 2000 2158 1010 2006 1996 2317 2158 1005 1055 14427 1997 1998 24083 2875 6284 7243 1025 1037 12344 1010 4895 10258 2378 8450 1998 18906 5280 2594 8570 2006 1996 22812 1997 5148 1012 2092 1010 3374 1010 2021 2009 11896 28616 6906 6321 2000 2022 2151 1997 2216 2477 1012 2054 5268 2630 2941 2003 2003 2566 8713 6313 1010 13012 2618 1010 6649 2081 1010 9841 21821 2102 29132 1012 1026 7987 1013 1028 1026 7987 1013 1028 2178 12027 2182 2718 1996 13774 2006 1996 2132 1999 3038 2008 2009 3544 2000 2022 1037 8893 1997 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i just watched this today on tv . it was on abc ' s sunday afternoon movie . < br / > < br / > this wasn ' t a very good movie , but for a low budget independent film like this , it was okay . 
there is some suspense in it , but there are so many bad qualities that really bring the movie down . the script is pretty lame , and the plot elements aren ' t very realistic , such as the way a 911 operator would laugh and hang up when someone is reporting a murder . i don ' t know what the writer was thinking when they came up with that idea , but it isn [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2074 3427 2023 2651 2006 2694 1012 2009 2001 2006 5925 1005 1055 4465 5027 3185 1012 1026 7987 1013 1028 1026 7987 1013 1028 2023 2347 1005 1056 1037 2200 2204 3185 1010 2021 2005 1037 2659 5166 2981 2143 2066 2023 1010 2009 2001 3100 1012 2045 2003 2070 23873 1999 2009 1010 2021 2045 2024 2061 2116 2919 11647 2008 2428 3288 1996 3185 2091 1012 1996 5896 2003 3492 20342 1010 1998 1996 5436 3787 4995 1005 1056 2200 12689 1010 2107 2004 1996 2126 1037 19989 6872 2052 4756 1998 6865 2039 2043 2619 2003 7316 1037 4028 1012 1045 2123 1005 1056 2113 2054 1996 3213 2001 3241 2043 2027 2234 2039 2007 2008 2801 1010 2021 2009 3475 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] from hardly alien sounding lasers , to an elementary school style shuttle crash , \" night ##be ##ast \" is better classified as a far ##cic ##al mix of fake blood and bare chest . the almost pornographic style of the film seems to be a failed attempt to recover from a lack of co ##hesive or effective story . the acting however is not nearly as beast ##ly , many of the young , aspiring , actors ad ##mir ##ably showcase a hidden talent . particularly don lei ##fer ##t and jamie ze ##mare ##l , who shed a well needed sha ##rd of light on this otherwise terrible film . 
night ##be ##ast would have never shown up on set had he known the [SEP]\n", + "INFO:tensorflow:input_ids: 101 2013 6684 7344 9391 23965 1010 2000 2019 4732 2082 2806 10382 5823 1010 1000 2305 4783 14083 1000 2003 2488 6219 2004 1037 2521 19053 2389 4666 1997 8275 2668 1998 6436 3108 1012 1996 2471 26932 2806 1997 1996 2143 3849 2000 2022 1037 3478 3535 2000 8980 2013 1037 3768 1997 2522 21579 2030 4621 2466 1012 1996 3772 2174 2003 2025 3053 2004 6841 2135 1010 2116 1997 1996 2402 1010 22344 1010 5889 4748 14503 8231 13398 1037 5023 5848 1012 3391 2123 26947 7512 2102 1998 6175 27838 24376 2140 1010 2040 8328 1037 2092 2734 21146 4103 1997 2422 2006 2023 4728 6659 2143 1012 2305 4783 14083 2052 2031 2196 3491 2039 2006 2275 2018 2002 2124 1996 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] here we have the in ##imi ##table charlie chaplin for ##sa ##king his slap ##stick past to tackle the serious subject of anti - semi ##tism , and into ##ler ##ance in general . he portrays two characters - the sweet , innocent jewish barber - a war veteran , and the ravi ##ng and ruthless dictator , aden ##oid h ##yn ##kel . the jewish ghetto in this country is not safe for long , due to the w ##him ##s of h ##yn ##kel and his armed thugs , who routinely rough up its residents , or leave them alone , dependent upon his mood that day or week . 
the barber is among them , but is befriended by his former commanding officer [SEP]\n", + "INFO:tensorflow:input_ids: 101 2182 2057 2031 1996 1999 27605 10880 4918 23331 2005 3736 6834 2010 14308 21354 2627 2000 11147 1996 3809 3395 1997 3424 1011 4100 17456 1010 1998 2046 3917 6651 1999 2236 1012 2002 17509 2048 3494 1011 1996 4086 1010 7036 3644 13362 1011 1037 2162 8003 1010 1998 1996 16806 3070 1998 18101 21237 1010 16298 9314 1044 6038 11705 1012 1996 3644 17276 1999 2023 2406 2003 2025 3647 2005 2146 1010 2349 2000 1996 1059 14341 2015 1997 1044 6038 11705 1998 2010 4273 24106 1010 2040 19974 5931 2039 2049 3901 1010 2030 2681 2068 2894 1010 7790 2588 2010 6888 2008 2154 2030 2733 1012 1996 13362 2003 2426 2068 1010 2021 2003 23386 2011 2010 2280 7991 2961 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i really hated this movie and it ' s the first movie written by stephen king that i didn ' t finish . i was truly disappointed , it was the worst crap i ' ve ever seen . what were you thinking making three hours out of it ? it may have a quite good story , but actors ? no . suspense ? no . romance ? no . horror ? no . it didn ' t have anything . < br / > < br / > it ' s got this strange , crazy science man with einstein - hair , the classic thing . not real at all . and a man keep getting younger all the time . 
it seems [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2428 6283 2023 3185 1998 2009 1005 1055 1996 2034 3185 2517 2011 4459 2332 2008 1045 2134 1005 1056 3926 1012 1045 2001 5621 9364 1010 2009 2001 1996 5409 10231 1045 1005 2310 2412 2464 1012 2054 2020 2017 3241 2437 2093 2847 2041 1997 2009 1029 2009 2089 2031 1037 3243 2204 2466 1010 2021 5889 1029 2053 1012 23873 1029 2053 1012 7472 1029 2053 1012 5469 1029 2053 1012 2009 2134 1005 1056 2031 2505 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 1005 1055 2288 2023 4326 1010 4689 2671 2158 2007 15313 1011 2606 1010 1996 4438 2518 1012 2025 2613 2012 2035 1012 1998 1037 2158 2562 2893 3920 2035 1996 2051 1012 2009 3849 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] story chinese tall story tells the story of righteous monk trip ##ita ##ka , who , along with his guardians monkey , sandy and pigs ##y make their journey west on a quest to recover ancient sutra ##s , finally , they reach the final leg of their journey in sha ##che city but all is not as it seems when the city is attacked by evil tree demons . monkey tries his best to battle them but is overwhelmed , knowing his master is in grave danger , he uses his trust ##y golden staff to thrust trip ##ita ##ka to safety . 
< br / > < br / > the monk ends up being knocked out when he land and when he wakes [SEP]\n", + "INFO:tensorflow:input_ids: 101 2466 2822 4206 2466 4136 1996 2466 1997 19556 8284 4440 6590 2912 1010 2040 1010 2247 2007 2010 14240 10608 1010 7525 1998 14695 2100 2191 2037 4990 2225 2006 1037 8795 2000 8980 3418 26567 2015 1010 2633 1010 2027 3362 1996 2345 4190 1997 2037 4990 1999 21146 5403 2103 2021 2035 2003 2025 2004 2009 3849 2043 1996 2103 2003 4457 2011 4763 3392 7942 1012 10608 5363 2010 2190 2000 2645 2068 2021 2003 13394 1010 4209 2010 3040 2003 1999 6542 5473 1010 2002 3594 2010 3404 2100 3585 3095 2000 7400 4440 6590 2912 2000 3808 1012 1026 7987 1013 1028 1026 7987 1013 1028 1996 8284 4515 2039 2108 6573 2041 2043 2002 2455 1998 2043 2002 17507 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "ccp5trMwRtmr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Creating a model\n", + "\n", + "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)." 
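+        ,
+        "\n",
+        "\n",
+        "Note that `create_model` reads the hub module's `pooled_output` (a fixed-size summary vector for the whole sequence) rather than the per-token `sequence_output`, since we are classifying entire reviews; the new classification layer is just a single weight matrix and bias on top of that vector."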
+ ] + }, + { + "metadata": { + "id": "6o2a5ZIvRcJq", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n", + " num_labels):\n", + " \"\"\"Creates a classification model.\"\"\"\n", + "\n", + " bert_module = hub.Module(\n", + " BERT_MODEL_HUB,\n", + " trainable=True)\n", + " bert_inputs = dict(\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " segment_ids=segment_ids)\n", + " bert_outputs = bert_module(\n", + " inputs=bert_inputs,\n", + " signature=\"tokens\",\n", + " as_dict=True)\n", + "\n", + " # Use \"pooled_output\" for classification tasks on an entire sentence.\n", + " # Use \"sequence_outputs\" for token-level output.\n", + " output_layer = bert_outputs[\"pooled_output\"]\n", + "\n", + " hidden_size = output_layer.shape[-1].value\n", + "\n", + " # Create our own layer to tune for politeness data.\n", + " output_weights = tf.get_variable(\n", + " \"output_weights\", [num_labels, hidden_size],\n", + " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", + "\n", + " output_bias = tf.get_variable(\n", + " \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n", + "\n", + " with tf.variable_scope(\"loss\"):\n", + "\n", + " # Dropout helps prevent overfitting\n", + " output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n", + "\n", + " logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n", + " logits = tf.nn.bias_add(logits, output_bias)\n", + " log_probs = tf.nn.log_softmax(logits, axis=-1)\n", + "\n", + " # Convert labels into one-hot encoding\n", + " one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n", + "\n", + " predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n", + " # If we're predicting, we want predicted labels and the probabiltiies.\n", + " if is_predicting:\n", + " return (predicted_labels, log_probs)\n", + "\n", + " # If we're train/eval, compute loss between predicted and actual label\n", + " per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n", + " loss = tf.reduce_mean(per_example_loss)\n", + " return (loss, predicted_labels, log_probs)\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "qpE0ZIDOCQzE", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction." 
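+        ,
+        "\n",
+        "\n",
+        "The returned `model_fn` branches on the Estimator `mode`: for TRAIN it returns the loss and the optimizer's `train_op`, for EVAL it returns the loss plus the metric dictionary defined in `metric_fn`, and for PREDICT it skips the loss entirely and returns the predicted labels and log-probabilities."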
+ ] + }, + { + "metadata": { + "id": "FnH-AnOQ9KKW", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# model_fn_builder actually creates our model function\n", + "# using the passed parameters for num_labels, learning_rate, etc.\n", + "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", + " num_warmup_steps):\n", + " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", + " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", + " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", + "\n", + " input_ids = features[\"input_ids\"]\n", + " input_mask = features[\"input_mask\"]\n", + " segment_ids = features[\"segment_ids\"]\n", + " label_ids = features[\"label_ids\"]\n", + "\n", + " is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n", + " \n", + " # TRAIN and EVAL\n", + " if not is_predicting:\n", + "\n", + " (loss, predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " train_op = bert.optimization.create_optimizer(\n", + " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", + "\n", + " # Calculate evaluation metrics. \n", + " def metric_fn(label_ids, predicted_labels):\n", + " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", + " f1_score = tf.contrib.metrics.f1_score(\n", + " label_ids,\n", + " predicted_labels)\n", + " auc = tf.metrics.auc(\n", + " label_ids,\n", + " predicted_labels)\n", + " recall = tf.metrics.recall(\n", + " label_ids,\n", + " predicted_labels)\n", + " precision = tf.metrics.precision(\n", + " label_ids,\n", + " predicted_labels) \n", + " true_pos = tf.metrics.true_positives(\n", + " label_ids,\n", + " predicted_labels)\n", + " true_neg = tf.metrics.true_negatives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_pos = tf.metrics.false_positives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_neg = tf.metrics.false_negatives(\n", + " label_ids,\n", + " predicted_labels)\n", + " return {\n", + " \"eval_accuracy\": accuracy,\n", + " \"f1_score\": f1_score,\n", + " \"auc\": auc,\n", + " \"precision\": precision,\n", + " \"recall\": recall,\n", + " \"true_positives\": true_pos,\n", + " \"true_negatives\": true_neg,\n", + " \"false_positives\": false_pos,\n", + " \"false_negatives\": false_neg\n", + " }\n", + "\n", + " eval_metrics = metric_fn(label_ids, predicted_labels)\n", + "\n", + " if mode == tf.estimator.ModeKeys.TRAIN:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " train_op=train_op)\n", + " else:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " eval_metric_ops=eval_metrics)\n", + " else:\n", + " (predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " predictions = {\n", + " 'probabilities': log_probs,\n", + " 'labels': predicted_labels\n", + " }\n", + " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", + "\n", + " # Return the actual model function in the closure\n", + " return model_fn\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OjwJ4bTeWXD8", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute train and warmup steps from batch size\n", + "# These hyperparameters are copied from this colab notebook 
(https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n", + "BATCH_SIZE = 32\n", + "LEARNING_RATE = 2e-5\n", + "NUM_TRAIN_EPOCHS = 3.0\n", + "# Warmup is a period of time where hte learning rate \n", + "# is small and gradually increases--usually helps training.\n", + "WARMUP_PROPORTION = 0.1\n", + "# Model configs\n", + "SAVE_CHECKPOINTS_STEPS = 500\n", + "SAVE_SUMMARY_STEPS = 100" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "emHf9GhfWBZ_", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute # train and warmup steps from batch size\n", + "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n", + "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "oEJldMr3WYZa", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Specify outpit directory and number of checkpoint steps to save\n", + "run_config = tf.estimator.RunConfig(\n", + " model_dir=OUTPUT_DIR,\n", + " save_summary_steps=SAVE_SUMMARY_STEPS,\n", + " save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "q_WebpS1X97v", + "colab_type": "code", + "outputId": "1648932a-7391-49d3-8af7-52d514e226e8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + } + }, + "cell_type": "code", + "source": [ + "model_fn = model_fn_builder(\n", + " num_labels=len(label_list),\n", + " learning_rate=LEARNING_RATE,\n", + " num_train_steps=num_train_steps,\n", + " num_warmup_steps=num_warmup_steps)\n", + "\n", + "estimator = tf.estimator.Estimator(\n", + " model_fn=model_fn,\n", + " config=run_config,\n", + " params={\"batch_size\": BATCH_SIZE})\n" + ], + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Using config: {'_model_dir': 'gs://bert-tfhub/aclImdb_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", + "graph_options {\n", + " rewrite_options {\n", + " meta_optimizer_iterations: ONE\n", + " }\n", + "}\n", + ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "NOO3RfG1DYLo", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with Tensorflow [Estimators](https://www.tensorflow.org/guide/estimators)." + ] + }, + { + "metadata": { + "id": "1Pv2bAlOX_-K", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Create an input function for training. 
drop_remainder = True for using TPUs.\n", + "train_input_fn = bert.run_classifier.input_fn_builder(\n", + " features=train_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=True,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "t6Nukby2EB6-", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes." + ] + }, + { + "metadata": { + "id": "nucD4gluYJmK", + "colab_type": "code", + "outputId": "5d728e72-4631-42bf-c48d-3f51d4b968ce", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "cell_type": "code", + "source": [ + "print(f'Beginning Training!')\n", + "current_time = datetime.now()\n", + "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", + "print(\"Training took time \", datetime.now() - current_time)" + ], + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Beginning Training!\n", + "INFO:tensorflow:Skipping training since max_steps has already saved.\n", + "Training took time 0:00:00.759709\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "CmbLTVniARy3", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's use our test data to see how well our model did:" + ] + }, + { + "metadata": { + "id": "JIhejfpyJ8Bx", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "test_input_fn = run_classifier.input_fn_builder(\n", + " features=test_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=False,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "PPVEXhNjYXC-", + "colab_type": "code", + "outputId": "dd5482cd-c558-465f-c854-ec11a0175316", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 445 + } + }, + "cell_type": "code", + "source": [ + "estimator.evaluate(input_fn=test_input_fn, steps=None)" + ], + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Starting evaluation at 2019-02-12T21:04:20Z\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n", + "INFO:tensorflow:Finished evaluation at 2019-02-12-21:06:05\n", + "INFO:tensorflow:Saving dict for global step 468: auc = 0.86659324, eval_accuracy = 0.8664, f1_score = 0.8659711, false_negatives = 375.0, false_positives = 293.0, global_step = 468, loss = 0.51870537, precision = 0.880457, recall = 0.8519542, true_negatives = 2174.0, true_positives = 2158.0\n", + "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'auc': 0.86659324,\n", + " 'eval_accuracy': 0.8664,\n", + " 'f1_score': 0.8659711,\n", + " 'false_negatives': 375.0,\n", + " 'false_positives': 293.0,\n", + " 'global_step': 468,\n", + " 'loss': 0.51870537,\n", + " 'precision': 0.880457,\n", + " 'recall': 0.8519542,\n", + " 'true_negatives': 2174.0,\n", + " 'true_positives': 2158.0}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 59 + } + ] + }, + { + "metadata": { + "id": "ueKsULteiz1B", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's write code to make predictions on new sentences:" + ] + }, + { + "metadata": { + "id": "OsrbTD2EJTVl", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def getPrediction(in_sentences):\n", + " labels = [\"Negative\", \"Positive\"]\n", + " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", + " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", + " predictions = estimator.predict(predict_input_fn)\n", + " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "-thbodgih_VJ", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "pred_sentences = [\n", + " \"That movie was absolutely awful\",\n", + " \"The acting was a bit lacking\",\n", + " \"The film was creative and surprising\",\n", + " \"Absolutely fantastic!\"\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QrZmvZySKQTm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 649 + }, + "outputId": "3891fafb-a460-4eb8-fa6c-335a5bbc10e5" + }, + "cell_type": "code", + "source": [ + "predictions = getPrediction(pred_sentences)" + ], + "execution_count": 72, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 4\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] that movie was absolutely awful [SEP]\n", + "INFO:tensorflow:input_ids: 101 2008 3185 2001 7078 9643 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the acting was a bit lacking [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 3772 2001 1037 2978 11158 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the film was creative and surprising [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 2143 2001 5541 1998 11341 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] absolutely fantastic ! 
[SEP]\n", + "INFO:tensorflow:input_ids: 101 7078 10392 999 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "MXkRiEBUqN3n", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Voila! We have a sentiment classifier!" + ] + }, + { + "metadata": { + "id": "ERkTE8-7oQLZ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "outputId": "26c33224-dc2c-4b3d-f7b4-ac3ef0a58b27" + }, + "cell_type": "code", + "source": [ + "predictions" + ], + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('That movie was absolutely awful',\n", + " array([-4.9142293e-03, -5.3180690e+00], dtype=float32),\n", + " 'Negative'),\n", + " ('The acting was a bit lacking',\n", + " array([-0.03325794, -3.4200459 ], dtype=float32),\n", + " 'Negative'),\n", + " ('The film was creative and surprising',\n", + " array([-5.3589125e+00, -4.7171740e-03], dtype=float32),\n", + " 'Positive'),\n", + " ('Absolutely fantastic!',\n", + " array([-5.0434084 , -0.00647258], dtype=float32),\n", + " 'Positive')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 73 + } + ] + } + ] +} \ No newline at end of file diff --git a/baselines/models/ernie/requirements.txt b/baselines/models/ernie/requirements.txt new file mode 100644 index 0000000..357b5ea --- /dev/null +++ b/baselines/models/ernie/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 1.11.0 # CPU Version of TensorFlow. +# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. diff --git a/baselines/models/ernie/run_classifier.py b/baselines/models/ernie/run_classifier.py new file mode 100644 index 0000000..f8ac855 --- /dev/null +++ b/baselines/models/ernie/run_classifier.py @@ -0,0 +1,1578 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:50:33 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +# Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +# Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = f.readlines() + lines = [] + for line in reader: + lines.append(line.strip().split("_!_")) + return lines + +class THUCNewsProcessor(DataProcessor): + """Processor for the THUCNews data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return 
self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(14): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 3: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class iFLYTEKDataProcessor(DataProcessor): + """Processor for the iFLYTEKData data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class InewsProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = ["0", "1", "2"] + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
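+    # In the inews path, convert_example_list_for_inews (below) normally
+    # hands this function a tokens_b window that already fits, so this
+    # truncation only does real work when tokens_a alone nearly fills
+    # max_seq_length.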
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) -1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num+1)*extra_len, 
len(tokens_b))
+      tokens_b_sub = tokens_b[num*extra_len: max_len]
+      feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example)
+      feature_list.append(feature)
+  else:
+    feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example)
+    feature_list.append(feature)
+  return feature_list
+
+
+def file_based_convert_examples_to_features_for_inews(
+        examples, label_list, max_seq_length, tokenizer, output_file):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+  num_example = 0
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 1000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature_list = convert_example_list_for_inews(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+    num_example += len(feature_list)
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    # One serialized tf.Example is written per window of a long document.
+    for feature in feature_list:
+      features["input_ids"] = create_int_feature(feature.input_ids)
+      features["input_mask"] = create_int_feature(feature.input_mask)
+      features["segment_ids"] = create_int_feature(feature.segment_ids)
+      features["label_ids"] = create_int_feature([feature.label_id])
+      features["is_real_example"] = create_int_feature(
+          [int(feature.is_real_example)])
+
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+  tf.logging.info("feature num: %s", num_example)
+  writer.close()
+
+class TnewsProcessor(DataProcessor):
+  """Processor for the TNews (Toutiao news categorization) data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])
+      text_b = None
+      # The tnews test split also ships with labels, so they are read for
+      # every split.
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+class XnliProcessor(DataProcessor):
+  """Processor for the XNLI data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(
+        os.path.join(data_dir, "train.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "train-%d" % (i)
+      text_a = tokenization.convert_to_unicode(line[0])
+      text_b = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[2])
+      if label == tokenization.convert_to_unicode("contradictory"):
+        label = tokenization.convert_to_unicode("contradiction")
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "dev-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "test-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+
+class MnliProcessor(DataProcessor):
+  """Processor for the MultiNLI data set (GLUE version)."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+        "dev_matched")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+
examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class LCQMCProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class BQProcessor(DataProcessor): + """Processor for the internal data set. 
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
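+    # Worked example (hypothetical sizes): with max_seq_length=10 the budget
+    # for real tokens is 10 - 3 = 7; starting from len(tokens_a)=9 and
+    # len(tokens_b)=5, _truncate_seq_pair pops from whichever sequence is
+    # currently longer until the lengths are 4 and 3.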
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = 
create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
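+  # For reference: get_pooled_output() returns the transformed [CLS] vector,
+  # shape [batch_size, hidden_size], while get_sequence_output() returns
+  # shape [batch_size, seq_length, hidden_size] for per-token tasks.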
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, 
label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + "tnews": TnewsProcessor, + "inews": InewsProcessor, + "thucnews":THUCNewsProcessor, + "lcqmc": LCQMCProcessor, + "bq": BQProcessor, + "iflydata":iFLYTEKDataProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
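+  # With use_tpu=False, TPUEstimator degrades gracefully to ordinary
+  # CPU/GPU execution and the per-mode batch sizes below are used as-is.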
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + ## dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
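+    # For instance (hypothetical count): a padded dev set of 1,000 examples
+    # with eval_batch_size=8 gives eval_steps = 1000 // 8 = 125 full batches.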
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # evaluate all checkpoints; you can use the checkpoint with the best dev accuracy
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_ernie.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    # output_eval_file = os.path.join(FLAGS.output_dir, "dev_results_ernie.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+    ## test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    # Mirror the dev-set path above: inews needs its chunked feature writer.
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # evaluate all checkpoints; you can use the checkpoint with the best dev accuracy
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_ernie.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    #result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    #
+    #output_eval_file = os.path.join(FLAGS.output_dir, "test_results_ernie.txt")
+    # with tf.gfile.GFile(output_eval_file, "w") as writer:
+    #   tf.logging.info("***** Eval results *****")
+    #   for key in sorted(result.keys()):
+    #     tf.logging.info("  %s = %s", key, str(result[key]))
+    #     writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
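+      # For instance (hypothetical count): 1,003 test examples with
+      # predict_batch_size=8 would get 5 PaddingInputExamples appended
+      # (1,008 % 8 == 0); the padded rows are dropped again below via the
+      # num_actual_predict_examples check.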
+      while len(predict_examples) % FLAGS.predict_batch_size != 0:
+        predict_examples.append(PaddingInputExample())
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(predict_examples, label_list,
+                                                        FLAGS.max_seq_length, tokenizer,
+                                                        predict_file)
+    else:
+      file_based_convert_examples_to_features(predict_examples, label_list,
+                                              FLAGS.max_seq_length, tokenizer,
+                                              predict_file)
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(predict_examples), num_actual_predict_examples,
+                    len(predict_examples) - num_actual_predict_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_drop_remainder = True if FLAGS.use_tpu else False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      num_written_lines = 0
+      tf.logging.info("***** Predict results *****")
+      for (i, prediction) in enumerate(result):
+        probabilities = prediction["probabilities"]
+        if i >= num_actual_predict_examples:
+          break
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+        num_written_lines += 1
+    assert num_written_lines == num_actual_predict_examples
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/ernie/run_classifier_bq.sh b/baselines/models/ernie/run_classifier_bq.sh
new file mode 100644
index 0000000..3ea1b8a
--- /dev/null
+++ b/baselines/models/ernie/run_classifier_bq.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date:   2019-11-04 09:56:36
+# @Last Modified by:   bo.shi
+# @Last Modified time: 2019-11-10 15:45:48
+
+TASK_NAME="bq"
+MODEL_NAME="baidu_ernie"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets
+
+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/bq.zip
+  unzip bq.zip
+  rm bq.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $ERNIE_DIR ]; then
+  mkdir -p $ERNIE_DIR
+  echo "makedir $ERNIE_DIR"
+fi
+cd $ERNIE_DIR
+if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then
-f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_iflydata.sh b/baselines/models/ernie/run_classifier_iflydata.sh new file mode 100644 index 0000000..612ed80 --- /dev/null +++ b/baselines/models/ernie/run_classifier_iflydata.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:46:42 + +TASK_NAME="iflydata" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_inews.sh b/baselines/models/ernie/run_classifier_inews.sh new file mode 100644 index 0000000..5bf1413 --- /dev/null +++ b/baselines/models/ernie/run_classifier_inews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:47:12 + +TASK_NAME="inews" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_lcqmc.sh b/baselines/models/ernie/run_classifier_lcqmc.sh new file mode 100644 index 0000000..182260e --- /dev/null +++ b/baselines/models/ernie/run_classifier_lcqmc.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:47:43 + +TASK_NAME="lcqmc" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! 
-d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_thucnews.sh b/baselines/models/ernie/run_classifier_thucnews.sh new file mode 100644 index 0000000..ea2743d --- /dev/null +++ b/baselines/models/ernie/run_classifier_thucnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:48:22 + +TASK_NAME="thucnews" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
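+# Note: run_classifier.py evaluates every checkpoint saved in the output dir
+# and writes per-checkpoint dev scores to $GLUE_DATA_DIR/$TASK_NAME/dev_results_ernie.txt.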
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_tnews.sh b/baselines/models/ernie/run_classifier_tnews.sh new file mode 100644 index 0000000..6d8be83 --- /dev/null +++ b/baselines/models/ernie/run_classifier_tnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:48:55 + +TASK_NAME="tnews" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_classifier_with_tfhub.py b/baselines/models/ernie/run_classifier_with_tfhub.py new file mode 100644 index 0000000..9d2f80f --- /dev/null +++ b/baselines/models/ernie/run_classifier_with_tfhub.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner with TF-Hub.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import optimization +import run_classifier +import tokenization +import tensorflow as tf +import tensorflow_hub as hub + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "bert_hub_module_handle", None, + "Handle for the BERT TF-Hub module.") + + +def create_model(is_training, input_ids, input_mask, segment_ids, labels, + num_labels, bert_hub_module_handle): + """Creates a classification model.""" + tags = set() + if is_training: + tags.add("train") + bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) + bert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + bert_outputs = bert_module( + inputs=bert_inputs, + signature="tokens", + as_dict=True) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use + # bert_outputs["sequence_output"] instead. + output_layer = bert_outputs["pooled_output"] + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(num_labels, learning_rate, num_train_steps, + num_warmup_steps, use_tpu, bert_hub_module_handle): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, + bert_hub_module_handle) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, 
logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics) + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions={"probabilities": probabilities}) + else: + raise ValueError( + "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def create_tokenizer_from_hub_module(bert_hub_module_handle): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + bert_module = hub.Module(bert_hub_module_handle) + tokenization_info = bert_module(signature="tokenization_info", as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + return tokenization.FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": run_classifier.ColaProcessor, + "mnli": run_classifier.MnliProcessor, + "mrpc": run_classifier.MrpcProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + num_labels=len(label_list), + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + bert_hub_module_handle=FLAGS.bert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
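+  # With use_tpu=False, TPUEstimator runs the same model_fn as an ordinary
+  # Estimator on CPU/GPU, so a single script covers both setups.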
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_features = run_classifier.convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = run_classifier.input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_features = run_classifier.convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. + if FLAGS.use_tpu: + # Eval will be slightly WRONG on the TPU because it will truncate + # the last batch. + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = run_classifier.input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + if FLAGS.use_tpu: + # Discard batch remainder if running on TPU + n = len(predict_examples) + predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)] + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + run_classifier.file_based_convert_examples_to_features( + predict_examples, label_list, FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = run_classifier.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=FLAGS.use_tpu) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + tf.logging.info("***** Predict results *****") + for prediction in result: + probabilities = prediction["probabilities"] + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + 
flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("bert_hub_module_handle") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/ernie/run_classifier_xnli.sh b/baselines/models/ernie/run_classifier_xnli.sh new file mode 100644 index 0000000..5f7845c --- /dev/null +++ b/baselines/models/ernie/run_classifier_xnli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 15:49:29 + +TASK_NAME="xnli" +MODEL_NAME="baidu_ernie" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ERNIE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ERNIE_DIR ]; then + mkdir -p $ERNIE_DIR + echo "makedir $ERNIE_DIR" +fi +cd $ERNIE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/pretrain_models/baidu_ernie.zip + unzip baidu_ernie.zip + rm baidu_ernie.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ERNIE_DIR/vocab.txt \ + --bert_config_file=$ERNIE_DIR/bert_config.json \ + --init_checkpoint=$ERNIE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_ner.py b/baselines/models/ernie/run_ner.py new file mode 100644 index 0000000..e7bc453 --- /dev/null +++ b/baselines/models/ernie/run_ner.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import modeling +import optimization +import tokenization +import tensorflow as tf +from sklearn.metrics import f1_score, precision_score, recall_score +from tensorflow.python.ops import math_ops +import tf_metrics +import pickle +import codecs +import sys + +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "data_dir", None, + "The input datadir.", +) + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model." +) + +flags.DEFINE_string( + "task_name", None, "The name of the task to train." +) + +flags.DEFINE_string( + "token_name", "full", "The name of the task to train." +) + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written." +) + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model)." +) + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text." +) + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization." +) + +flags.DEFINE_bool( + "do_train", False, + "Whether to run training." +) +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % 
+      text = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[0])
+      examples.append(InputExample(guid=guid, text=text, label=label))
+    return examples
+
+  def _read_raw(self, input_file):
+    with codecs.open(input_file, 'r', encoding='utf-8') as f:
+      lines = []
+      words = []
+      labels = []
+      for line in f:
+        contends = line.strip()
+        tokens = contends.split()
+        if len(tokens) == 2:
+          words.append(tokens[0])
+          label = tokens[-1]
+          if label[0] == 'B':
+            # Collapse BIO to IO: the label set above only uses I- tags.
+            label = "I" + label[1:]
+          labels.append(label)
+        else:
+          if len(contends) == 0 and len(words) > 0:
+            label = []
+            word = []
+            for l, w in zip(labels, words):
+              if len(l) > 0 and len(w) > 0:
+                label.append(l)
+                word.append(w)
+            lines.append([' '.join(label), ' '.join(word)])
+            words = []
+            labels = []
+            continue
+          if contends.startswith("-DOCSTART-"):
+            continue
+      return lines
+
+
+class MsraNERProcessor(DataProcessor):
+  def __init__(self):
+    self.labels = set()
+
+  def get_train_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "train1.txt")), "train"
+    )
+
+  def get_dev_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev"
+    )
+
+  def get_test_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "testright1.txt")), "test")
+
+  def get_labels(self):
+    return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"]
+
+  def _create_example(self, lines, set_type):
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "%s-%s" % (set_type, i)
+      text = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[0])
+      examples.append(InputExample(guid=guid, text=text, label=label))
+    return examples
+
+  def _read_raw(self, input_file):
+    with codecs.open(input_file, 'r', encoding='utf-8') as f:
+      lines = []
+      chars = []
+      labels = []
+      len_count = []
+      for line in f:
+        contends = line.strip()
+        tokens = contends.split()
+        for token in tokens:
+          # Each token has the form word/tag, with tag in {nr, ns, nt, o}.
+          word, label = token.split('/')
+          if label == "nr":
+            chars = chars + list(word)
+            labels = labels + ['B-PERSON'] + ['I-PERSON'] * (len(word) - 1)
+          elif label == "ns":
+            chars = chars + list(word)
+            labels = labels + ['B-LOCATION'] + ['I-LOCATION'] * (len(word) - 1)
+          elif label == "nt":
+            chars = chars + list(word)
+            labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION'] * (len(word) - 1)
+          else:
+            assert label == "o"
+            chars = chars + list(word)
+            labels = labels + ["O"] * len(word)
+        lines.append([' '.join(labels), ' '.join(chars)])
+        len_count.append(len(chars))
+        chars = []
+        labels = []
+      return lines
+
+
+def write_tokens(tokens, mode):
+  if mode == "test":
+    path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
+    wf = open(path, 'a')
+    for token in tokens:
+      if token != "**NULL**":
+        wf.write(token + '\n')
+    wf.close()
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode):
+  label_map = {}
+  # Label ids start at 1; id 0 is reserved for padding.
+  for (i, label) in enumerate(label_list, 1):
+    label_map[label] = i
+
+  if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
+    with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
+      pickle.dump(label_map, w)
+  textlist = example.text.split(' ')
+  labellist = example.label.split(' ')
+  tokens = []
+  labels = []
+  label_mask = []
+  for i, word in enumerate(textlist):
+    token = tokenizer.tokenize(word)
+    tokens.extend(token)
+    label_1 = labellist[i]
+    for m in range(len(token)):
+      if m == 0:
+        labels.append(label_1)
+      else:
+        # Sub-word pieces after the first get the placeholder label "X".
+        labels.append("X")
+
+  if len(tokens) >= max_seq_length - 1:
+    tokens = tokens[0:(max_seq_length - 2)]
+    labels = labels[0:(max_seq_length - 2)]
+  ntokens = []
+  segment_ids = []
+  label_ids = []
+  ntokens.append("[CLS]")
+  segment_ids.append(0)
+  # Either "O" or a dedicated "[CLS]" label could be used here; we use [CLS].
+  label_ids.append(label_map["[CLS]"])
+  label_mask.append(0)  # [CLS] is neither predicted nor trained on
+  for i, token in enumerate(tokens):
+    ntokens.append(token)
+    segment_ids.append(0)
+    label_ids.append(label_map[labels[i]])
+    if labels[i] == 'X':
+      label_mask.append(0)
+    else:
+      label_mask.append(1)
+  ntokens.append("[SEP]")
+  segment_ids.append(0)
+  label_mask.append(0)
+  label_ids.append(label_map["[SEP]"])
+  input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+  input_mask = [1] * len(input_ids)
+  while len(input_ids) < max_seq_length:
+    input_ids.append(0)
+    input_mask.append(0)
+    segment_ids.append(0)
+    # Padding label ids are ignored by the loss (label_mask is 0 here).
+    label_ids.append(0)
+    ntokens.append("**NULL**")
+    label_mask.append(0)
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+  assert len(label_ids) == max_seq_length
+  assert len(label_mask) == max_seq_length
+
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("tokens: %s" % " ".join(
+        [tokenization.printable_text(x) for x in tokens]))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+    tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_ids=label_ids,
+      label_mask=label_mask
+  )
+  write_tokens(ntokens, mode)
+  return feature
+
+
+def file_based_convert_examples_to_features(
+    examples, label_list, max_seq_length, tokenizer, output_file, output_dir, mode=None
+):
+  writer = tf.python_io.TFRecordWriter(output_file)
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 5000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+    feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode)
+
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_int_feature(feature.input_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+    features["label_ids"] = create_int_feature(feature.label_ids)
+    features["label_mask"] = create_int_feature(feature.label_mask)
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
+  name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_mask": tf.FixedLenFeature([seq_length], tf.int64), + } + + def _decode_record(record, name_to_features): + example = tf.parse_single_example(record, name_to_features) + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + return example + + def input_fn(params): + batch_size = params["batch_size"] + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + d = d.apply(tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder + )) + return d + + return input_fn + + +def create_model(bert_config, is_training, input_ids, input_mask, label_mask, + segment_ids, labels, num_labels, use_one_hot_embeddings): + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings + ) + + output_layer = model.get_sequence_output() + + hidden_size = output_layer.shape[-1].value + + output_weight = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02) + ) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer() + ) + with tf.variable_scope("loss"): + if is_training: + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + output_layer = tf.reshape(output_layer, [-1, hidden_size]) + logits = tf.matmul(output_layer, output_weight, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) + # mask = tf.cast(input_mask,tf.float32) + # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) + # return (loss, logits, predict) + ########################################################################## + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + mask = tf.cast(label_mask, tf.float32) + mask_example_loss = per_example_loss * mask + loss = tf.reduce_sum(mask_example_loss) + probabilities = tf.nn.softmax(logits, axis=-1) + predict = tf.argmax(probabilities, axis=-1) + return (loss, mask_example_loss, logits, predict) + ########################################################################## + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + def model_fn(features, labels, mode, params): + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + label_mask = features["label_mask"] + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = 
+    scaffold_fn = None
+    initialized_variable_names = {}
+    if init_checkpoint:
+      (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
+          tvars, init_checkpoint)
+      if use_tpu:
+        def tpu_scaffold():
+          tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+          return tf.train.Scaffold()
+
+        scaffold_fn = tpu_scaffold
+      else:
+        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+    tf.logging.info("**** Trainable Variables ****")
+
+    for var in tvars:
+      init_string = ""
+      if var.name in initialized_variable_names:
+        init_string = ", *INIT_FROM_CKPT*"
+      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                      init_string)
+    output_spec = None
+    if mode == tf.estimator.ModeKeys.TRAIN:
+      train_op = optimization.create_optimizer(
+          total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
+      hook_dict = {}
+      hook_dict['loss'] = total_loss
+      hook_dict['global_steps'] = tf.train.get_or_create_global_step()
+      logging_hook = tf.train.LoggingTensorHook(
+          hook_dict, every_n_iter=200)
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          train_op=train_op,
+          scaffold_fn=scaffold_fn,
+          training_hooks=[logging_hook])
+    elif mode == tf.estimator.ModeKeys.EVAL:
+
+      def metric_fn(per_example_loss, label_ids, logits):
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        precision = tf_metrics.precision(label_ids, predictions, num_labels, average="macro")
+        recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro")
+        f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro")
+        return {
+            "eval_precision": precision,
+            "eval_recall": recall,
+            "eval_f": f,
+        }
+
+      eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          loss=total_loss,
+          eval_metrics=eval_metrics,
+          scaffold_fn=scaffold_fn)
+    else:
+      output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, predictions=predicts, scaffold_fn=scaffold_fn
+      )
+    return output_spec
+
+  return model_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+  processors = {
+      "ner": NerProcessor,
+      "weiboner": WeiboNERProcessor,
+      "msraner": MsraNERProcessor
+  }
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+  if not os.path.exists(FLAGS.output_dir):
+    os.mkdir(FLAGS.output_dir)
+  task_name = FLAGS.task_name.lower()
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+  processor = processors[task_name]()
+  label_list = processor.get_labels()
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    tf.logging.info("Number of training steps: %d", num_train_steps)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      num_labels=len(label_list) + 1,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      eval_batch_size=FLAGS.eval_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
+    file_based_convert_examples_to_features(
+        train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir)
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num examples = %d", len(train_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    train_input_fn = file_based_input_fn_builder(
+        input_file=train_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+  if FLAGS.do_eval:
+    eval_examples = processor.get_dev_examples(FLAGS.data_dir)
+    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
+    file_based_convert_examples_to_features(
+        eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d", len(eval_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+    eval_steps = None
+    if FLAGS.use_tpu:
+      eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size)
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+    result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
+    output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+      tf.logging.info("***** Eval results *****")
+      for key in sorted(result.keys()):
+        tf.logging.info("  %s = %s", key, str(result[key]))
+        writer.write("%s = %s\n" % (key, str(result[key])))
+
+  if FLAGS.do_predict:
+    pred_tags = []
+    true_tags = []
+
+    token_path = os.path.join(FLAGS.output_dir, "token_test.txt")
+    label_file = os.path.join(FLAGS.output_dir, "label2id.pkl")
+    label_masks = []
+    with open(label_file, "rb") as rf:
+      label2id = pickle.load(rf)
+      id2label = {value: key for key, value in label2id.items()}
+    if os.path.exists(token_path):
+      os.remove(token_path)
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt")
+    with open(ground_truth_file, 'w') as writer:
+      for ex_index, example in enumerate(predict_examples):
+        feature = convert_single_example(ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test")
+        line = []
+        for i, id in enumerate(feature.label_ids):
+          if feature.label_mask[i] == 1:
+            line.append(id2label[id])
+            true_tags.append(id2label[id])
+        output_line = " ".join(line) + "\n"
+        writer.write(output_line)
+        label_masks.append(feature.label_mask)
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    file_based_convert_examples_to_features(predict_examples, label_list,
+                                            FLAGS.max_seq_length, tokenizer,
+                                            predict_file, FLAGS.output_dir, mode="test")
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d", len(predict_examples))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+    if FLAGS.use_tpu:
+      # According to tpu_estimator.py, prediction on TPU is an experimental
+      # feature, so it is not supported here.
+      raise ValueError("Prediction on TPU is not supported.")
+    predict_drop_remainder = True if FLAGS.use_tpu else False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+    output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt")
+
+    with open(output_predict_file, 'w') as writer:
+      for i, prediction in enumerate(result):
+        line = []
+        for j, x in enumerate(prediction):
+          if label_masks[i][j] == 0:
+            continue
+          line.append(id2label[x])
+          pred_tags.append(id2label[x])
+        output_line = " ".join(line) + "\n"
+        writer.write(output_line)
+
+    # Merge the gold and predicted tag files into the tab-separated
+    # token/gold/pred columns that conlleval.py expects, then score with it.
+    with codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8') as tmp, \
+        codecs.open(ground_truth_file, 'r', 'utf8') as ft, \
+        codecs.open(output_predict_file, 'r', 'utf8') as fg:
+      for lt, lg in zip(ft, fg):
+        for tl, tg in zip(lt.strip().split(), lg.strip().split()):
+          print('\t'.join([" ", tl, tg]), file=tmp)
+    cmd = "python %s -d '\t' < %s > %s" % \
+        (os.path.join(os.getcwd(), "conlleval.py"),
+         os.path.join(FLAGS.output_dir, "tmp"),
+         os.path.join(FLAGS.data_dir, "test_results_ernie_base.txt"))
+    os.system(cmd)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/ernie/run_ner_msra.sh b/baselines/models/ernie/run_ner_msra.sh
new file mode 100644
index 0000000..1a44040
--- /dev/null
+++
b/baselines/models/ernie/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/tensorflow +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/ernie/run_pretraining.py b/baselines/models/ernie/run_pretraining.py new file mode 100644 index 0000000..b118f62 --- /dev/null +++ b/baselines/models/ernie/run_pretraining.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
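+    # (At this point `input_tensor` has shape
+    # [batch_size * max_predictions_per_seq, hidden_size]; gather_indexes()
+    # above pulled out only the masked positions.)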
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
            tf.FixedLenFeature([max_seq_length], tf.int64),
+        "masked_lm_positions":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_ids":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.int64),
+        "masked_lm_weights":
+            tf.FixedLenFeature([max_predictions_per_seq], tf.float32),
+        "next_sentence_labels":
+            tf.FixedLenFeature([1], tf.int64),
+    }
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files))
+      d = d.repeat()
+      d = d.shuffle(buffer_size=len(input_files))
+
+      # `cycle_length` is the number of parallel files that get read.
+      cycle_length = min(num_cpu_threads, len(input_files))
+
+      # `sloppy` mode means that the interleaving is not exact. This adds
+      # even more randomness to the training pipeline.
+      d = d.apply(
+          tf.contrib.data.parallel_interleave(
+              tf.data.TFRecordDataset,
+              sloppy=is_training,
+              cycle_length=cycle_length))
+      d = d.shuffle(buffer_size=100)
+    else:
+      d = tf.data.TFRecordDataset(input_files)
+      # Since we evaluate for a fixed number of steps we don't want to encounter
+      # out-of-range exceptions.
+      d = d.repeat()
+
+    # We must `drop_remainder` on training because the TPU requires fixed
+    # size dimensions. For eval, we assume we are evaluating on the CPU or GPU
+    # and we *don't* want to drop the remainder, otherwise we won't cover
+    # every sample.
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            num_parallel_batches=num_cpu_threads,
+            drop_remainder=True))
+    return d
+
+  return input_fn
+
+
+def _decode_record(record, name_to_features):
+  """Decodes a record to a TensorFlow example."""
+  example = tf.parse_single_example(record, name_to_features)
+
+  # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+  # So cast all int64 to int32.
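+  # (tf.to_int32(t) is equivalent to tf.cast(t, tf.int32).)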
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/ernie/run_squad.py b/baselines/models/ernie/run_squad.py new file mode 100644 index 0000000..edd4c3e --- /dev/null +++ b/baselines/models/ernie/run_squad.py @@ -0,0 +1,1283 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import math +import os +import random +import modeling +import optimization +import tokenization +import six +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. 
This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +flags.DEFINE_float( + "null_score_diff_threshold", 0.0, + "If null_score - best_non_null is greater than the threshold predict null.") + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " 
" or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+      while len(input_ids) < max_seq_length:
+        input_ids.append(0)
+        input_mask.append(0)
+        segment_ids.append(0)
+
+      assert len(input_ids) == max_seq_length
+      assert len(input_mask) == max_seq_length
+      assert len(segment_ids) == max_seq_length
+
+      start_position = None
+      end_position = None
+      if is_training and not example.is_impossible:
+        # For training, if our document chunk does not contain an annotation
+        # we throw it out, since there is nothing to predict.
+        doc_start = doc_span.start
+        doc_end = doc_span.start + doc_span.length - 1
+        out_of_span = False
+        if not (tok_start_position >= doc_start and
+                tok_end_position <= doc_end):
+          out_of_span = True
+        if out_of_span:
+          start_position = 0
+          end_position = 0
+        else:
+          doc_offset = len(query_tokens) + 2
+          start_position = tok_start_position - doc_start + doc_offset
+          end_position = tok_end_position - doc_start + doc_offset
+
+      if is_training and example.is_impossible:
+        start_position = 0
+        end_position = 0
+
+      if example_index < 20:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("unique_id: %s" % (unique_id))
+        tf.logging.info("example_index: %s" % (example_index))
+        tf.logging.info("doc_span_index: %s" % (doc_span_index))
+        tf.logging.info("tokens: %s" % " ".join(
+            [tokenization.printable_text(x) for x in tokens]))
+        tf.logging.info("token_to_orig_map: %s" % " ".join(
+            ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
+        tf.logging.info("token_is_max_context: %s" % " ".join([
+            "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
+        ]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info(
+            "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info(
+            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+        if is_training and example.is_impossible:
+          tf.logging.info("impossible example")
+        if is_training and not example.is_impossible:
+          answer_text = " ".join(tokens[start_position:(end_position + 1)])
+          tf.logging.info("start_position: %d" % (start_position))
+          tf.logging.info("end_position: %d" % (end_position))
+          tf.logging.info(
+              "answer: %s" % (tokenization.printable_text(answer_text)))
+
+      feature = InputFeatures(
+          unique_id=unique_id,
+          example_index=example_index,
+          doc_span_index=doc_span_index,
+          tokens=tokens,
+          token_to_orig_map=token_to_orig_map,
+          token_is_max_context=token_is_max_context,
+          input_ids=input_ids,
+          input_mask=input_mask,
+          segment_ids=segment_ids,
+          start_position=start_position,
+          end_position=end_position,
+          is_impossible=example.is_impossible)
+
+      # Run callback
+      output_fn(feature)
+
+      unique_id += 1
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+  """Returns tokenized answer spans that better match the annotated answer."""
+
+  # The SQuAD annotations are character based. We first project them to
+  # whitespace-tokenized words. But then after WordPiece tokenization, we can
+  # often find a "better match". For example:
+  #
+  #   Question: What year was John Smith born?
+  #   Context: The leader was John Smith (1895-1943).
+  #   Answer: 1895
+  #
+  # The original whitespace-tokenized answer will be "(1895-1943).". However
+  # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+  # the exact answer, 1895.
+  #
+  # However, this is not always possible. Consider the following:
+  #
+  #   Question: What country is the top exporter of electronics?
+  #   Context: The Japanese electronics industry is the largest in the world.
+ # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for 
TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def input_fn_builder(input_file, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports 
tf.int32.
+    # So cast all int64 to int32.
+    for name in list(example.keys()):
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.to_int32(t)
+      example[name] = t
+
+    return example
+
+  def input_fn(params):
+    """The actual input function."""
+    batch_size = params["batch_size"]
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    d = tf.data.TFRecordDataset(input_file)
+    if is_training:
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+RawResult = collections.namedtuple("RawResult",
+                                   ["unique_id", "start_logits", "end_logits"])
+
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file):
+  """Write final predictions to the json file and log-odds of null if needed."""
+  tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
+  tf.logging.info("Writing nbest to: %s" % (output_nbest_file))
+
+  example_index_to_features = collections.defaultdict(list)
+  for feature in all_features:
+    example_index_to_features[feature.example_index].append(feature)
+
+  unique_id_to_result = {}
+  for result in all_results:
+    unique_id_to_result[result.unique_id] = result
+
+  _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+      "PrelimPrediction",
+      ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+
+  all_predictions = collections.OrderedDict()
+  all_nbest_json = collections.OrderedDict()
+  scores_diff_json = collections.OrderedDict()
+
+  for (example_index, example) in enumerate(all_examples):
+    features = example_index_to_features[example_index]
+
+    prelim_predictions = []
+    # keep track of the minimum score of null start+end of position 0
+    score_null = 1000000  # large and positive
+    min_null_feature_index = 0  # the paragraph slice with min null score
+    null_start_logit = 0  # the start logit at the slice with min null score
+    null_end_logit = 0  # the end logit at the slice with min null score
+    for (feature_index, feature) in enumerate(features):
+      result = unique_id_to_result[feature.unique_id]
+      start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+      end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+      # if we could have irrelevant answers, get the min score of irrelevant
+      if FLAGS.version_2_with_negative:
+        feature_null_score = result.start_logits[0] + result.end_logits[0]
+        if feature_null_score < score_null:
+          score_null = feature_null_score
+          min_null_feature_index = feature_index
+          null_start_logit = result.start_logits[0]
+          null_end_logit = result.end_logits[0]
+      for start_index in start_indexes:
+        for end_index in end_indexes:
+          # We could hypothetically create invalid predictions, e.g., predict
+          # that the start of the span is in the question. We throw out all
+          # invalid predictions.
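+          # (Note: index 0 is the [CLS] token; it never survives these checks
+          # because [CLS] is not in token_to_orig_map, so the null answer is
+          # only ever added explicitly below for version_2_with_negative.)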
+          if start_index >= len(feature.tokens):
+            continue
+          if end_index >= len(feature.tokens):
+            continue
+          if start_index not in feature.token_to_orig_map:
+            continue
+          if end_index not in feature.token_to_orig_map:
+            continue
+          if not feature.token_is_max_context.get(start_index, False):
+            continue
+          if end_index < start_index:
+            continue
+          length = end_index - start_index + 1
+          if length > max_answer_length:
+            continue
+          prelim_predictions.append(
+              _PrelimPrediction(
+                  feature_index=feature_index,
+                  start_index=start_index,
+                  end_index=end_index,
+                  start_logit=result.start_logits[start_index],
+                  end_logit=result.end_logits[end_index]))
+
+    if FLAGS.version_2_with_negative:
+      prelim_predictions.append(
+          _PrelimPrediction(
+              feature_index=min_null_feature_index,
+              start_index=0,
+              end_index=0,
+              start_logit=null_start_logit,
+              end_logit=null_end_logit))
+    prelim_predictions = sorted(
+        prelim_predictions,
+        key=lambda x: (x.start_logit + x.end_logit),
+        reverse=True)
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+    seen_predictions = {}
+    nbest = []
+    for pred in prelim_predictions:
+      if len(nbest) >= n_best_size:
+        break
+      feature = features[pred.feature_index]
+      if pred.start_index > 0:  # this is a non-null prediction
+        tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+        orig_doc_start = feature.token_to_orig_map[pred.start_index]
+        orig_doc_end = feature.token_to_orig_map[pred.end_index]
+        orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+        tok_text = " ".join(tok_tokens)
+
+        # De-tokenize WordPieces that have been split off.
+        tok_text = tok_text.replace(" ##", "")
+        tok_text = tok_text.replace("##", "")
+
+        # Clean whitespace
+        tok_text = tok_text.strip()
+        tok_text = " ".join(tok_text.split())
+        orig_text = " ".join(orig_tokens)
+
+        final_text = get_final_text(tok_text, orig_text, do_lower_case)
+        if final_text in seen_predictions:
+          continue
+
+        seen_predictions[final_text] = True
+      else:
+        final_text = ""
+        seen_predictions[final_text] = True
+
+      nbest.append(
+          _NbestPrediction(
+              text=final_text,
+              start_logit=pred.start_logit,
+              end_logit=pred.end_logit))
+
+    # if we didn't include the empty option in the n-best, include it
+    if FLAGS.version_2_with_negative:
+      if "" not in seen_predictions:
+        nbest.append(
+            _NbestPrediction(
+                text="", start_logit=null_start_logit,
+                end_logit=null_end_logit))
+    # In very rare edge cases we could have no valid predictions. So we
+    # just create a nonce prediction in this case to avoid failure.
+    if not nbest:
+      nbest.append(
+          _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+    assert len(nbest) >= 1
+
+    total_scores = []
+    best_non_null_entry = None
+    for entry in nbest:
+      total_scores.append(entry.start_logit + entry.end_logit)
+      if not best_non_null_entry:
+        if entry.text:
+          best_non_null_entry = entry
+
+    probs = _compute_softmax(total_scores)
+
+    nbest_json = []
+    for (i, entry) in enumerate(nbest):
+      output = collections.OrderedDict()
+      output["text"] = entry.text
+      output["probability"] = probs[i]
+      output["start_logit"] = entry.start_logit
+      output["end_logit"] = entry.end_logit
+      nbest_json.append(output)
+
+    assert len(nbest_json) >= 1
+
+    if not FLAGS.version_2_with_negative:
+      all_predictions[example.qas_id] = nbest_json[0]["text"]
+    else:
+      # predict "" iff the null score - the score of best non-null > threshold
+      score_diff = score_null - best_non_null_entry.start_logit - (
+          best_non_null_entry.end_logit)
+      scores_diff_json[example.qas_id] = score_diff
+      if score_diff > FLAGS.null_score_diff_threshold:
+        all_predictions[example.qas_id] = ""
+      else:
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+    all_nbest_json[example.qas_id] = nbest_json
+
+  with tf.gfile.GFile(output_prediction_file, "w") as writer:
+    writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+  with tf.gfile.GFile(output_nbest_file, "w") as writer:
+    writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+  if FLAGS.version_2_with_negative:
+    with tf.gfile.GFile(output_null_log_odds_file, "w") as writer:
+      writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def get_final_text(pred_text, orig_text, do_lower_case):
+  """Project the tokenized prediction back to the original text."""
+
+  # When we created the data, we kept track of the alignment between original
+  # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+  # now `orig_text` contains the span of our original text corresponding to the
+  # span that we predicted.
+  #
+  # However, `orig_text` may contain extra characters that we don't want in
+  # our prediction.
+  #
+  # For example, let's say:
+  #   pred_text = steve smith
+  #   orig_text = Steve Smith's
+  #
+  # We don't want to return `orig_text` because it contains the extra "'s".
+  #
+  # We don't want to return `pred_text` because it's already been normalized
+  # (the SQuAD eval script also does punctuation stripping/lower casing but
+  # our tokenizer does additional normalization like stripping accent
+  # characters).
+  #
+  # What we really want to return is "Steve Smith".
+  #
+  # Therefore, we have to apply a semi-complicated alignment heuristic between
+  # `pred_text` and `orig_text` to get a character-to-character alignment. This
+  # can fail in certain cases in which case we just return `orig_text`.
+
+  def _strip_spaces(text):
+    ns_chars = []
+    ns_to_s_map = collections.OrderedDict()
+    for (i, c) in enumerate(text):
+      if c == " ":
+        continue
+      ns_to_s_map[len(ns_chars)] = i
+      ns_chars.append(c)
+    ns_text = "".join(ns_chars)
+    return (ns_text, ns_to_s_map)
+
+  # We first tokenize `orig_text`, strip whitespace from the result
+  # and `pred_text`, and check if they are the same length. If they are
+  # NOT the same length, the heuristic has failed. If they are the same
+  # length, we assume the characters are one-to-one aligned.
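+  # For example, _strip_spaces("a b c") returns ("abc", {0: 0, 1: 2, 2: 4}):
+  # each stripped-string index maps back to its index in the original text.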
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if FLAGS.verbose_logging: + tf.logging.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if FLAGS.verbose_logging: + tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeature to TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + self._writer = tf.python_io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write a InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 
0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def validate_flags_or_throw(bert_config): + """Validate the input FLAGS or throw an exception.""" + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + validate_flags_or_throw(bert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in in the `input_fn`. + rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory. 
+ train_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "train.tf_record"), + is_training=True) + convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = input_fn_builder( + input_file=train_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + eval_examples = read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + eval_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "eval.tf_record"), + is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature) + eval_writer.close() + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + all_results = [] + + predict_input_fn = input_fn_builder( + input_file=eval_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False) + + # If running eval on the TPU, you will need to specify the number of + # steps. 
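+    # Each predicted feature yields raw start/end logits; `write_predictions`
+    # later joins them back to `eval_features` via `unique_ids` to assemble
+    # the final n-best answer spans.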
+    all_results = []
+    for result in estimator.predict(
+        predict_input_fn, yield_single_examples=True):
+      if len(all_results) % 1000 == 0:
+        tf.logging.info("Processing example: %d" % (len(all_results)))
+      unique_id = int(result["unique_ids"])
+      start_logits = [float(x) for x in result["start_logits"].flat]
+      end_logits = [float(x) for x in result["end_logits"].flat]
+      all_results.append(
+          RawResult(
+              unique_id=unique_id,
+              start_logits=start_logits,
+              end_logits=end_logits))
+
+    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
+    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
+    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")
+
+    write_predictions(eval_examples, eval_features, all_results,
+                      FLAGS.n_best_size, FLAGS.max_answer_length,
+                      FLAGS.do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/ernie/sample_text.txt b/baselines/models/ernie/sample_text.txt
new file mode 100644
index 0000000..a428120
--- /dev/null
+++ b/baselines/models/ernie/sample_text.txt
@@ -0,0 +1,33 @@
+This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
+Text should be one-sentence-per-line, with empty lines between documents.
+This sample text is public domain and was randomly selected from Project Gutenberg.
+
+The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
+Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
+Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
+"Cass" Beard had risen early that morning, but not with a view to discovery.
+A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
+The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
+This was nearly opposite.
+Mr. Cassius crossed the highway, and stopped suddenly.
+Something glittered in the nearest red pool before him.
+Gold, surely!
+But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
+Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
+Like most of his fellow gold-seekers, Cass was superstitious.
+
+The fountain of classic wisdom, Hypatia herself.
+As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
+From my youth I felt in me a soul above the matter-entangled herd.
+She revealed to me the glorious fact, that I am a spark of Divinity itself.
+A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
+There is a philosophic pleasure in opening one's treasures to the modest young.
+Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
+Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
+but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
+Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
+His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
+while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
+At last they reached the quay at the opposite end of the street;
+and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
+He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
diff --git a/baselines/models/ernie/tf_metrics.py b/baselines/models/ernie/tf_metrics.py
new file mode 100644
index 0000000..7ccacd4
--- /dev/null
+++ b/baselines/models/ernie/tf_metrics.py
@@ -0,0 +1,215 @@
+"""
+Multiclass
+from:
+https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
+
+"""
+
+__author__ = "Guillaume Genthial"
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
+
+
+def precision(labels, predictions, num_classes, pos_indices=None,
+              weights=None, average='micro'):
+    """Multi-class precision metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from them.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    pr, _, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    op, _, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (pr, op)
+
+
+def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
+           average='micro'):
+    """Multi-class recall metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from them.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    _, re, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    _, op, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (re, op)
+
+
+def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
+       average='micro'):
+    return fbeta(labels, predictions, num_classes, pos_indices, weights,
+                 average)
+
+
+def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
+          average='micro', beta=1):
+    """Multi-class fbeta metric for Tensorflow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from them.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/ernie/tokenization.py b/baselines/models/ernie/tokenization.py new file mode 100644 index 0000000..0ee1359 --- /dev/null +++ b/baselines/models/ernie/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python 2 and Python 3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
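+  # Concretely, Python 3 returns a `str` (decoding bytes as UTF-8), while
+  # Python 2 returns a UTF-8 encoded byte string.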
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia.).
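+    # For example, "ab博推cd" is rewritten to "ab 博 推 cd", so each CJK
+    # character becomes its own token.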
+    text = self._tokenize_chinese_chars(text)
+
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ["".join(x) for x in output]
+
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and are handled
+    # like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenization."""
+
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens.
+    """
+
+    text = convert_to_unicode(text)
+
+    output_tokens = []
+    for token in whitespace_tokenize(text):
+      chars = list(token)
+      if len(chars) > self.max_input_chars_per_word:
+        output_tokens.append(self.unk_token)
+        continue
+
+      is_bad = False
+      start = 0
+      sub_tokens = []
+      while start < len(chars):
+        end = len(chars)
+        cur_substr = None
+        while start < end:
+          substr = "".join(chars[start:end])
+          if start > 0:
+            substr = "##" + substr
+          if substr in self.vocab:
+            cur_substr = substr
+            break
+          end -= 1
+        if cur_substr is None:
+          is_bad = True
+          break
+        sub_tokens.append(cur_substr)
+        start = end
+
+      if is_bad:
+        output_tokens.append(self.unk_token)
+      else:
+        output_tokens.extend(sub_tokens)
+    return output_tokens
+
+
+def _is_whitespace(char):
+  """Checks whether `chars` is a whitespace character."""
+  # \t, \n, and \r are technically control characters but we treat them
+  # as whitespace since they are generally considered as such.
+  if char == " " or char == "\t" or char == "\n" or char == "\r":
+    return True
+  cat = unicodedata.category(char)
+  if cat == "Zs":
+    return True
+  return False
+
+
+def _is_control(char):
+  """Checks whether `chars` is a control character."""
+  # These are technically control characters but we count them as whitespace
+  # characters.
+  if char == "\t" or char == "\n" or char == "\r":
+    return False
+  cat = unicodedata.category(char)
+  if cat in ("Cc", "Cf"):
+    return True
+  return False
+
+
+def _is_punctuation(char):
+  """Checks whether `chars` is a punctuation character."""
+  cp = ord(char)
+  # We treat all non-letter/number ASCII as punctuation.
+  # Characters such as "^", "$", and "`" are not in the Unicode
+  # Punctuation class but we treat them as punctuation anyways, for
+  # consistency.
+  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    return True
+  cat = unicodedata.category(char)
+  if cat.startswith("P"):
+    return True
+  return False
diff --git a/baselines/models/ernie/tokenization_test.py b/baselines/models/ernie/tokenization_test.py
new file mode 100644
index 0000000..0afaedd
--- /dev/null
+++ b/baselines/models/ernie/tokenization_test.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile +import tokenization +import six +import tensorflow as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + vocab_writer.write("".join( + [x + "\n" for x in vocab_tokens]).encode("utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + 
self.assertTrue(tokenization._is_punctuation(u"."))
+
+    self.assertFalse(tokenization._is_punctuation(u"A"))
+    self.assertFalse(tokenization._is_punctuation(u" "))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/ernie/tpu/run_classifier_inews.sh b/baselines/models/ernie/tpu/run_classifier_inews.sh
new file mode 100755
index 0000000..428a5a9
--- /dev/null
+++ b/baselines/models/ernie/tpu/run_classifier_inews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="inews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/ernie1.0-base/baidu_ernie
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/ernie1.0-base/baidu_ernie/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/ernie/tpu/run_classifier_lcqmc.sh b/baselines/models/ernie/tpu/run_classifier_lcqmc.sh
new file mode 100755
index 0000000..71fc7db
--- /dev/null
+++ b/baselines/models/ernie/tpu/run_classifier_lcqmc.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="lcqmc"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/ernie1.0-base/baidu_ernie
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/ernie1.0-base/baidu_ernie/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470
diff --git a/baselines/models/ernie/tpu/run_classifier_thucnews.sh b/baselines/models/ernie/tpu/run_classifier_thucnews.sh
new file mode 100755
index 0000000..428a5a9
--- /dev/null
+++ b/baselines/models/ernie/tpu/run_classifier_thucnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="thucnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/ernie1.0-base/baidu_ernie
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/ernie1.0-base/baidu_ernie/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+
--train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/ernie/tpu/run_classifier_tnews.sh b/baselines/models/ernie/tpu/run_classifier_tnews.sh new file mode 100755 index 0000000..139e7e6 --- /dev/null +++ b/baselines/models/ernie/tpu/run_classifier_tnews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/ernie1.0-base/baidu_ernie +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1 +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/ernie1.0-base/baidu_ernie/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.16.0.2:8470 diff --git a/baselines/models/ernie/tpu/run_classifier_xnli.sh b/baselines/models/ernie/tpu/run_classifier_xnli.sh new file mode 100755 index 0000000..2750eeb --- /dev/null +++ b/baselines/models/ernie/tpu/run_classifier_xnli.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/ernie1.0-base/baidu_ernie +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/ernie1.0-base/baidu_ernie/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/roberta/conlleval.py b/baselines/models/roberta/conlleval.py new file mode 100644 index 0000000..8a8a75d --- /dev/null +++ b/baselines/models/roberta/conlleval.py @@ -0,0 +1,300 @@ +# Python version of the evaluation script from CoNLL'00- +# Originates from: https://github.com/spyysalo/conlleval.py + + +# Intentional differences: +# - accept any space as delimiter by default +# - optional file argument (default STDIN) +# - option to set boundary (-b argument) +# - LaTeX output (-l argument) not supported +# - raw tags (-r argument) not supported + +# add function :evaluate(predicted_label, ori_label): which will not read from file + +import sys +import re +import codecs +from collections import defaultdict, namedtuple + +ANY_SPACE = '' + + +class FormatError(Exception): + pass + +Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') + + +class EvalCounts(object): + def __init__(self): + self.correct_chunk = 0 # number of correctly identified chunks + self.correct_tags = 0 # number of correct chunk tags 
+        self.found_correct = 0    # number of chunks in corpus
+        self.found_guessed = 0    # number of identified chunks
+        self.token_counter = 0    # token counter (ignores sentence breaks)
+
+        # counts by type
+        self.t_correct_chunk = defaultdict(int)
+        self.t_found_correct = defaultdict(int)
+        self.t_found_guessed = defaultdict(int)
+
+
+def parse_args(argv):
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='evaluate tagging results using CoNLL criteria',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    arg = parser.add_argument
+    arg('-b', '--boundary', metavar='STR', default='-X-',
+        help='sentence boundary')
+    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
+        help='character delimiting items in input')
+    arg('-o', '--otag', metavar='CHAR', default='O',
+        help='alternative outside tag')
+    arg('file', nargs='?', default=None)
+    return parser.parse_args(argv)
+
+
+def parse_tag(t):
+    m = re.match(r'^([^-]*)-(.*)$', t)
+    return m.groups() if m else (t, '')
+
+
+def evaluate(iterable, options=None):
+    if options is None:
+        options = parse_args([])    # use defaults
+
+    counts = EvalCounts()
+    num_features = None       # number of features per line
+    in_correct = False        # whether the chunk currently being processed is correct so far
+    last_correct = 'O'        # previous chunk tag in corpus
+    last_correct_type = ''    # type of previous chunk tag in corpus
+    last_guessed = 'O'        # previously identified chunk tag
+    last_guessed_type = ''    # type of previously identified chunk tag
+
+    for line in iterable:
+        line = line.rstrip('\r\n')
+
+        if options.delimiter == ANY_SPACE:
+            features = line.split()
+        else:
+            features = line.split(options.delimiter)
+
+        if num_features is None:
+            num_features = len(features)
+        elif num_features != len(features) and len(features) != 0:
+            raise FormatError('unexpected number of features: %d (%d)' %
+                              (len(features), num_features))
+
+        if len(features) == 0 or features[0] == options.boundary:
+            features = [options.boundary, 'O', 'O']
+        if len(features) < 3:
+            raise FormatError('unexpected number of features in line %s' % line)
+
+        guessed, guessed_type = parse_tag(features.pop())
+        correct, correct_type = parse_tag(features.pop())
+        first_item = features.pop(0)
+
+        if first_item == options.boundary:
+            guessed = 'O'
+
+        end_correct = end_of_chunk(last_correct, correct,
+                                   last_correct_type, correct_type)
+        end_guessed = end_of_chunk(last_guessed, guessed,
+                                   last_guessed_type, guessed_type)
+        start_correct = start_of_chunk(last_correct, correct,
+                                       last_correct_type, correct_type)
+        start_guessed = start_of_chunk(last_guessed, guessed,
+                                       last_guessed_type, guessed_type)
+
+        if in_correct:
+            if (end_correct and end_guessed and
+                last_guessed_type == last_correct_type):
+                in_correct = False
+                counts.correct_chunk += 1
+                counts.t_correct_chunk[last_correct_type] += 1
+            elif (end_correct != end_guessed or guessed_type != correct_type):
+                in_correct = False
+
+        if start_correct and start_guessed and guessed_type == correct_type:
+            in_correct = True
+
+        if start_correct:
+            counts.found_correct += 1
+            counts.t_found_correct[correct_type] += 1
+        if start_guessed:
+            counts.found_guessed += 1
+            counts.t_found_guessed[guessed_type] += 1
+        if first_item != options.boundary:
+            if correct == guessed and guessed_type == correct_type:
+                counts.correct_tags += 1
+            counts.token_counter += 1
+
+        last_guessed = guessed
+        last_correct = correct
+        last_guessed_type = guessed_type
+        last_correct_type = correct_type
+
+    if in_correct:
+        counts.correct_chunk += 1
+
counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % (100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' 
and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/roberta/create_pretrain_data.sh b/baselines/models/roberta/create_pretrain_data.sh new file mode 100644 index 0000000..9dcfcc1 --- /dev/null +++ b/baselines/models/roberta/create_pretrain_data.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash +echo $1,$2 + +for((i=$1;i<=$2;i++)); +do +python3 create_pretraining_data.py --do_whole_word_mask=True --input_file=./raw_text/news2016zh_$i.txt \ +--output_file=./tf_records_all/tf_news2016zh_$i.tfrecord --vocab_file=./resources/vocab.txt \ +--do_lower_case=True --max_seq_length=256 --max_predictions_per_seq=23 --masked_lm_prob=0.10 --random_seed=12345 --dupe_factor=5 +done diff --git a/baselines/models/roberta/create_pretraining_data.py b/baselines/models/roberta/create_pretraining_data.py new file mode 100644 index 0000000..42d147d --- /dev/null +++ b/baselines/models/roberta/create_pretraining_data.py @@ -0,0 +1,630 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import re +import tokenization +import tensorflow as tf +import jieba + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + # print("length of segment_ids:",len(segment_ids),"max_seq_length:", max_seq_length) + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while 
len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
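+  # For example, the expected raw text layout is:
+  #
+  #   First sentence of document one.
+  #   Second sentence of document one.
+  #
+  #   First sentence of document two.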
+ print("create_training_instances.started...") + for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline().replace("",""))# .replace("”","")) # 将、”替换掉。 + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + print("create_training_instances.ended...") + + return instances + + +def _is_chinese_char(cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + +def get_new_segment(segment): # 新增的方法 #### + """ + 输入一句话,返回一句经过处理的话: 为了支持中文全称mask,将被分开的词,将上特殊标记("#"),使得后续处理模块,能够知道哪些字是属于同一个词的。 + :param segment: 一句话 + :return: 一句处理过的话 + """ + seq_cws = jieba.lcut("".join(segment)) + seq_cws_dict = {x: 1 for x in seq_cws} + new_segment = [] + i = 0 + while i < len(segment): + if len(re.findall('[\u4E00-\u9FA5]', segment[i]))==0: # 不是中文的,原文加进去。 + new_segment.append(segment[i]) + i += 1 + continue + + has_add = False + for length in range(3,0,-1): + if i+length>len(segment): + continue + if ''.join(segment[i:i+length]) in seq_cws_dict: + new_segment.append(segment[i]) + for l in range(1, length): + new_segment.append('##' + segment[i+l]) + i += length + has_add = True + break + if not has_add: + new_segment.append(segment[i]) + i += 1 + return new_segment + +def get_raw_instance(document,max_sequence_length): # 新增的方法 + """ + 获取初步的训练实例,将整段按照max_sequence_length切分成多个部分,并以多个处理好的实例的形式返回。 + :param document: 一整段 + :param max_sequence_length: + :return: a list. 
each element is a sequence of text + """ + max_sequence_length_allowed=max_sequence_length-2 + document = [seq for seq in document if len(seq)max_sequence_length_allowed/2: # /2 + result_list.append(curr_seq) + + # # 计算总共可以得到多少份 + # num_instance=int(len(big_list)/max_sequence_length_allowed)+1 + # print("num_instance:",num_instance) + # # 切分成多份,添加到列表中 + # result_list=[] + # for j in range(num_instance): + # index=j*max_sequence_length_allowed + # end_index=index+max_sequence_length_allowed if j!=num_instance-1 else -1 + # result_list.append(big_list[index:end_index]) + return result_list + +def create_instances_from_document( # 新增的方法 + # 目标按照RoBERTa的思路,使用DOC-SENTENCES,并会去掉NSP任务: 从一个文档中连续的获得文本,直到达到最大长度。如果是从下一个文档中获得,那么加上一个分隔符 + # document即一整段话,包含多个句子。每个句子叫做segment. + # 给定一个document即一整段话,生成一些instance. + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + + #target_seq_length = max_num_tokens + #if rng.random() < short_seq_prob: + # target_seq_length = rng.randint(2, max_num_tokens) + + instances = [] + raw_text_list_list=get_raw_instance(document, max_seq_length) # document即一整段话,包含多个句子。每个句子叫做segment. + for j, raw_text_list in enumerate(raw_text_list_list): + #################################################################################################################### + raw_text_list = get_new_segment(raw_text_list) # 结合分词的中文的whole mask设置即在需要的地方加上“##” + # 1、设置token, segment_ids + is_random_next=True # this will not be used, so it's value doesn't matter + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in raw_text_list: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + ################################################################################################################ + # 2、调用原有的方法 + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + + return instances + + + +def create_instances_from_document_original( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. 
+  # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
+  # sequences to minimize the mismatch between pre-training and fine-tuning.
+  # The `target_seq_length` is just a rough target however, whereas
+  # `max_seq_length` is a hard limit.
+  target_seq_length = max_num_tokens
+  if rng.random() < short_seq_prob:
+    target_seq_length = rng.randint(2, max_num_tokens)
+
+  # We DON'T just concatenate all of the tokens from a document into a long
+  # sequence and choose an arbitrary split point because this would make the
+  # next sentence prediction task too easy. Instead, we split the input into
+  # segments "A" and "B" based on the actual "sentences" provided by the user
+  # input.
+  instances = []
+  current_chunk = []
+  current_length = 0
+  i = 0
+  # Debug output. A document is a full passage containing several sentences;
+  # each sentence is called a segment.
+  print("document_index:", document_index, "document:", type(document), " ;document:", document)
+  while i < len(document):
+    segment = document[i]  # fetch one segment (possibly a whole sentence)
+    print("i:", i, " ;segment:", segment)  # debug output
+    ####################################################################################################################
+    segment = get_new_segment(segment)  # Chinese whole word masking: add "##" where needed, based on word segmentation
+    ###################################################################################################################
+    current_chunk.append(segment)
+    current_length += len(segment)
+    print("#####condition:", i == len(document) - 1 or current_length >= target_seq_length)
+    if i == len(document) - 1 or current_length >= target_seq_length:
+      if current_chunk:
+        # `a_end` is how many segments from `current_chunk` go into the `A`
+        # (first) sentence.
+        a_end = 1
+        if len(current_chunk) >= 2:
+          a_end = rng.randint(1, len(current_chunk) - 1)
+
+        tokens_a = []
+        for j in range(a_end):
+          tokens_a.extend(current_chunk[j])
+
+        tokens_b = []
+        # Random next
+        is_random_next = False
+        if len(current_chunk) == 1 or rng.random() < 0.5:
+          is_random_next = True
+          target_b_length = target_seq_length - len(tokens_a)
+
+          # This should rarely go for more than one iteration for large
+          # corpora. However, just to be careful, we try to make sure that
+          # the random document is not the same as the document
+          # we're processing.
+          for _ in range(10):
+            random_document_index = rng.randint(0, len(all_documents) - 1)
+            if random_document_index != document_index:
+              break
+
+          random_document = all_documents[random_document_index]
+          random_start = rng.randint(0, len(random_document) - 1)
+          for j in range(random_start, len(random_document)):
+            tokens_b.extend(random_document[j])
+            if len(tokens_b) >= target_b_length:
+              break
+          # We didn't actually use these segments so we "put them back" so
+          # they don't go to waste.
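+          # ("Putting them back" simply rewinds the cursor `i`, so the unused
+          # tail of `current_chunk` is re-processed in the next chunk.)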
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  # Strip the "##" whole-word marker from Chinese characters so that the
+  # output tokens are real vocabulary items.
+  output_tokens = [t[2:] if len(re.findall('##[\u4E00-\u9FA5]', t)) > 0 else t for t in tokens]
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
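+    # (A whole-word candidate is the full index set, e.g. all of the pieces
+    # of a word tokenized as "un ##aff ##able"; it is masked or skipped as a
+    # unit.)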
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index][2:] if len(re.findall('##[\u4E00-\u9FA5]', tokens[index]))>0 else tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + # tf.logging.info('%s' % (tokens)) + # tf.logging.info('%s' % (output_tokens)) + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() \ No newline at end of file diff --git a/baselines/models/roberta/modeling.py b/baselines/models/roberta/modeling.py new file mode 100644 index 0000000..5e6f38a --- /dev/null +++ b/baselines/models/roberta/modeling.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. + hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. + """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. 
+ (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] # [batch_size, seq_length, hidden_size] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. + """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. 
+ """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
+ initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
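+    # Illustrative example: for a mask row [1, 1, 0] the adder row is
+    # [0, 0, -10000.0], so after the softmax the masked position receives
+    # (effectively) zero attention probability.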
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, an exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if name is None:
+    name = tensor.name
+
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def reshape_to_matrix(input_tensor):
+  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+  ndims = input_tensor.shape.ndims
+  if ndims < 2:
+    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                     (input_tensor.shape))
+  if ndims == 2:
+    return input_tensor
+
+  width = input_tensor.shape[-1]
+  output_tensor = tf.reshape(input_tensor, [-1, width])
+  return output_tensor
+
+
+def reshape_from_matrix(output_tensor, orig_shape_list):
+  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+  if len(orig_shape_list) == 2:
+    return output_tensor
+
+  output_shape = get_shape_list(output_tensor)
+
+  orig_dims = orig_shape_list[0:-1]
+  width = output_shape[-1]
+
+  return tf.reshape(output_tensor, orig_dims + [width])
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  if name is None:
+    name = tensor.name
+
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    scope_name = tf.get_variable_scope().name
+    raise ValueError(
+        "For the tensor `%s` in scope `%s`, the actual rank "
+        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
diff --git a/baselines/models/roberta/optimization.py b/baselines/models/roberta/optimization.py
new file mode 100644
index 0000000..0cf8892
--- /dev/null
+++ b/baselines/models/roberta/optimization.py
@@ -0,0 +1,203 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.98, # 0.98 ONLY USED FOR PRETRAIN. MUST CHANGE AT FINE-TUNING 0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + + # tvars=find_train_variables(tvars) # fix parameters from layer 0 to layer9. + + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. 
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+def find_train_variables(tvars):
+  """
+  Selects the variables to train: only layer 9 up to the last layer.
+  :param tvars: a list
+  :return: a new tvars, which is a list
+  """
+  # Example variable names: bert/encoder/layer_21, bert/encoder/layer_9,
+  # bert/encoder/layer_20/attention/output/dense/bias:0,
+  # bert/encoder/layer_20/attention/output/dense/kernel:
+  tvars_result_list = []
+
+  for var in tvars:
+    if 'cls/predictions' in var.name or 'bert/pooler/dense' in var.name:  # the last few layers
+      tvars_result_list.append(var)
+    else:  # parameters in the second half of the network
+      # re.findall returns strings, so check isdigit() rather than isinstance(..., int).
+      layer_number_list = re.findall("layer_(.+?)/", var.name)
+      if len(layer_number_list) > 0 and layer_number_list[0].isdigit():  # matched a layer number
+        layer_number = int(layer_number_list[0])
+        if layer_number >= 9:
+          tvars_result_list.append(var)
+
+  # print train variables
+  for i, var_ in enumerate(tvars_result_list):
+    print("####find_train_variables.i:", i, "variable name:", var_.name)
+
+  print("####find_train_variables:length of tvars_result_list:", len(tvars_result_list))
+  return tvars_result_list
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
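+      # (This decoupled weight decay is the scheme popularized as AdamW in
+      # Loshchilov & Hutter, "Decoupled Weight Decay Regularization".)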
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/roberta/optimization_finetuning.py b/baselines/models/roberta/optimization_finetuning.py new file mode 100644 index 0000000..bfd0ad3 --- /dev/null +++ b/baselines/models/roberta/optimization_finetuning.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, # 0.98 ONLY USED FOR PRETRAIN. 
USE 0.999 (AS HERE) FOR FINE-TUNING.
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
+      if self._do_use_weight_decay(param_name):
+        update += self.weight_decay_rate * param
+
+      update_with_lr = self.learning_rate * update
+
+      next_param = param - update_with_lr
+
+      assignments.extend(
+          [param.assign(next_param),
+           m.assign(next_m),
+           v.assign(next_v)])
+    return tf.group(*assignments, name=name)
+
+  def _do_use_weight_decay(self, param_name):
+    """Whether to use L2 weight decay for `param_name`."""
+    if not self.weight_decay_rate:
+      return False
+    if self.exclude_from_weight_decay:
+      for r in self.exclude_from_weight_decay:
+        if re.search(r, param_name) is not None:
+          return False
+    return True
+
+  def _get_variable_name(self, param_name):
+    """Get the variable name from the tensor name."""
+    m = re.match("^(.*):\\d+$", param_name)
+    if m is not None:
+      param_name = m.group(1)
+    return param_name
diff --git a/baselines/models/roberta/resources/RoBERTa_zh_Large_Learning_Curve.png b/baselines/models/roberta/resources/RoBERTa_zh_Large_Learning_Curve.png
new file mode 100644
index 0000000000000000000000000000000000000000..fd57c28280a867a93c8565109a1c6eee22876a31
GIT binary patch
literal 195127
[195127 bytes of binary PNG data omitted: learning-curve plot for RoBERTa-zh Large]
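A minimal usage sketch for the fine-tuning optimizer added above (assumes TensorFlow 1.x; `total_loss` stands in for whatever scalar loss the classifier graph builds, and the step counts and learning rate are illustrative values, not settings taken from this patch):

    # Hypothetical driver code; the import path mirrors the file added above.
    from baselines.models.roberta import optimization_finetuning as optimization

    train_op = optimization.create_optimizer(
        loss=total_loss,         # scalar loss tensor built elsewhere in the graph
        init_lr=2e-5,            # peak learning rate, reached when warmup ends
        num_train_steps=10000,   # linear decay drives the rate to 0.0 at this step
        num_warmup_steps=1000,   # e.g. at step 100, lr = (100/1000) * 2e-5 = 2e-6
        use_tpu=False)           # True wraps the optimizer in CrossShardOptimizer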
zy%v<$Z+e|uTDUK&-PLxIb5How$(Q;GWKD58m1|^Yv+Aye-%%fTy6!8O z?k6x|_Z=jq`v&~=8^<~%R(TeAN!C0qfVR)36Mena_PUMBxF?qRpJ#BiMv;$bb&y(J z{_t(<2`f-Bxu*d$*fb4?7#`4mO!6rDp-}TiS;K?`RmAegWl6aRRVP+|W*;_cB+!~? z9#r|Uo+CwnPWgQK%YQ3-v!X@Cv7l*s>2th0a8_?MB36Bx%ym<|;?MdGwv*ML--WPK zIIV7ML}?g!4+56~xRWbaASC2fx}lf)=T43>?OB*EsRnC%&j=%NTZt@S!Ph%(+L3Vw zN{C0vS>SzSY2lq3i(=LR!`Ec2ta2r&P5oYlf$@zrJteJ2boKpX*)ocCkV0$4_OF&O zjzW3ksXOb`E14SkEyrh!Bct-PO|Ku56Ql_Z(D~{9z--@tTs(SIKjF-Bj26Y-4RA(P=7HG4?mQ1m-tcI#tkC3`z>QvY=>-V4RuMUPw z5zrPoPfR{z$->8sx^6o9F{v$fAg<@_>yu(xMPC!^Qs(`7BfDuxlm`d*u}6T{VsH68m3p^nAZDA ziXg4Lys;GX{-SsL*10=}jg!BgND;aBbhTjr_&ELLp}~bbehjCq@XqI3!N&>wS>@iR z2SW7)4Gq492jg@8BN+aqxHs{Z?Rq!NPgpNKcc+~#d@1jkNBXJ#ly)<0c@L$q+|R`z zq#n<}G=@?8!eVWv==z9?;GC4VH}7h~gyI}!8yc1xV=jDM^8AZL9YkNpwq2eUn8*%= zN~hA(j;lOkP!Go3&PphE3s0zFgRr`oA$@sFOuh=RdLMtDUa7c+oZmo>2Z^X?V>(Ps z%*{tOnYOWaPg>*&5P9vp+uL0hTc6gn(+sOKG;8v4>Rfx$;CFGU&U1eM+C|^=*N9KMOpYFEsDYW><*4SRFWEKhBTfgc z2t4dF<`gXG$>q2loyr+zY=c?*+HFzM8#H%6&R$mFl(BGO*j&Vm0*z zron9GhG$aY+PcqloXF{j0VS(xp!brv#!q)U_g&#cMc;^*it=WG%XUKKK1aVi$6$+h z%wzm6Bo9ufFYDcW&gDlqyLi)HI{x}`Em%6mHRyTy&X?$1;jJ-B^Jq%K4^S|IQL|*ib7T$}+(|%ebyCxcGax70P_cS=1ZEt( z^_|Qvkz5nCFz*h_7Te!p*$HcWUbwLYR|m#_QrjD&nVD%arELr ztXx|hmlX|VSo_)W?m4gI0@!2Igne>yBB7xg^TZ;Y{8b}HQxY5~tbOkF3@U*UvKIY; zB0dRe1`AJihJUUK3Y>Qa6#Aa)q+&sNrjAUsUTuk;3I=4WWNvJiokV*`Nxhd#>fo6^ z{LqY#N6w-)wf_A^UW*{m2LIi^^+cw z^_+?LpU;r7&*L$qq+p6`8mmE_>c<{p_VYGZyCN%+-9Dr{;KG?Nr2y zKXdjB-I_+hZWwJ5AWX zS!Nu}HJA@AcE0y`>Ck20HOnEWOeV!jyQ)Tmtc!+0GfCuX7Eg(vM*~xR_WB@(%UU`x z>P<(FPRom6!7IEjk|0y%9|w=w+`{75<@lAUs|q2Fn%H~ELC!-l_`%yg0z`ZVk}39> zC*7_50yQNi5tnv{ugE$S#}2+aGA9i=TZ-()UMglN#zRDE_kC%|@rP%hkQ4mQB^5B$ zpRn9j9W^+nz#yyB4igxm|0S(rGBrR%BDV-MC^<3P^NHbIeja7?4E$u4i1d+4UEdDR zIz;)2=#=zc_U4pE++gZ!gB4cl@hcEG;eJWt_E&5yfrA9l+Bo?!c#Qs00nnfJL1@a= z`Awp(hc*xJ>|(7XNr+?c?S6=DrlO!Zoh#vgUM_uc_Vh4ItXN}!y?&8W<%~}Ph7fGl zev~P)rxSa;0y%+kEBZestNRLk&e5dR6u?l|^dt0%7libn? 
zQ!*AhJh4jC92#2HmYsP+=1M>btbTK8DJ7IMFLhIijy1P}M%BIfgWT$eto@eSU)0HZ z=CF*pn|L4KG7KK^`E8Hsl&kbGKSYdEPfZ#JuC%DJ*`FUgY!!gbXzF`=flZBJ@|}3C z#%k4V>eFkiTog0{t3P9#lU3DmS_5N^sYK|F*)IV04mvjyEY8O%<=O>qyLULGCuI}A4gEZYbKXu!Vm49~N{O-I z89O8Sy~K?;!0%-C9cLsP7S!^*-P?wLd_+yF51#Sg)f~|NNlHpwbABLzykT%=EmRoo z_`5zaAlZ1grcB&>OAYskcV&FRqa(hPK*>K&E0KNK67T*nl`Njlp`|Ca&dsFdbA*+n z8h3P78MmzbqSx|{s(9?{n};!d+$R;g0T?>?%5Vq}%1T}(th4FCYi-vd;4qU}{or5L zSI_?(HQpgeM4FJs(;OLPRMjk{;VkPCDOZft*>wOaCw2ZerDHg;_Z8Y|l`LDJnnF42 zvEL%s^8V18s>mVZS7C}~j*1K>->3UnyZt(c^8bs|r?1 ziB-`jweaI=H^u8;jDbJg)>%WmkcpZ{Zq^Mh7h9RmiInob>mQ@?FNjKaQMm(T7 zIabfO1DxE0@L1FJ(~gZj2Y`tU23&T}v-;g@(FI%OOGecAQ*C>z${SM&l0mbNR=v3e z)TKDl5$3IIHnzPooe8WOFk!y^H}e_%9@FZkd$cLaPoO+-tyre&F$pwLO}VjWrB;xF zh-S*FCv)R*Bu+cU;#&Gb;bGcI$CjJ=PYxQaIwk^I>+eYRLPs>N5Ysjz6U>^(_uy8(!mR7*NqC0gG;HBwvdp}zlS}B- zkG3p>PaL^|9OlZqTonE!PbjX(#+xcbOL z7~RhIS@>GV#ngAYcb>ve1o8R%{{aT2!DSY2E2((xAQFy zO=8ohcV6l-^99vYolIInOmS-$f@wfsX;3Z`mLiwaRv?-AnJ4f4HTgaYe5t%zofZ3> zhFJ)tsL4!3o+t@1XF*R)gs>~76SFkq4We=AQOL+aXra&6Doowad&lTwzxt>2wA%-$ zs-4HVflo^im!Zfm17zy?e%eRV28uUnF)B@1=^q=Efkwqk)F1o7xPzR@aEoRps;C2I zY3zfHQUFBHc}Zs%Os|90v3I+y=s*t-CVyH+qgg_ou3vSJH{jfm4!DLk)j(x=86%Gq z=dkVwVT?K4sXWYLTg$Efe`%SZ(MF(#gcI{0u)n2>TF9SVWDgn}2U1z$CJ`s}jNd<& zN>h|AX;YEUv*%XzPCo!+*600%XPUx+1bs7Wkz*+qzUtJ`$qEblAvHjYo~iEzh~=JU zzIDhc>6#S0)=Evv$z~=kg)Rk3Y&x{1E&{5W3EBjY?_j_`nG6j_=`c*Ol`I&UF9pG0 z+S$m>{@mG4X!EeuxOG=t-HdE*u}tzM#vIq6nlj4cZoN!v`VtK+v9uPK$8wmrGjtFz zu3um-rXym}fY2}Z0#=RD7azCV)8zO5{0QpxCUe`(i$8vNU3f(25HE5&@~)F}J6kYG z*7MPO3kDiljUm|K=?+fqH}?Du1&YrWbR^>Xe)`rU6yfft@6ac~<nShiKhpw2W08FhunKk)Q!rL3ogNm$E;I?&ap*_L@BrX{SM6fW!E}hPS3)}QphZBV)S+JDgp+5ip@`+g9{o}k6 z>!F*d0NS5_Bt#UTX~3tC2+*lPK@@$aes6(&AR+QiCUlUpAO9^*TAK+Y>0_B?J-fxB zT_CUoYBuQZ=k?eHmp)X5BkZEJ*n0oh-sA}l_j7%{(h(&Dg0Wozt8*o#diR$U`>ZD?w$A?^`8}@ zRe(gi5I`Ru%-eY%Og>$0rzkReLxyO(Y4yP`O=GxKxi~MY-kVfb%L!7E$rx%zA2fqc zI7dx&x6D{XO?^h{3#Ufbq=ln>{~9Y1w?L#(?$h0E3H6i#sLdM}P6MW%rbsxYaV&Dt zJLP_UMuEJ(mYrNfFgAwrN}^i)Q85+Q z_GDLknNoVUIc?NmymwLfKz*~~_U3+8suxAn2 zEl%YzSbtNR@c2@^6TuJHJdv|`|Hm9c1yFtM#?(GXjMDsLD2_rM1-;Pj(zTq0dT`DR zFkl~Sy=c&N(<|;siFbSG3wm#KW}b?e*rPSVofHW;*Ti??XAl?6?LD58{T5=)61{Q<@x`w#T8ZuJsII#-4Drid&O`HSUPyrp^wq16 zW1=t3KN;HS7uugRfqLchcGCl2*Loqu^FcOWk&E^@PWWa^0y5Otz*Fs`lD<4OOFoW$ z7!$n9s(^$pOs7?WTrnb?x{%~$@q4YcoP_x$bP8>xQ47!#&Y9Suyn^`{6v z9i*B0{xZW5+U#0=<~VkhFj)jTu9q~2$+-=jQ#Ipw1^=9vW&Pr&r|lV;F;`7iHnL5O zVOv#c#YJ?wDi(aSzN2W79fNy61nKFM5iNM$(2Lk4I{Y4|aL z%+OXvhKBGD0wV?o(bJgXQ(D4*39o06c=3kg1@jY3Wcz?%Ng9eBnY2AX=8D5IO8_olN1z z<+b1#d=$%McgeWZ7#@o1IPeew$w0d%-|{#Ho3J)2?e#EjyLc~?GShQqs=M0nIM9uj z0_}d8;guNW7>}t4GS(m+TL;>4zERme4AaRfZkzo@R_tXDor>pW+%Nq*q1rEu^9>a7 z88boFrGmoHB&*mZ+bnG}pOKq@2_F4F7SKUJoYg%~09HKn7Z&c&U~8T4u>!KR?Zk(2 zx;<%1Q)c94^<-SlB(aLV3+`^1_r9szibQ?{jxOsX(?a;~)f83pnO9DBE;h%A50P{V z(2Gv_+r61q(+ZISFXK;1uzKG&*DzSFGzN|Z|I8h6IASeixIhip85(PQ{C(&0-5vqC z9;z>W8wUtwguk z??Q~}}Uh^{LopHq%T==!0zF*T-E4oQg`)$wK%qK+k{1_!Tr&x3}h2N!`VJyG8Sh!_EM=7VZP7d5>C2 zr1kUF#k*Me&5!SKAC9>Ve^Th6>F0g>WsM#B;y#VoX3ZM#yM>5|mQm zqbu@h=@-6di8g1R!Lh3Y#TG6%fiHo%{0IC=m;ky&u~F`$g^x%fkVn0WK3Wdq(8IJ* zjqge@bNY{r#0kyo#2>|wTum4<$CwiWF}sy$={HUjlv6LN4CaQ~`EGTqtp$KF_O$PX z``2zNEza0JndIs$FLMlFY1aW4bwKydwcc~#K8)2%J=Cu;bpvzqkIwd+=|Rv}fS=BE9wzva8y=5x@nsTdNOcRhM+;`?&TQHJ4Z5 zIp>8g*+#^aEH+eM3M*(KBv~m0XVg>$SfL$#QpZxdYI}03%p8MjdTED5OY}~wdAo33 zEzb%t1mZ_b8XB3X5RWealYx`;Rr=}W)lRf$I7awzj9#m54}zD2Z`r;+JN;0c0GpA> zBTg_-`oa0!?CGQx!-}7`CHpRM^oox@#u4T&vJV$K-5~|PjY2rnE7j2EArRJ*iS?(o56aH%iH8Q&}|+4_)XdNsqA4#v_3a4=RNUjN=(%2TNQfovh0i<=ksUr^^O zCMXb6Db|R-i2VEgL|AaWf?N;Zn2oX-v^@=z5!$nG{K-a#=|tlAQ)2=kJ5CC3lwiO;Hz{g+PsXah)M%$y60spzrt 
zSLhABqpRt$Fog+0hZZ86lPFMXIv#mEY{?0w6|Nq-dlk#|1`>peyd+C8Z>{<|3P6EC zIE3QT*PBbzv{Ws+BAUQ}XEihNJL_$y3Gzj1W_^9SEWR9IU>7sgBpI}e9*L)M86 z8|9?Tk_7~WANMMZva`J*vz$bqgenwkND(z8sZ&jr_n|%*66Nkn(Cq>sr{n|85mf@^ zja>!jN@{uG7Ajgw{c`b#uw_=6>RJh77&u}FFb&Dl*%M=%gPDoS#f>EW2S_w9`NTD z-?ndm3dT5ZwP)Eu=CJWToxhxp%YED&$~(!kDeRBvJ1O9h*1A&1ec5a#YrkHdECc?= z%q(2A^*0GwieY1gu5iot{!XECweD9fAH-0qvS8O6k69d`1rHIEV9sbwmwvAQ*rMLFY1rhT!qS68-w);?iB>NXK0hPxFjbL)-(ac_T<~aX zFPVO$tboV`Z2Ed9J$)Pb*0NpE}j2DLdjbL(h zu(p*!goJa$Ex7u!TVRp0nGpOyG;aB2m*<^0lv*Yc^-_m4^b7d1Eb@oCSufX3XX|7QgSN@2${LR_<3U_0NNa5eTe zJX5PtFACE9(A2|14Aoj7h+lD-Oo|GnLvp5PF|Q<;R@5HnvlU%mZgI3{LUJxWS3a6- zDv;{?L~!4m>|Da4x4?jPOLi~9-6bgVDByMid2mg6jQDOfJ^s-dap<{%(*x(|-AXF+ zO;Bq^8vy};rLm*8#Qv5wSB`~jmS?CBa&%+JhEtBM?AYh9q}XuKVav1ia;4oe@WEm( zR~+eESKW1Ec}&^uQ+;>li-)i|v~xeNOOisAg6J-!T{cwxjO%Zw!Gi~*{0i8%zl1vW z+GF`qdh*gmT%O}YIibb{J6@!AtXZ;J7{@rZR}>s1hxxDULeGt(oC(*T9m)qAXm}_CgHGN@n1#4vkNpRms|GhF;1YmsL zoC0|mE@HSwwq~6?7#wQ7b?Uv(O3DUTKJE~JACmXVVQt@csy5~7R_iPO4~_dox_+jf z=H|ZgpO9C80IoL=hgH&oHr%-b3NrJEa<&08E4K3&JJK3Roo+TpqcJX^(WLJ3fjupB zP61qrxWye=`sSj2%X?VWOPzKF?tu-h+)`l+a0mMFUeSzKY`XYwJ6vGwe3XYycC1v7 z;>{Z=``LkPVpq5!Tn6Y6wO9#$DeqNE#9nZ>zo(K&eEl9gxE3b3EvPZ0T*q%1$rWo* z-uzU~3_n>S+k2Jkd4dupVZlR9!Fxmr*YgzGZ61tRu10gR*QSad#ZQ*g{v>7;Sqa|X znd(JskZKd%O=TkG&`xn*>`P9nY#NldT@xyr;|-Jk={L z_g|DxA@-TrAXZ}q9n#@6=l$`d@kxYRBD^lKU_f}sstE?3w&*J!l^->kaDc9e$J(1? z%+h@`i^Bdbqtc9O5;)u`G&rGQ1Z=>^Z^}ol3!zk%`L&w{&t{TcWuFIqV2R;>&|vjY zFw1$xykW`QziY{lw}*D}PrfpE>vux7#F0+KREKcLD)CNcyoebJ3V-Ri$BSlZC-f?V z-lpk^A-4Zo5{DYD$x+Me8DerGF*+y8J2XIGT9$$YA363HduV*LiTmle7XRZ^+_O2x zP)5f)!A5;12Lx_msSl9T%Ez#&$Y@pHxrsI0hsXvc-os>< zy!la*)@&)*sqclM_Bt;ZOA+E^tjllO7dm>c8s^$(Qda3F8~$zOlNJQJ7EFmb3|wHE zUOxjC4Z0@b24?N9hQJqDiscDCTk)Z}xW{vM)Kef|`pO6!*}Q2=|Ct|lmwd!RH%`*K zslRf2`1uUmSZEwd$|@GAb-`aJi0C_UWKbDg==V z{A=2%kqLN2bSEV=Q5t*ANSmxz26#x9h-xosON06mFeRlrlw+#&_sRhts<0D@!2e^H zP~cMGBj7wM%6}NGt`ls)_=1rHV_33ntbfH|3Xpgj9z6X1yCu3I`?L^@z{oW0#wL#k zSQwwv!}axdkFaKe?d}`|iCw`?EI-y5izQ6rxp{}O&c%#?bC#vgf<#>Yo-gLfTgD4` zuT^HlOsc8R=~*MYcr`Ij5l#fnW{n>EXOFt|nBiDXxlIG-dNA89bs*^!zlYgZE8}zgC7HTr9?D;W;)E ze)ly^6~iSn$b8r&GuUKGNS%B1sF7jA|DtsanHq+d`NQ%o74oHH6PE2R;&nuOj=)O6 z?+oa_R#i^8FgcI81IgJ09hiI_+#W2PY*Uz%f80VO7q>XNLmc1-!X8I>}1)0emAwRI1E4 zs!<{1{QiGd1{zE`d<2?zMfI=A%)*Xe%4Px4qnWfB`y@;k%x#>0P$j4gWufBX(494# za9AAwe7|f)s3H>a01vm>%2-swwy?BEp;6!x)`uler(*eTfWYa?qKen)=XTh{5l0Sp z=T7<|8#-L=HZh?mxnZJt=h=ue5$`HX1J3wCxT^-`uq47}L8mj%ubQvTBY88z^SAyw z_YqHXq6}EObg7a7K3ba#<;k;UN>(`Tlyb7T+ZFZUB1~-`t)Yw9fD(G ztWJdaWiJfBv%_PbG!=hGQV>3yxFMwM>?5I1u`Atc|h`Xe1qw9;H ztnn&wsa5S?p!$!PEQ1Xqv){wwt@HmN7^4Yn0GHbnCs~1tkT#s4S4g40vZ|12lDH8n zX(~fM}Gmm3VPm^3Mwt_9wk8 zV|dl3Z(jc(In3lL$*o{Ob@+}<$9%{EVQU*0BSESl7AB?H1%6r0(aLRTd4-f$WU7_W z@-Q{&y4oi_RP*b>q+!6Ey9dt(rh;b5Yo^6JHC>*FCAF?I$r)MLJyb83PU9lyS~P@C z`t{h<&l_z}5<6K!wv!vhPF(Nc3}!oJeSu|%G+GS{2uj) z-Y+BrHtlPCmNRyrKL?~kR39QYRZyXQo|2!yE&&t%#<-gFbVi)_)?c5IzyW>z4PnvJ zpPc>wt5LqVL?A%|QD92zZ@@f^5-4cxQ!NzMxBPmHb>*<&ns*sV`b@=-wo~j;W~>W` z6SR^J(3^h^-vQtb+}UTc3RyEXr_-{ad}SCvz&Pf)F3mhZ$z?KsEdD0wcv!t@@OV<^Vp&2(bO_JJ+PJAR_mF%5qhTNcHo&ZTr z9lH2Sum0uBTqbR*N>V9-!Dj#StxZDI)B5T9ZUkr48p4HR@Iz5u`E_7qP@ijT@zUnQ zKO!Ya^%DS~Wd+I+-NK58U^@w4Ge=Bp4#kwy7(3=-3Uoli6>`M0)fo1zePQ z#XWSoTujcPF3o1Iczaaq|0e;W1wzK-Xw)RBe3bbW?Re5VJ=#v5Yu@=V#r)FLLDAl>|ZnJu2C*|Ip4wnJ0(S%p>a z((ry&??)ex*In^aJp6nKqs*yW5oJ4?fEKHb9Sf<52x}e@U?gwthcO|&zAom6q4LK> zx|Ed4l7KAT@My;I-zz6*WDhyE63tqWyF1m#%j$TYPu^}7W-~Xh*C)M%yO(MdvX@4l zzUSGZe~yq))TQH=Y54j;v(}bYSn)}wI9(q~Vx-B3^VIJQd}j6aJx9Adc&=H;!xZi+ zO<8)mrJtu`<-cTMACh%Y2_QK*;Gd)$i_q6phQxv`v>QmhD5RKIfuAcPU}b0{@z|1R 
znp^XQOk%BH&CF{Qo&(uF|1P`r?RyVmTnSKt(Cc!v{-ghJfMoarAdid!qEW!ystDoI z7oghK;fTzD5dL&y#n_xODk3A1?ACeKCBbzzBFEFiN5ghX^Nwc=_CO-sJClTnT{6;h zv+s7$<}9ekBnOfU9KoFsJ89d9F}uV0x=m;&TVY#W4Wu*(g;;XT*NfkL*O9OuDs3~8 z*6SSzkE_*#ak7$%qzja8GaQ>JD7W7+$;PeneeAN-|7oJyKfl6$U-aeFy}}{wkudXt z!>wau(s5#sylE}YV@a`vsu+PEcEu45guRy{N({89>NKMC?4@mmGS2l{3dNue@Z^MAaXj5v38=pu_S9mNeJK z{w#KOH`aQ`mHGovp;YsOoC&KR!MHS|MQzx9=V&^BnNHtsEIy=ofTqxs&NVB8jek`* zXGd;hz}!%Lg%5vjh@7ZzbK+tyk8MCQDmC#|%QN57E7!YfWzPs8zPu-i)g?xaQ$vna z-h0+hngSMNCEL3u#XU_B1X=6E1`?4Vtf>Nxc+Kn%zxgW*qp5|%Lc(wa!_ ze|dLNmC~e0cFzV96pUh6(j@gg*H`+GxUusXV@-CaFce@v#YMpA_oE~Pefq0CT z?_EFE7I)oLagNBKV_{bj{Z!R8|8W6fymJ0!>~(HVP_{?xGBq<LMRNcXt!bbtLH{jNUV>xdsyTJ%pA}YD9<3Um7@`UgdNDr{NPYysoY1?VTMQkzk z?Btj3J7VZ`0#jC7lf!L1)?#+rdpAPg>9#C0sG2bP;T^a&USjPH5q`@SQ~SKkkL_NB zZ3@RinR;mIkzDN+@qbyZuGud}GTAWdmiRj)e;@}x{t(syF9)7(!h;JG!gLbmH7?|B ziPM}qsRzxMwJx+A{W$EhH0pFX?uL|p;Dj+gFvTD0p+dCN@+F(4B7db^`*N0b8zWZ= zvO5|eW`#=BaC|*8_?|t@Dk?BSNQ;J=OQiTjBg&|h4lT3w{ZVe{=O#2k>#{C3hapi0 z4vg)2j)@u+TFmOkmX?!24I}rpC#UJy20J&sbhVL76hOd4E&jxSTI(mS`2m?cb;D=h z$%kcokI&;@nH*1c+Kj>+du3I%`Ar)5IIpp2jNP*fyJKb&b#-)XNl7& zZR;ansm>V4dL$xcDfEN+kY+&rP4bMes3!Hwig7AMz?zAD)i_?|8LW05zNp98EgWhw z%E<#yk z1nL`T(tf~^M%yL(0U#siOeHI(d-cFQTX6&moh4w@6qp=RdWmq%prixM^|g>lGR^LL zH!|N~J?jk7!U+3aB3&;o&_)^ZjE2=?jOJzzDpVkc_zhG9Zns{&MUZ*rkGE0b#ZsO{ znFW;}Kim>M;(Us~*F58TE)=7Wi57}ZtJZePwSK7ItT%sJke0GhZdi=Wq6mEOdP+h z)V!_fa^Qf%u)A?9A0CwkKHr1SMY4Hc0XlgM7CZze4W?94nNgzF?Jwh0htxj?`=L#c z;+n-V35UiIkV>+tZh)VG!!kMJxQ_VUmrE>{YUX zwEwY9aqxCaqf*T{`RKha-CjDLmGBDQOw4K_`^&iEQpRjVd0p$(>k?>forbrd5QlEpdimq_Cop_p}xX^q0+>#KEhvR zD>*p(e4#pal_79=!@ODfg>fwz&>UPc%H!UHI3%bHHzO}c_FkB%_c#kk8$Q})94?P9 zX1`=eHUJ46k;PIks z;Cxo=>9ahnkn%2+*Qa6j-6P1`SvtSM1Ks+zoVHJ!x0=cZd=d`d)zLoZ$$QYdnV^ax z4*>mDnn+c>3V!s~o`heKJI!Czs;E|a4xv#<%|T`M@LKtM14=Nb+8@5>+C0THKp}z- zX$;?-<=RIC`6xUH-f(k<`z zArT=yk@2NwCw3gV6i4UMLLHAUS6-YkT}9Nk?nmS))1KcITJRld)TtT!-G_QKtdm6` z?SGh#34=o6jp_m6<=6OVPn6Cx{%PpcWjw#Z!R>^1iME(>u6^D1qp^3D`AOKrRT#bY ztyh4|n8ch6Erm~~r|rot#(}GVHbHDm{6oJW5g%$zZjR`F_WfRMPJd>YFF21T7CPAQ zth*&s|4o~ImBosPG*jd1LoUMCFu}*eOUY7p8I@_MYdGnJy_e z3Kdt_tMJcT1VvSGR*(3# zq63L_tH%WuX$$c1{me^dVXrhl%MIh-t|sxTvz1yWDaXcQ;f{f>F==)lpz!jrDV<5I z%g2kF4e3-keNthHk}|}!>;)$?GTSr5lAvBfQ zFPvRHi%gMN&_A=A#loH}KE5hG6e>VUx_-$CmA!&s|E@4Fy5o)CwsL0TQyrZ)rZFD( zwy7EakG69%|kJnQm`Psb}LspH{>z~ZP|%M zph>bq_{?Lp3w?0&b0c%%aXD9i?rPuc!~duBzijg>GDrygdKzoy{}*2UU>SAb?dhQ4 zAP#Ft_M_FYF88pG0Ha?FoF^e^()8O%Zc)V=s^jw6qq{8Cx$Ux98-c_AB-4x57`Az? 
z7-_^|kiw@)KUTF8ZOPdL+KSFYYgg$J_)e^^iebbB3`e^k#?^4LA?OK_hG4X98;+;q zqCE{898TBT?-d$1gYYEDqCX&?+e5oO1HOoipmrSfazA}uRu?B4OO}?FA2N>^?2DZ9 z?09wm@vzO#ug-MnJ!_?&1;6Z5G||b);>UUY6XX|d0X$YNlwkTHBgc2LBM4jX*0syQyQfq6s*shF>jT}nAm(4V*bYNH ze-h}fk}_o;CBEZtLP%`E^+P@~p&dI(=hf%QfyHh^hP1$ka-1v11=B&Gi7D zI`d)ZW{Iqd*gIOdA9*ppF7v|4J*L8*$#uqC1>L!=^sB-9FRU~A?+Z9!cc)21u=*+H z2}}x0H3wmF@(ctP2o3|Z_CJtb1zV6m>CO?Fs~svU}&gw;NIIL^KwvkUn+U3jWpT z=F@oddVqK4E~5hKg9dc~J43S`(O>!c4Be8a#`{H>G_=+cIM>#()L16)C%s=)&_w2% z_+JF2g?#eXQlqK)kBkk<0%|7I&5kh(t1kR<=iz1KzDIQg%0AKX(9-F%12%+qr$`(6_W?FSVUm)S~2OwL7V92 z_rn*^ZaSJUMlHjPaVWBp*-gGT%040GZ_s#YFp7P8zh7d2Oaqgc*%m20Eve50DP3?b z6&>c=qPm+^{S-Bb#_cD9Hpdh!|1>{!GD7y+&l`LHeCs*%)v9--ODOIgjpmWNmEEBr zl)1Pb>Y6Q*D5#aEpdu=#YT+T{qZ74(^D-rR zT4^=Gc07D=(Lr{G#Att;73aQ=Ecv{S&{o^4eNFd{j>^cBX3s1Lg2jd5&}F1K&?&O5 zgZIO8>X`&c;_P8*Dp|g^%3-;RdXrFR@FFNYqx5V)!3eaxTD1XUCf`h7?I z>#AR~-hNFC?OYQ+*Ti#Nh1*jeyq!C>cu@ovCOLu6)+${CF@B(v*q%~g#XAx zQfT9eUe~ZANT#I2vZre}(hJ4AAg{19=#gYqey17&a0donQr?bhF+bhElHT_bYL3?( zcJ`rt76UhD+I{^B0rgYF#XYev@0uVQTv|7xC&Gt?%P-B21^>LJA!efbcTdCb79n3E z@UZGRdY445zo}(~AzId9kO&0#U2_(mMa<>9A>{rlUiXI{wPM-8=%X)jc$Zc^O)T=E zc~!1!mqw*V_~L7!qRelR)i4s|!bHA_Vk9K8s}X~uwwPB#N)R!4>9U3R=@MyDi)kD! z%KgBkw6|-!2jXn#y)Ww8}wgH@+vO-ueqeVbVs zEeD>wwreUKZO%352(B*V*A7=`EArOwQ-hP(^85vuNhoOPX7Z3}xv%P&=HpK!-1i{?R?zvDC}tw3ow0$*Iz6?Igi8x(Q@Lr*bUo3m=fCYW3;rNqTa;gTtObN z+MS;-`@0SN&ssMAYItnyRJKt5qtuy_L5_2xLvxB*q$|f>)>@8-NE+Wp!Y@&+i6z^{ zb)i?$orS_}y9Yl*N@z&qH1_zpGgP@cv0+|WESO-aS2}5n`!DsCLi**I;-SkS|9 z52uI~STugra(EJ?*^Zr>dytrK*xrfMc%^@J9ifXI(J)6O#GvNMK`-_@kXibW{_GbJ z9e44{cFd(844}9$X`9w2yBB};}~0ax-Lmr#nvI3 z@I<-6iufovnCpGvDCu<-C*^k+lTy#g&KNBbZU3R*M|xCm09R6~ufk0wD2=o|x#a|< zk*V-tFtdeF^NQF`{GqoFlo-6VG^f6DYjU>O>nS5Yd(;uX;&)~hd#Tl(;emSh-Je|} zQ{fG*O8yD*6<^Rm8VH(>CNJyXo*HIw0=>}|K`k0psUM0Fwqbl`*iP~1Q5HJE+4bC9 z^6;b(t7$2AkY07`gA%>vY=`RG=1eG%v4T#hJgdrpmX^&P;AbjA7Fl~vgx`$!)Rrg_u08U0nNug?4h8J@n64cyuO1RlvBCJ?m< zduQ~PvXK43z0skv{GFZiQT*2qO-uWob$0XfG$7Xiw}>X&7cUB+*4kJYr^x>2kB9?Z z?y;9Ri6?kc()@L}y=(9Woq`Io0gY^b8%97Ug+R$FQ=M9QNk-D2tpQz*J6~Dn0}+h@7GC21+-nBjP#y&5Vx2`yF^}%vbQKDGQkXSPM-%fB^ z`<1(SQAnN9^sv%jW;SxZ+Vs@NjeZ~iui4#x!$_*tPWD6O~iX20bh z&&YxuMgreyr``STRr;T8GUb3=1Zw1xB9jb<@+y)IC8v2Z9IHkbcOXP^)RIoGl5?_? z3Fo}XB`9mzx>t+pL`F~h@+tgSJ-{Mp~ zftO2lcPcEI$pchh*2IOyr1Q*&EliFpw(^d$rOB(D_-!WcFyP}R-g&Dla_A)8f<zCTf74;(oxW*8=Sn>C`+(wk}d-1b=QJDqa@Zo zT7O0mDJ7l>oeg!3RywEzUwyQjgpvN44q>){ym|CZOLC#gK#g#j)$&U@g@DJk-(B{g zw*&m91@)a>5l$*RiPnFF+F|ZrtySFHZUTSdNlZXjw!Yn-4yi{CJ|GZ#xKVNbYTm^Q zJW`^b1kw@XH5#jUE!w;?vKtZ*J=+#m1pCe;!0NeZd;>h?ec84Kr)QXQ1*uy-+`-W8g8s#m@Wz80QGRVqFo?ej#k{@pY}9i`ILy3}YY(FJcL_f8`m^aGPeaEo$gwRpTDW7ufy1YEdVcmDf zkR{?LGeB()NRLBCM7bO;hiur|&RX3aK6LeJg&z%jF<%^2t)pdXl*P#CL1q8dR>CA} zS4jO{x+s(Zh-NWe&*XRjL{60n87TzLej@kzg;}S{bXz46+g&5n_8fMB*Q%g$qw*7|kgY*i4-Ee~}6r68phB4z#z8_)uentDt zgzA6e<8hNrH7z%6CB7pvKIuF;c#oe!bY`%7%O8C_(tdd&M>8ME?jdcitnyAyFCtiH)Z(;e$yb9pFUL`i`lITgnt}&T@1Kyx> z1w?=W*oB%XQ)bAU{-c!mLj9syP`+9yMgJ*&mK;hEF$lDM;EH$c6%&FxaG^Gf_Y(G5aK0@pKgd{lw_ig+7PG1TSMKu*8v#f2)ra&`>Y&&50QOpl6?>MF0N&FDdwWNVjVX0DeP zvno9#Tuv#)A@@BkSyt&wtNzF{z573}7xSii#XF1$D|j+^@8n{iHd_uT5k16Dtfzrz z@Jez@VUE_WYjSJrAgoIj^ndo+x9p_PDxR+3ZubxBK!W#O*euZX3dL*W+Qzn--Z&kU1)8 zh=R}i;`^}UbURE6yW#>TbjM1}BXn|y3XoY`558B}vE`#=Cm5=+f_sz2B{`IN=p-X) z)-3LF>m!~S&`o?v>20A}y@%lOm=TU4tM-Q|Pn22L*3B>*y|W+fd3JkJ4|W&9yna{O zKO#WtnF(apLE}~UjgELwAo&Y#N6M7@?`H1;z+NVrOL9aX;^g}=au~vqe^53N6>l@? 
z_NBfJ86Xm?Srqh>&EZ7bHw(M}QmMC{noP$eMkX}e4xJsC?$?RDbS1|v!B%FOB;Dqo zv|WnUU8A(j8y~~acA++JH@ZcGcV)SdoQ`{ z7e3uATt}WjKOA_E^W;3|8h0JMCeXg1p+zzlQHjGX&t}XM-J^4&_e9 zEzTSDlQeUHdd&%Rh`6DG2NAQE8+w75vZspJgi~r2KJ}>zYpewpogH$szrX;zcKdf@ z;j5nwm3KbC<}NEB^$QN8)SQEQtt_kQIp0w_*~v^3{ZmQ!%Q-Xd8OAB7?5cz5mVNbT^p8(%6XKHd^2-Ef9c72!{V5&<-%{_&ox|t@@yE_IyC+^N?LDE zj>>00e7%uCyEZD|y-QMvzJw;?_t{_LFAL8+L;II`9`?q+r4R1r z@$x;Y=EB-(%V03L9C8VFXq{h=#CB7fFr3b_5$$3fGmuPt*T4hz1Dhd zm8o&!d!A5`^m%#=_r2j4Q{E7NYQHT`dPM^|iF;FZ zP)-E}>fr#*Br(gdy|LhPC6aMZR+e&-pj&bMFYgq!J71*!RK!w;R@Y=nL^hw9j4m65 z#HLi}?P$?@+;d>C3~o6&2uVntn9fgl4GQ1zVfFTk*KjgKBI4a^mO!?jwOT|kmj3&G zuVF_{^{$?p?U9=L>+l9%4F++f$S@%$ViNeRzl0!x6jW}y4QoT+keNyOPrJF9k!TLC z=5r(>-li*GhNuSfjSw{S!wWZ&AJU&ew7a*ucb*fv`T(#V( zoBEV*&Y9N9^*!!tfjOK;7kYV@@v^g(+DRn9pNQV*!;|Q2h0H&W`?&3R1SzIAsB|}oR%rbK!Th9~xr>H7O6UnDm%kyB$6vxgQP7X&}CKQW~`J7x}cfu^M!B@{jR?!NQK2}hN)5IK2;G| zUG=*atH2H~r2AeY(wsv+|GWwz=@klAi(qfiC!`s?CqdT7gP5}V^|aasm!mUo=QUTn z$`8tsrbHZ|h>&0cp*6Z*qlzY9$jXK>|3t)qbN2)4#%R&qrZfDMnFEaXY18~NN9;^Q zkB*UzXLS&XHEq^feB#TA<>f-ppchw#n}Q0_Vb;etaR?im;b)(+|CMUl8|;kwv4-Fq~!1=-5>dxp9>R?ThVnk5wwF`J|||af9{8d!$KmDs4~U| zuU{uJwe*;$G{FlRi}B872|wNjuvd^sTd@lSKk7su`G2T4Y3O}pOJg0<0hW0t=gImJ z##uTwjIMPt=bqq>kx|lX1&@##8IwG#Q>H$jU{aYc(qRN1kxI-z@MZ&K9LJ_-iXTCd z5{j5znCmtu6Rdg7mkFZL^p3CVlk};by0Uxm{YI+2=TjtN@0$fa8-vd)k7^GatCEF= zJRteEYs9l@CB2-x?$K(LJJn~GyvLe(uJp-1-vy*HA80N4-#NNmc-eSBh4BWo=Z#u@ z*BMK%-mntAOjyjY#Kq8w6-vs;4G(F?Nv$8bPFZP(*2#wxj1+GUe^*s&f6&_FWyuqP zXlLTeMucC4Je7Q~6GL9{$r(Zj0Cr&n>#c+CWrMfIrbhHouQr8E-(qt~=SckJj8N4>oiMz73zJ%K zLjCI!f|EFW)>H$+p5=21#%I!T*?Fu)8e|Tj0%Idl(ZQl5Lmm9oN$yHg`8pe@a)Kjz zb+Zx{|8auU{6Nvn9lG6xP@Vdhw-XqXct$1$`FgNamFHM2OC-sBNK}wdzg|D9d2x@l zjU-K%Z2ZpmooW%KksAxTfh;G5zYNQak(Hg7a_lhd4|(g${hdPWqt`3q^IkH4k;*u! z(kq~=NsEzZ?>o=llef&+QGQP*KPSuK7!PL@nB!;kQkO<>_Y>pR;)wlcvGcb-S}Z*N zUe#49I{M$9}C(+LbNu>eDX#&g_@Xqo@t# zx0boQ26ydJk9hKl#DdcK?+oM6AgfU?K1{vZl3rRtDb}NVg-fbszYNss(~3{Vb(P3?YH15rs)63Oxk z{;=m43k}$LKGCvuR2FmODap!FmAP-LjQWz~M$RwUqv&lSX85HkwVf#Cp%O`wkDq=- zFpxO!v{Gibj%~6AoL8S(Sg~%5}F>)A! 
zpY|P>;~lfKVnC!CgCnlQvU4OZX)Z>O$dj_;lFqIt8_oEdw||jcdseFYCcDo5BPIgT zCSZSk8Xy*J5&ynidUi$pfZkQfrb1lfNt(&(6ZzBx0@*y{pRD;tq6QFk_?VHn5n%q= z52>x2n=6X79Qys(F*rjdyHg?r5B5M%!_ww$>Jq%ju+MqO<1iU;;#P%4M2S|2L~AQJ z2dbvhB?JX4@fvJ2_ZeMJRu7DR1|t?RrIh7^2D%0j*uQ+YT**YAzTt6QBBF4bV=B7t zCqp9w{lYq>xU0pKbo`5quF05++o_;Y;0j-=z{%@88jp@QA@z#V`+XtaA2$kI3+{Ap z*esr+jJ?Fqd4_@5fAq?a>H3wW^}EG$zruFN=@st%1&bYij zme7pl{b=vr7{?2?M(f|~j`NLvteAvjO8Qg%p;71iUNi+8D>{08nE%%43h9X!ePRBii86c#J4;*qp<#u*P9BCV)vmCC3AEuAhR&{PYA>A=ea+jO<;CTB58j^lf0jHKB z5xSP7c!0BF?i+Rq_E!3)8?tpWdfy_Yx!CrA_`fj2ufHxenlG~p|A>P!U?2}W6_?Rt zXkexb*p+Cdun}Uv2gPT^1ICW}tfmBhB_ARBUp6@y%>%yjYQ#QILiAp3hfMN)Mbw=m z$$5T$4@SRRy}J;!O#sp~4(6))S7R+UZ`9sWy;CAe?`iD%bla>abKTb1 z`G2Ztp!d%aQzSFAIQ_2P|KJPcE18gJqg6N3759n!T zT~bzv8tCKt;@s4OS-HY1H^iduI`MMHf8!n}hrQq&T`Sepa-}QroXf zSeDfBD#X0Hhq0_gnQ;;R%$k_L`BFm0tM*)4HMs<0-Ya;YN4LM%U?=}%MhmAB3b+r* z^vY*MzHzfm&?l$vYgp}L$9|XE$7;YQ;@t-IJ4}1cl(1oyD8a(#OBgEt^0nc58H|Gl z`T3jUkv%3}8{M4Ydz;&+)+y;RO!SBV-%SS!Jh?L#iys@U_jl!uc_L;%| znJP1504pk>5?J+*?xo2J{l^b~C1Tx3OFIj_z>S;EmI1}^s0Pc>8MH#NF%@bkKUKMQ zA-o^yq_*xJNzJ%CBE0vRZ^S95vHQF|e?pUPR(=LodnpDT5)`AJ%} z)i2l9)acldSOi-cIJKHk!|T9`Y90(mnhkqoLCRQShey7uvpU3UD>^G3SV32ljtrQJ zdq+Pqi#Qwdbh%BSI}_ywM3;?ycNldqC^_UCdwN1IBGy1io7kdU$RTDZEd5RkuECS|q;5 zao+4IZcQ)#HCchCMxpLQh+15=Vw1XDA6*0@jvpFaa4@`p#fOH`Bzt~6 zPWoy&9K$80O4&2qp-XsB5ZUtdG|hQ*W>WY42&`9LE(_s#mzko3RYW!Gl;+67{$<;4 zw&y;Z*$Ewc1)XRQ&fAdA?;Q#FoDq7l$7N=o&P(6i47c0q6x>HuzPDk=2?}GQo(}>| zHz_#^H4xjUx|CrF`XmerzP*J6{b=gb26TkWJVq}yh`ND@3~cb-y|~;86FZN6!r5f{ zo0-W$C(~KQl(J{cXNmL+%_4*{>0+{FsQ9Z zYh5qFO|^~O`0ylY9rl!G-W?KD(AK~GmA=r>>w($VG4>|baN5v@R&rWebI6cw0}QTE z7V98uY_b71zr(0<`P2bJ%cMLcahuqth=R3tdf~~kh=>({0VMA>{KSgqUW4|9SGn1q?DlglH)zMC zB_hMgG77udw}v6KMH04Y<8;ZJB!7^k;7JwVur7KAVO2SF2DBc9`eu_;BzY5eN(wBJ zMM98u`J$AKU3YAUl-WmxpsS6RAQ%eo@`j{apa zkrdLXNUNjCTr{4rEQ zRTtZ+jJr56voL4|d85k0tW;4iEV>EEtz|#h_hpTJcxjQSB2p7Sleb;nYcDktE;NT+ zg0}+(DwLd*W>&&uBpgYI3`%8D;W1-L=*i+*V|%@--@u`Z8mrqkPNz{qiKmw3I~ z$z!(6PO=u(zK5T*44iVtb$_-tP}GJgYBl*J^v4K@C7Du~{40_i-m@zKtfdBgqweAV z@mE@j`dQW3VxkycbyaklF8VsJ-CMRgAVZ5F&aL1k?Kr0$W6PQfM=ofjsg25E5NY@wWaPbdMBI@<_`AjN~kzZQ#hGQ}qYn66(z%`>`MG-B0UdMGgfL zz{bm&fNq_5pm$2;D7F_-_D2o`NwTMb?#a~xO6F`f%SR+u#@@;qZTzJ1pZbw4BYXBf zRws>yuBWJA;sRdt^9R9HME7>yLt5$2-LpY67KdA$J#a1fePc8+)^AM?ry-7Utxwv9 z4DIs)SqFBr)zSufI9dk36Pi?jf#V&gn6%aB+8K&^BPQA6^CcSwLGakn@2ZB+O{WG?BgT-E? zbO=C%$tFbZ8uygShZ9{kg7F#B&VI3RKyqJg@u=Wd9KbmT z(~^p}2^q;z7fRzXu!iY6V-;i+bOyKJ*~N+fvDu1H3|)PY>8$gQRDAN++V{ zNHQ|0?M;dEE*mO#m^8GyGbHa$GBTsgj;iOXk${m6_+mQ5k9$$fX_06GevQmW=hl6o zIoieTg8mzD-^ted#o7VEl%VM5TV{Tersc&r?78R-iZKrDf|(u5+(X^nwB?^K;!gs) zR*HwgSERr==8MCNd7^p3dPNMN~%)VLX&Y7BH%YR#kVViA(b_m zEm8&r9;rIF4`D~Q_f}FW=o^#hMtFK`G^$Q~^390p(w~SG;z-WCMe+SLZ%4nQy9#Zz zLuU4W6*SH$7tppCI8h&90PxH zqSM-9_M~GXuUp3R41-7dy-4ZV!~9g4AO2@rz1!oD57uSkgq}?L*<3mu)3?`tzJ#}H z{(2xmi)D0nG#Edet@Um_7>&PVB8^rIlAcz#qC#@}V@8`8B>F>)AC2A*s`jC28Z!$avf* z1&3Ugd}3=gFH^@n^15p6s_?Gw6WkTM`_S$X|B_x~$(S$%1r*G(l8}V+?b#=!DHJpW zokjC95Fd!}x>1O`?Ef`1C+B z`mm0vy=S5n?Mn5=UF}3ypviR%uvC8F>r|9*Tv-96XtbtIf*B?G7V z+~TWd#x6rBvoNv7n5BDu)38vy`yCIm=F&1b4;r6DQXf~{DV*kxuk+2HD!Wr2X(pc! 
z)+Gj)jFsi;p?18=s8n5&Mj<(hwk`v#33wBF(qjN0AjmbQz;9**e}YKxqjlEtPd&wB z41nD8>>h)xQ_8N;*QwFu;}6AH2PNh*K+n1LdOZ)9TdC^r_b^%nNGWfw zz-X%y##0<}npM3ACRD&VH_rRV(Ett(z1q|nyT?Ted9!JKLtrd?yl4>umW+QlT)d!q z5Ch)8x_5Kxn@#T(6VzAdj-7aj+V%Q$UiX=jAoYw6qauVn0TLd}-~}VGZ6C|w>U;)L zm|7e$DFpEdzIcO{ZuyYVjG11{A$j};6A8fly%Xls`D-Cu*B(*ah>_t@?;s+c-Tv89Ts*NuhPB(!^Ug;~OQM$UhDmfqs0)mt zSbf%ya}CMel~2`s-dDF-o`0S@GhqY7Ey8pNP}}1FOdAvl{1?8#;NXfCLw&e>m6}J* z8noCsRXdrb{F)0z=5A|b<4n(F>|ShG^(hmh#boV?>Nvw?m!K4x`mnng@;GYE)DF`W zF*FvAI5j_{19&eTVKYN97Wbby(Lj{zZn|MGR^Y|Vf~jV>g(Mfm_ecuKp)nIy(spQEt`!gS=c z$E@A2*(RTFDk8{}RKntplOS;t0kd8PkHVO>2I=f2Lw8VsW(gQ2MZKo`}FTdzgrv5b3; z$$u3|``)vYD&Rh|Mkx;sxxk0%8jM}d#&Spi2;LH28c2~kg-VgS+|1|%ItA7oBjCSz zbT@S0e*G&U*>4!7JAghv1Yw4YHOd~kZE^McUV9b15|RQNfSw@t;ea)!W4=2sr7)ud zMKZ%357!RjUdfjk!kK-H3@PIqFE(=6E1I`)qw5h_#BS_|G!?u7t1n6n@Th}Qat4*q z2LLtMot{_aX-gw|eINThLuuErACRx-xPi8Qu;Hg1A{|)1_Puh+=X^dQKoqUqrbvQ* zB+tir&#Stj#%M=z+F79W34~f&>wSncdzUbHptR$febN(Aw@#kx!CjjA_6iW#2P9OO zR@8R#{>Mr%-2rr;E_N)*>s0+)GiDxwp6;^K@|1na!`5ElaLPA~&L~}2!tv*Zo>r%Z zoNK)$TI{RowImUfX%agGKsMO9%uN5?_rqrOrW>|tAORHR=x8ka69@|8$ zcJMMSs_ww;QyVGBVVpkx#bx766`dksaH(SNC{wcUwfqiM-Q|}NBJaBMFkl{GQC&$9 z{M2$$OWOp(w*Gg`Si0qOc3q7(98kN0>VxY4Uh`3UYKEAG4Z|!FM8KJ?KXtmDVf8uC zwi){@CDSWr`Y7BQ}%*@W78W#Gshb! z2JQIsCZs#(kb=6GRGquA;|eWtbT5-jCS>OSET8Kx-zAh+9_(e?Xi|^F8PII~s=zXM zg>{A1^tHPqNe$EzH-MKOe|X2-HgA9wbnpR{b4~TfYJU?HG8B{22GOOS)+pYv%L8$J zFIaR+V6<=aKBs<+Tm6*?0iVEjsVdZ^?>L^!>s7yHb@CHSacSG!zj5}t(Cj3j6L+qw zLEBad|8#@tME+`uQ8zKm;Gw8dMS)+O!iz#i35)`XCGf(q_mYTj?z7+&5`8;w)!hxK z#Bs3#MRAgh8`Q}C)vXmvO?jDZ>4^5Hb1^0=Knb2Ms^7bZxm=YN5aDR)Ef}x?`^Q=Y zgU+lg{W0cB9Oc@@wQRh+eFS4dUjB+SkQkbb!|+DZj*&U^7$&|?c_%G5O?!!1XK8}E za1c1qtow8k#1c|J^CQY24hj8@`YPa@71!=3e!=6avSQK|2d@%~9a*Yg@Jcp{r-=w~VE~dCnlX5;zZ^;Q%0%f*y zHMwtyWdkKm-T2{T_c}lJ&jyj|~nE7d|d*;6~g@ zNxxoHhuRGyYfzJDPT;Rs_Y_qo_J~3BRJ7@M13jqbJg3kNSmI?SMoQGSQVwdfX&zcm>uT+@N-Q9?6?3K|X~4 zo&p^xd4kEsu8XQnAz7guE`pqm+(En_S=@S)efYdP5UP zos8N7FbQhft$8a0DQxAzCm1U;xDX_`{MR(78$X~OaVK>1I6KK0<11yU95YCwWXyv+ zlby#ISO$%6a9z4C5`d{WBy#U}wW7Xyp`SCu2Ul^6M_uBm#a*M#RW#;51naZm4)OEm zWxn0?!cx!!?$#Wt?agFHfTTfmRBUG>EjSHbE&-hq&^v)L$Z|gmMojR#h=~;KaW}@6 zhaFSvehMDG=`$UQnp-D>O6)5_(8_qv!rNTA4n z#wtXP@|h!F4LukzH6a9Qzz!o&DSr7?_hBx!C8?gg^Na#wnaq8Xll#TTzZP`rWM%gN@v2UvZ3go=`~zXk5}~9Z@DV-eUQk+B%>XU zWND5^1JlQ;4UO!CQqmlIrob7YfX_vz;|0SLSvNMKc&E>`r*F!IH;wzy9nuY^g7s+J zs4G-SZDM@V+^SnyRDMQuVpGa{L~VTjZvJM$$iSm3H39K+W6W1EFLLO*CD=P! ztAAEiunPbMEshV4SqY(|iQKQ?(b|vbQMNRDm}&&xir1icuC4d6xt9)_QlPG_xPpt)`kU6rJV8B|j!G<7+D_DoEOY49{~D39ljF=gMuuIf(A{A zo{W!~4B=|%l$w^6wT=dOJMv;%cvo|ol2uR`TJg+l`l|7C>g=L6!N0}xnAiSo0>O5T zn1AadR5kmN5qY@U!CRf4?ZMvoxc1fLc*b+@A<39Zcx^bHPJm0g4Qnh+ZeIS)4osCT zAu5kQU*EQoDu50v2SPYpxE&jy%47Az{Q}mQ=+MH5)LN`;NIJKzDXbPH1iQ~W;|?e()(v+92JU_kciBLPWB18UEo?RJptx&>`Xhw!(|Vo6 zz@~5dmwD0SYfj>gR^U?@jpEcs3@=}27Oa*kovDQMjGo2Zk3A-M_h_a zc=ipqH?=eIjE*FSX1e9A;h?UgRlBP7gl2v)qF?*D*`@M!l1PA5@W&FJ@|o6z3F`>U ze5YdoY~;xKex}PENAwSCyBmB1xx_Q(vf9bTWy1ms4d+6T)0W$1)+O+3M!=~v8+`S9 zGrZt)j1mSG3?uxZ)1iVX$! 
zc5j+N?_y6Iqg4#4cz-Nx4u}cnTzWShD|84c^yui*=UA~|iT;R=F|QnsCIeI7m}Jhb z|A3q)9#b?Y(*QV|d4UT#XIzz(5xVFZRzo+Q{piOYS!%i@(c%MCWxKh`##f$3T_uFi zZIY{kBkg?~#v?l;!a(%;EcKfB$!+I&(CP-r`bYmBW&mCv2blK!>Hq0jbD_~;K=*?+ ztt-f|o5M~KqtV$}#4Qx~TeN&`yY>#sas(JV!5rv@&6{0DMn1iBbRBP5ThaU8#A9g3 z?>e!qStGF78r%6AS@|;deO*ho7J_xD2C@QQ%dN;k7gFW&rQIXdfd+92QsreajX_UNseeW_ zBdu~I(TjN391#CbHq>koOchm2i44(DPfnnAMKOMNZ{I9QuXr7HBmXhv7S%Q_uoZV7 zBl*~bRrnHsgy&N9u9YV*5x9Zc+gb35#Fe@#Rpq*CsOA2!CdWXy7yERF&5n0y~=Otwtoe6LBxpdf&F4K z*YSPN|Bj{NF*LhsN?T}c7l~%6a9C0qg<;pE{X#rs>%9oo$gABCSy$(lk^BszWzuE= z;KEbA6>_X3UN){|m{A-T;r@P-N zO7&koZ|vO%S8Q$YPfmnpk1289UTu{j@mgV*k)QF>jdG5>L8Y_5ugSysgTvWsQ{@rh zsaT;})5P`D^Ph{(qiMy>a<(ZyATyf?*vbz5j(u*WV>`s>G_9HDO4@usWXdD${(q|KR}Yr45=sIe58*CN|XGMH9w5&4q%-C@d(L zzx46%-A0-*19zz{)kQz2e2m7yWG;#>D4rZA+4lZ)89^FD=U+HabXL+`*Fo>KwNHc| z$5X8EX9KqjN74f)_EI`GSDUr}At=aV`G;Ly&uz5-e_%8r6f#ASM))&0c&+>7w!D%w z6+kqF9!S<1>HN<&h<$9}Dx8QFVVz-L{w=fM1Gxo95@1C@R4++8wmb4gMFIZ7#C3v^ zGQE_Rwx3z*d$}RKSx#xX#iE@`Hv4@+EITG0j{F%R<=8C>@WTA&*)L{IT52K55>aut zs;ZVk@#$m;f*!)u7-Xbg@G$@-Ikk&4l5ba1VL-&*@StD2c{}+$dl)^EmnU%I3;GPZ%kp-POTBrSGJVMaa~~hy81W;Cz$ag63 z#pigZfwPp}hbcktESS2~^SU+*xBJ@h zqW8N7!E}KkeB3qv5-=XHw2uofCtEZgAB`WK+rKwpgbS-k4Kx#kX1S<8g{<^epC^Nd;zB zI&4KHY9l(Rk|qWw91A5zmks&%Or1vR;~T*ayC@v#p3PdQujP&oMY-4U~8JusZ@84T-tiBqE()-h=Y&G+p_p8Hu9lK?#P@G zw27a6s!p>Csd}vEJ9QOdM(dXPPL7tKBom-SpaSPKK^msj>UX0Hk~G69;vPzZaw0uI z(`xHeje3YM#oyB|{`(o{l50Z)VauyHj>k2UVoVlN`APk^oU_P(ziL94!2X!tP4g`7 zpV9tbM+P*Cg-}-QBBQ23)VnYr{QYvz_vOke+UvWv%K$YX{J0`k|F`%})sS0ga|(UN zR*cijLVBg+nuw0+y>Nq;E+rM?4YILfB!*e8>`QJEyad^un*R7t1YM#v>X0cYuGw15D0M@ys^oux=;X-i8< zT76F#EBj2QzL#R8BJ;6)-MIZ77Q48FK9xxMZbWImP>(LS5RIR7SFS}H@)V{*sp1`00lqJWIk@qwld=g zuLh7$5LY!xi`bkQt|#y++5b?VnBSC>)*y^{ePfgjK46W)0IE7zqrn>S9U#owV+uI-Qta7#uIYO1 zcZV7andDWx^e`$W>E&#EplC>CG(qa98G;|nDBI@`NglR@;)1cIh^iz-{q@{HEo;Gy zS>u{g>U-#Q{74lmFe%YW_!Xl$;avsBU;12FwAXIL|acKaiL%(98BS? z6jjnMbv5V1bCE#0y15T+#xXAh5XTp(Wy|^q>15l6Hn&L2lT&zF^I^ONCe=<5v?PA# zxouCn-iGn0`Do9qwI``vdx#a1SBvrF`73{1EP2O6UTfEYAarE43$Hu1q(t<0@`z5n z=SrmAax%-Zf1n9;PA_o6@Qx_XzUO-dy#;A3?w;)_$^0e%p)qwpz3gGNsNkKj#Y&K4$A?_E>nFpQOy7ZMB#JKil9qMPgcE zBKg1zFy;Dd^XyrXstXx*6!2}!JH&R*8QSy`-ldcos0o%|tS`LMYgg;b! 
z0to_0gADM8J0e;6;y-%u=Hp?d2pM$6NlTf|<=Lyse!b`I>e^#a*DtA+^FbUMR9x zzP7?7oLpv6yZP#(CC6o}BGd)^cY8R}=ZM;ps6VY|pR>}@f1+f3+`9F-miW%_(B^05N0HyfPvXrVY%l2 z?c??Lh6P!~gx-!hxW;xDMGX_xq8(^~!ztNAN@c*;Ho$N{3)I=49CA%1-E++dJZ911 z!B6zewu|eou#HO^a~s7eCZZ@uvKghUHs&&%wT!26Mb{=^FB_D_Be+F@xrR>ET_MFz zh%?%iu(uEiAMg}fOPy&O|0V9;iI#q7`!5zeDwD7_5w}aZuc0nAj3t$;``g`y1_-`pU3V4EeI9 zhkDcmBxV4)X2(})4Sb4^ml^VuZbV~LT3oRhzv_J4etEtf3o9~Ap^088e~8kdAZnhayDa4+F#Nyw;duGi)p<)oM+;)#3jy_2Y=`!R zx9e^hu5ni5W{OH|23qxXi>au_E216T4{~e5Of2ddUx%W;@&e)wj#V(=Xb0JELxhkY zc3f+^QqfeOlBUwA5Gn~~al*Q>oP%Ww^NeE>r*J#XdCwFqBVYN{{+L#Civ%yeq0rU> zB({($vlJ25?>ZHrE3svg@lBvvqrDH37PWc*2|LEZfbY6}%AU(T{x`(-cOV5X`o)(_ zhCtO=c9usNFknPAnCX-TH83SHrA0%;eJAuhmjH+gw;?;f!4@v@J&dfB)H8qorT+Z> zYslm#6Swucu;@0}7>N2&#Ma?(D?st0YI?`+qNntQqbV`E>E)eKHx%`ZYpE8$Ak)*u zfnU72Mg1ADH-{U0+IT}sfG6gu4`;B@T3V@qe_)6W^GCfj?&l7naRRo5wneqwFR6L8 z&S$V+m@C^NzR7tZSFSW?nF!)~+I09ORe2NnNF1vroOtdZ4tdIyNz9_lc~nBRO&-p8 z$oBGz!bfzuysGl;&Y6UuvV?7+EQ|Q%ZkI+U7i{`EH@cGWPTvC_c*e>d0y|-7DBsSF)Q$!c_NIAwb|4=FC7b7_W0@7dXLHoL`qIZc!f&i|2#DD z@;f0LAgCU#{I1V`2|s-_g0;_LSu7t&L!YA=Pi>0#BN`C&kygWG87vZ@?nuwTEo_Il z;xN9lM4`7Q3LOPqo~Y}abN&s*;SH>22LTgWiql&(B1ms~`Cd7VM*Ba9ZmRBn*hm>{ zveIa|)+Ew@vxH{*tZ1#Ba{gyq8ZkdGC?tc#wITf<)`TMX0}D8A*{E*6Cevx|c*6_T z!ST%7;T~qjPS^y&?U=89oG;%JEF4Y__~0MWLWeRF(WXYI1M6;mrmfajF`lDih}#F1 zl(AW!1)gqN(m+X%wcb@Nl}Zpp`jG}Ix`fW1j5FH8BE0*}!F(jEK0*uqQz1REw#UJFt;wUw)I!w*94skd%N4gn z^n5JVrs zW8v=A*~gk(7_NrXsBQTAQQ70#!R-5k;c3!^m-~D;%Kd!_qYOv>yEp3Kc9im)9qmm9 zI|xR0ZRww?#2t45jm{Z}IvYK2hdAO*r|IVtl5MpJjEpGKsV;bqigU$*)F^Sg;=mGp z3keFzMU#V5Pznmx;ws%$kG&v_Elb>)+R^}mXx3>hKhEx?>P}Hw^&BoMGK2t|4(MhZ z6vT7Uegks|+G_+YNHH8W(=_;1A0&||ri|$wL; z%T2MR>y++MJnuda<1TL3B$~&<=njX)%2P&0pbjR@W_0w&V+7aXSS%=xMQk}$^ld8q zRU1X)_M2=W=d1wGUr?o#XpPT>b_b~&nX{=$6+scVJsXV6`mvjf4z##X*P1Z2@l=gF z^g#KRZeghv20wk41KZw$>L-{xE=i2*xs$*J*Lt$&Wsg;G4wuOtc>>FD?Ke13ldbZ& z!>7idvAE2&D=%+o zlOH<;K49Og5r9^cZ^U|ICdto)HsKIV`e8hE7!^r>JbDF^Udo0s5-S>ayUVi;;_Y2h+ zF<9=H!E-<>WMn{MDwzZCA(hvd=>jK~)BI;G%K(lwXG4bJ%{i@c{nMZZ{b0p4Q;U0qA+y`o{ zUt$lI7|7+WJCFru3DfsjmZ}NDqw-g{MILGn#0+04yE7w_D0YhRBLcVSCmrM_SROiO zmBAz+#y7ej6%QtducL1;-lH}g^f1l31n}5elry#9!j2t2r9fgZ`qHgL6UId4JMk?( zWWsRpXWh-*%p(&US|v#Otlv?Qvi@p%JgsJ1JK6vesp`%X&0PGPkSg;8A<&~8q9Owu~7d>xU5)Gx9b*dP%cuxYZ zKfvy<7LOjZKnp=%QT6A)Af~RB3S?mz}gcI3f>Y~q^m>PJ9 zvoSJxCqwmk#`)U|E7yINnUo*1{%hJE4hk%W!gsA_#s8<|2$2vA0S20~f5;Vh7k$7a zjGNY^^SsBln*DHatA5*kj$N-uY;u+|dfoJSej&=~Gq$6>D2t_&Rg$ zNVe?cBr_peMZ~YLgka8HVC8k>@XzDb2DRYJnoO-oM@zXx_R1oSGvRkq^st@_j&O;c zE>8u=n&E>~(ei?iU579qwUvHLCD-t^H7w?P#H=P|gl_g#ORJI>^S&zF@F!WwktStL zOvmrPOZ;ZEs2da+gNe(WY=U(wl%F> z@0#b?-@Etw{f6tluj@Duop2nhPRiK6(tE5g1DlhTza2-}>Bm@&3AKIE_=E;dr(qnQ zLB+-CE($_g&;E}Z6&+6mJ9#l4O8<7FN1_ErS6BJBE4?99MD~MViYWfF zgcuH^<^ktG6445uyW#!#05aA`0Mvx&t?m9e!{G?0Bagm=k5v^=_3TKTeE}nOlDq~x z&WKHqSqh85jswUsQgLH8YgJm;E8C|KLb61hmHUH`Zm6}Q1ZZxZQW^iV>jN6aghBPrEN zT;)DA=aLgKeNif8JyyFLx-Y<98GI-#voI>gj;xh8%S&gI;7Ez6W(vF7#vvvm{FYLd z#kxlGGkuHF`lF`zFE)!Un*`De#h-N8)kR4|ew;0Skr?7Kny7qSgR(7Tyyyth#ZV_m zF@A`FLIP>$w0lpcuAd(En3TsUSO)ofc`F zsm~P9ht_>_uhqLBIN4zP-e`m&#<2Ih&?bd9vs{&scb|3XXO zFzO0*=UOHHe-1BNL{J@^^IBXNMlxk=6qIcAMX)~K&`#1Bqts2}oUBNznilwl6yWPz z_WS0ircMt=!~&(L9vKRvrADSVfujI6R}>u@t3rA#E(6snoe91y6`&?0Q@eYSi&DPd zB3JO6PnDe6FK(Xp+(zWZ)>aP?8!I6;JthqAMX(JdeC16Tu@0m;L5)iT_Ht{FL+W@R zt{b$#cjQH<4=o2KQKeU>n?o|rdKBzbX)7#2XG70*{?fk zL7b}Z)lIkjH1&>9*NL?2&MLOmv%AzQ`yvh2yX?rp0KLxFsD_SZ@Hs%gcR5xU;1+aW z5A%ao;k~KhvgYi zq|~ep3VS%Ucli{8nL&}ShrgPEkkhn{QN1(Ar-oM$R>gAD6`-LhZhv?&w#WlZ24=qE!0EYyYEMxc; z`D17SNyvhPDpP;MFB@4Xs#opFGovxrn(8 zEMbUc4Q_nDs#SE-z_Y`xU>}!?w)52;OTit7X=+yn|2QM;;Da#7GhXld??Fs3LX}fj 
ziqFWVm6&t;h`PRFK`i3|ug(vkZ9&q( zp|kDm1?{yUPG+^6kIr291U?vX73B{C9k?Eg+Vo=kY^>;7e!tY5@EEaP>sz8+TVfXp zf(`Vz$TLCtK(TND{_&z!i}F117j=oKfhY~HYY<%5+rWI~Dt zetd7aG;d~bCq{KPDehj?T}_CZtxEN`P$}K+In;+7MAHgTPsXFR(giEDH{P9~Sj&AD zTaUHL&vVkuw#iy&!txFeYvaJ_i(c2EAP)bhpauOshWmp_ z%{KGtYe1cPK@W5c*0sSbJ^o<uUVH50BCyedAvLzJiy5#>taUu44%H{tgPNFm5Zg zleL+N6=xhPeA_MOqu8ha0uKb@z_XrXYyqg%|CBub`#hC~iHJ96g_yi1u?hxQczd4Q z;)eF}913)l{+^$EI1T8M>-L>V2xBC$Txh~xG7H27h`YO^jA#h^O`v&Lb21Lb_+l-< zQ)>UH@_VTot)v`Rs2ZCfLCCGYspg!oVSXe>3@V*7K^w!J4#Sy|w?k`A+Qz&rfSh1LbjoyK)6 zt|*${eVt>JpK2v;lxRXygapB0g~wO{`UXln)>){{L2c8@@bO4GHwi z92sTDV2mwnRYV)RD2kprb}l$ZdS?rlyr7$CP1CZ*r4Wnq2)GZ{QQ!z~^2=RziHLLDrVw(?`qcesZ(; zIXHSwDG{ZW1ld8{d%eKR;))hA(4V2}^F#zdg*Nn9P-=X_6xTKPRC5;FxplKjHdhFT zDQwppy|v^lIu<^L*JBDo)^s3OX9F7*@B#%sGG9kmc+=5BU{gsX%dtDPMsv zE)jm!{ZaY@$>wyqF7wt^N~-Lrf4F?|o%d3WY)6Uzxoa|)dwT)+GTOrle9*J>Un`LQ zk0;`v5jnpc%~%=4#jw)Df(Gcmp)Wd*c+F6j7e=9@RzWq?rZEDG*+q4Fbb25tgS`}S zGv4Aw2@qGphEhbKR0>8B)Qqns7gwyvpfoftmfwc4nAuDLt_7uvaI! zxs8p71w73t-k9`{kU^u4x&^#5kTw%Q^=Qz&Y%!*LJZA>ohZ5{al(FXvp0_m`n24SP z_un0|NGzGikXD{izH7MnOSo-6KNoyuw7(464Vo7BAv zSQ|BXl8D5GS(Ko>P3-NYV@ZYYGG+Jh*7&**yV7j;RGpXOtU5c%jO+Sxac zhgSkhhpFoIvX!8)iCA+c-TqAv-h_SAr91flz1tj+cMDEFgt@S%v}`kcMIYhi5_#Pv z!gn~0&-O7Czvy`Uxsl?vbLt}GA%T+p(#&X8{kiLF#uYdx%<itp_94wP;g^fsB}SD$VqaH*5r#fQ?uJPEy%hJp1)vXS!+(FnMhW{#zWnpCqmmP3;^p_`?SZm9jEc$$j``h4VCdJRT(z zu%ed3cL6J;0p2#BBg|Q*=bP#-r%+`7N`AjNgZpmoKiE7}-cn%xYhvBg!ms#UYvWC>x3Qza31K~z;uqo?WFOb{4 zZyK?vWFn?ggre8)p{V7#0|^O&)=#>zXN==Q^jgK4xa0=-4==|PbfbrF7m=#&=P`Hr zHq3jsRrhp&rhNk-YdF2J*1lgyv|t+(w``{Y4h-FWreUFER7NDql|1h&jnd zokvH<_16T!0CBZFe4=67?`mTgl0urpkYKRV8olOhOG9&qdU5ro86E{;rrROn&rpgP zFW-6-$`_}TEO1gX=SLXvn~Uz+iPS%o&K?1H{J(icKDz#ooB!|dVfE>%a;UReEfO?9 zZQ~i5#Um%p#x-+6^UN6dU0hiMJ{m+>b_1~WQCamyPmylMDrpFCnfzhXRLLF>P>FqA z%0fwclJkJPXlxaxi0kh3Xecdt^~#?r!WT~@n{5$?vO(eZ)o8}68kB6tjew$oRLL{% z@RxV-3$1`ZBuoP6(huqO>e|ri<~dyHf{Q>5?W1 zMCTqoSqgXBZQ4C%qk(E(p#3kxn9N6k!ke_OJ+Iw*egb!b$Jqa#0U`JdoKf0{RsPF# zTOdHXk*e6Om7#sda}={I&8~7GmfALZpNSLxUPNpwZn<6^UE{6)yQj#w=qVz?sSY8s zt?3atWoYL+h)KHJlcUS5NFaEZY2{$e1@i9GN@RODN9Nb{x~blJ17}d1W9c?YX(wDdSG!;x7F%OBl$F`*OR=jA8L;d(TZ} zVyw!cb0tds_Xn)T{Us7!9Zm1EwBa?Jw4U9j{o1(8fNn*YaEii6HXKHs43k3w^lNto zEw&-QYsYVLaYO(;lITK9miY4N9Ge)|i{1`r2O{jVTSd3_oq29KFi~3E!pEn!%=Ty5 zZ}$F{1()EfPn*&|0gt}(BEqIS^pgxXd$7)J|ua*^e@NOzGdXq-=_GS(C)?K z%9CyzpZ|c3aH4MRy5k!&c4;C}H;q}iD6V|<gu^JwXWb{=v4!v>*jjC z;_9M$ov#nAb-a@hFE&rVQe^)5-(!xj97iW;gVKv{Ye$bg>LpPf{sdD6CU zPrwVYV=jb-2kfJx*mrk#(5`=CIZyKv!_e{32b+p!htLu&OVe8&{sxoS2hm7)nJ0b8 zxMA-fwe0?=S1kqDc`L-WNaKj7YVrWiM?#PivD2 z`z|@vKV+{u?l)H*bex6Aet*4N!A`Sj;OPPOIl2k+DXmPy=^Cp=WJaVT?&lMU*MB(g z%Yz8yjc_xj@X@1OVn%ddQ_HF4dAw8exZ^H98~Kvtp!V0d(|xq7cE9ada64nS#79Ap zo47&pW*OwY*_`5tcqMfnD zHUc#ySq>VpH?@^nPI8}k+;E-&EOlH~HldWA6RuIOUqQN$aknU8-sIhSm9mLo0eC5W z2vgm22sGV@XrvhYK)@~^d3|GMPsQJ=mik><>5jIn4< zaG%6oYM}dgHJ^Q)61xQ(_|GhDLUc(?2~SW#;qdd4jg@AifD(C=c)2wjhToB)6ls%*aoK+HwsYq` z&wowAsqyh1HOU9jB$VmeOXv~o0bL#34w)V9_6JO2!m5|Y&3gR%t#J-%mMw1Zr#$B5 z4@6Xr>c0_#JG8&0u)-IQP|Au|rx_Q(EsRSbbS|^G79OD)6ZP7cQ~cMK0o`?e|BR5xBeK$%h4 z3dL145LnzXj4~c0;eOyPpjoRP5_6j}h`&|-o(@ugMjn+uL{!YY z;oQv2nr4!R$mBDRAGY_^CM(Zf$}(2ZF#q_A|K%b_&M*poO)0Gv{m&v;5fn75YGzLQ zkB2G-D%d5an}^j#t^i#}a2ejI=wQ(35?fWnas!gG1njdykV)LUOxcffThDTYd2dp> zREJCF)t8i=--J}+JopK%ggp}OZYX2+sP-{AlkMm0H$Wo+x=p`fwk_b!X#6t5u@c%+ zrqn#ds_mJzZ!_sw1DdFEO$>f~$LLNfZ0p$j%dr=Hs5BiMbuA8;<|xFijrrgUzE0pZ zgo`aQ9-9VXMJZIG&q2_nheHFRO`F#3;QF~xd z;DGy`kAhJMz{(63VuJ=9>*iurdrpvDBgPlT4Qlst;9P~+MJK-jMzi@7nJQ+Kj5ayx zFr;D>Db<9Xp6T$&RS4=d#wO1N8s)UjrdthEfJ@ikjKgxmN=D87yATB@jsZnUcrKb= zEylk;E;{7sI|6etwvFCDwB~K 
z*jRs#7Z<;(*Mw?)70YS{+0>eJ<-Y16Uy|As0S=dIKQ{OiG$M7qmInfJ*vH}WKAs^Ry~#$AldA7N#*V`VdYK~j0*h{hA%~?>2cO_j+EOljSy+J8*&J>I)Y=k_ zL#z=zFvHaW{B;lr7K;&*;jwUhf`gDm0e$EEiuTe*#OfVD_h@bi2HV?M&XD%@^BT3q zH57Bsx2XbT%J20dKX04BNrS*MJ=%}=8__sTs)SI7D;~E?#Wxf3B|A;rGa;VNp$H#o+OJ`qIT*VvY#tKKkSX9#L+o9S7{~X1lzDt` zE61lqFH(@YC;lcD(&B4PHI($Z9!0R-U;x)-g0(2mFd*?kZ}S^!YO z@NeCAtgRAXmI%YK$ymyEO(3`pyJH=nV+y0Qb<4Fb6fTL)Xd(FU#7!#O(eSAz!^MNc zSEuD6XG!G)ejfgQqkjUPn>Ml=MaG_<&EV0~`1==TS)5YjaPPC3h=LpP8;nly35J@j zo7PfKk7WYtBsvmYp?Oo$WZItyvW!p7U;WWF$JQYix0{M;Be^0gDx9?WXQ_uCCWrzH zn?J{|f)J*AfYo#g{@4Z%a;)K`n*ffzeUb%nvMXw^?K+i2Nk|i z68t!%hHpKOKe-i7zjuk*uKV`__y8VE{;>boyaC2mAK^ZKQvpT*nO{T&k|NmV0LfkY zIl{Rs{Kh1tcbqZ8&vtC$g$D$v483+hmatjp~h(2 zw1|(OoInX0vkQpMILp=0!{#^hDOa$8+*iYntzxAxut%qb9wKGKm1V*t$O1h7irA6* z){22X9(FltXL$sS2;!_7A!WMD5H(xLI8UEGym2uTX zhK3$1WfCiJbp__`B7aVOX_*$QJW@Wv)7d^dG=sKV+Z`WR2 zn2LHyhd&tiZApsl3z1>xNPPW|gHoFP(aGyr<|)2915TsbuC1@G?sDp-{HF>YFW#Lj z%LB`Kd8)6y$!g3uTXCTlZ4< z=k?F6sYhnfB)P4TrqPqEs`)$5*0Jedb%gh?OhxD{nE&5FXF>w%B>uPKv6MZIh3lu6 z{&XnuJD$5N3rLuzgNFB?aX&#*d7`G#?#cWh5Djg%Ov0oTWCeU=r+Ny^m)?%r2B=D) z=S?aUYE5@~@$+^^TvAleb3Smy3H125S*x|Z)QK{usNOgHy}&@=es6rl8JP$b1_c;Sp4j&Zxub86cG!5|^ zJEUX_AnvhuTPDqwA#}b%%d$$U zk}$2VrT7ZIU_(+y78EiT8z=JFtuvNCxLt@=v?MJ?fO|LjC8>tgB5{`a6dfqJ9wwQH(x8s_(74U&PR++jvs8=Z^RR1n!-8b1aHrx%n(UpIrmmqr zKXL(affiXF(;#lbwvU2~LCM*P*yK9!DA2TqBmj{ob;1948b!Gg0TCQu748k~Y@l~# zbzEi*T)#~kCGL5}N(7T6Iirq?c9skI0&Xjvj&J_F;ho#@^JsJJlg&Q^)MbF2&F1T4 z^=ciBZ7cIXD=kU!z+VBCxMbS@t%L+W{;ZNv53p=WJnN*Y*+&S8DjB_XdixsW?$cUd z(tTQ7p62ksI9X!EOie+`d|`n@lCk&eYNXxWU6sRDjl%$dckt-Y1oiTvgC`JAaG{#5 z=@`8(yWbg(@>q;kj+KqQd8a7e?*c7w*G~I*&wD+q&QZ)9s&y)hk!x}NVk8Els%=F7 zargiRPc>p=po9)0%rc#G=cka2d+jNq#F&-dOf$IfaPC-ii+%LOU^$W$k|-PnD0nxMj3mBsMFi?V|*u&<>KOXtRHJWz_rM90-{8DsZrLGJh& zCMCPwkJSoMvef`oWwFH?T(-B|;pZ2ldfYXYuyhpcq*O4NtW7cQ+I{aDO@?(Xn}2+z z_F`!czT+)hZ7aYg!g(?1Ugi|jU^x}%#jXLpxwU(uT~%gSD^KDFWkeIaH|un}+((I_ zPoZpRy&X0b?bi>k_|(6K%PbwPA6ov8!538vM0%21D>Ng>IFzL^) zNQ)mz+dbtV4x#9(L@c>G51)Va7M4)8w2>{<8<7a*(L&bK)`FcPvwuC>owxhTmD{OI zvo5d0B?kse2um~n-1ek-a1&b!ib5s}fd87}e|CTWz_bn4ba6)K;4ooU-oyorv>T^z zpFQ|1s%WF!if}4f;@=ob#pBWV9o0v_Pox>r)r>jZR8}Glu3(&D_OscRm@AeG-%J7a zyM(2G>OtB)DmqJdJBE>D;&x^822NWn=% z3%H1CP=k_H3iAjIuT#^XQ0x_e;3!cVxaE=zBwJXxS>}fi z^@{6{=_^Qt(`wG7O(wFx_WSnX5Y2cU>PmBJ4 zLNJ_=Uji^D-4O3Q|CtE>6aNQibrmMqwnec}f-A#`DKD*FKT`w;{A?U9Uq8i?)iZMQ z9>7YG=8q>NL22`OMDO68Wocw%wc1XfG@cDQ5$3;M4IZeR|HjyH-d{zPCflM8gwt*2(w^H(g>4y9$C(n+7I<4rBS}igy^XTafQP02khA6S zLat#R#z zKDN(P#eCr#CRQY+_zhNyN-bla9*y4ppRunzQ?bahA}v9Ldbk$}2+PK@yk}T5Yk4+} z(J;9bpC}b51c~b7qe+C_k(PP!*9K(0FNYVW=b!?1TVhP%y`zh($`3dex`jPZk;Y%( z(AznhapZEl8}*47t61G-LXKNZT=eqvUmJ&!uN^Lf88p$s`xOVl?}?tZNDL6ytzg=BMhl)e=#u%|v&ppJza+E`2BLUV6zv zWd0Bpf0|xI!9G>%oR?oeAduUX1nHdgs8~yw*;D?25^J3rIo%oafzem3Z0Z};w zmT1paL8M;FDM}RRUV(V&O}iuus^AEA0BxCQG_dtE2J8kcw8bq&4yn|T_nR0pTZUhh z2g&ioQo;R?9M6&<``jP)04vUi3T{Gjqqp^^Wis7%m}uuS%S+*2b5UG9vN#L{bmW4= z%e(tdT4lQn1*_3d@&uxhME11{A*dWp-%i+x!-FvT^lz2H-D`L4od$O6lc4@KQf$&Aqg!+rGO8o$uZc&S%#-xjxnxc`#}R0+{dE> z7WuWR3VvN7%4HtE?d&FFKf2d5Tege(Xf0+np2DiuOdeKCU6YBJ$JytNp-7pN#+4GN z8yolx^9kJl&as!#_r0Ah+JC@$1a9rWhB$$9LwhXKXR&UzCS?g_Ev^Xs8&(Ra5cJpC zN;Xru^usole77rvbN)UUGCDNN*t*cuV(atTTYihE&WsI~u+>hxOSCo4eocd+2t3%H zcssYGniPysB^e8xMY86ph6SQ!eS>r1av4&M*HfL|WIe0iRu<*7 z00jd6<`K}-JG7tP|1^58Ic@llWK8s+HZH=lgIL+!^Je+1SLmGZjf0)ET_sn@s% z>1>r+PuC~m3{o#_nW?$KYuC{05S2KAr1KdIx?*-5{+pyHT4Ga2KKd;+vlsoDbq|`H zIE`0!#2f%N)Fm;6i67z)kD3d>b&Uo@3)uQbz0q>kKZSf}0S-kODUBpnPo>Nkd z0J_FcymNICk9VxHjkg^)oymI$(SSiAlL<~ZVQ+`gLE#1!9Mxt2U<6~+sHKEyqf-ipns?+n8eRBM8)m-*;(*dK 
zsf{|X;A#`^A1JCZdakkta&Hx|{q7Pe_V?)TpH}_6<`X;0M(l-OlgKaQvQE(cg=)a7HIXMWUO06@#I3wxqW3|xwgA` zI$9>?jj#QM79LqKt?!FUe>sw@F@v4$yNjRY%K4s65?z%H+9qIShis?tJdvG90}m+c zbL@EzyeZE?q@&i-S!}OJ-$+j}NW6Qct?INFk$ncKm(_LXA$*D`!I3&fM32^}>9(@f z(_f}lD0dTfRDrKvflBv*4`2E4>LaQNEcNC}=j(Czm}X45mr=@$Gu zfz80rv(i6(6iaN^GW`)HGOvklx<2W$G!k^5w(PDf=^ws1Saag0a|Fz5&Zq^d@^Y1Y zi>Xr$nZZvm(=I(#+~GZ_VoW;{>Ganv?huEZtiS*WJP{2RfrYi#A(e=X&CXn^@ncleUyICzplXpfmE?`=xk|#SPu+biG-c=nXC0PG6c1&s)Qcb^S zwLUZ!vY{!XPD5m5S)Gw{a%a`$Coj`4orm{%B2r9!ZJqDYSZy}Wv%9{psMF(BG=XggJ-=_B>uWJV6sdv=yi&5SD zad!ruLL_tX85w5pbe8H!D>YJDvEMJDO*Y5q=F`~5XIEGcBPl* zdadko8UgR=aOGO{rJZmqctgbjypjjr{PihrnYO-hwHaiZ;3;5A_Q4S5-=~PP69|kC* z=&3o%cy18jb+WwLOfGiP{kTPtgF(IMf+>|BD4YB2KOD1AG8_1uh%uxDY}faEcZRk@ z0S7+i%~))vboSD1i7t`&0%_-BJ>mM{5VW5~CE*@>Us zf!|cgNv-8Ha5YRR0~2+^W|X2o+LqK`hyYhbW;T9FM)&GUQUcv!p$VGi3X$=q*9h)N z3lF$8doL{hV7vkChiT~!WT}@K%skuiyA~e}*i{F&Jir-tJ?N7)%BaiEUEDUPJtT}< zrI$@3BlZ@1NAE#$OVz2N&I=*uU5o}4&W_IOlm3IzG-iEsIGIb!P*!D58Pw{TvtsU) z4+}1qqdOl@_76~*IhubPI>cZ!C`1}v2Zx7l_Om?)hWw(%h4N%tJt6KeX%rCxS_%f( z9uJYqwEE<3AdA@z|De?3dp>y0>VG1N=pnm^bFb^1{<6Oah4qs#`fy%zq_7>Gw-N=^ zhE*TZ_e!z4#AM=1d{qF#g=cn@P_3de)tPML9h@Fv^|KkU zNg?x-jx815%%v5zE{2>Rsl9p-361Lc*tlloT!nKR~uwNkdm>G=_ zBPTaT7r_1Uy02~P5MY(1|CsraeO8sUWzNqkD2LT9^u%>!@wjj3P>lQRXMeqS*FG_-k|~B2v=zv{OoWReC;|gvWeqx0b-Y7_rxq|evUCsu7gb7LIJ~#Dln;| z6|mg=9iva_W6ieykC@f`yMie>!QH1R&soI;7%riNh$4yL2tGr3gGuJ7#FE37oa8p) z_db6oa2JOOrC0`aHL!fu0?CG(#e%HJZ zqsP|0+w|&sDa@8c*=VL%wSkoUAa?nHyI$l#L`&8Y$0E34pEpMyyXXJw0*Khfy6|rZ zu+C$wM#dcW6{j-sqU!U|~)3<@} zButAAYvqiIM6MhcayILwAf)Y%5_h^>7HX595;l|#biG1FX^VI`768ohc6T9TU7him zcpz>2;BCSmM{gm!f4=4B$~qLAKO~{*9a1+*L?xeC6PaiX4hQ4M${`h0o*4X})%`ku zwdKo2$FUI?)Ztj{;}wRk&1ck#wjCF+F<@)iIp<%NV(u>(wU%%T_%9gEhocnz*5XU} zRKT-}cn6hKv4Jc43h{QW;0Eo}cZ+Jt@3mvlV|=g@g=GT+@cS+$DUxsp0?9kll6J`^ zo49xASlWL|1RIEvPfjIFW_BWPelOs-#ExH1P!FZ=+2@ifuZLtxZU!7O?4550j8IzT zy!!4Ol=>Esm05})(&o}C?W12|C09W5<; zqJD$U*h^N>CNQ~U=jG~)hP7qdxxE+XNBE1|^o3s9gVAlL^25@59F&x*fxpkkUb2^@ z5_;B0cY1N|M??`%qVXLI-RRh>gP)?8LNKy5O(#T)>W3*Sx{AH{HrLA>y!d*#?utAY z4g61Tf}nremUv95BJM51whpYyKLePlDLNYRB3aqWWX#mkHj!wtoRwCfC|16_<3_?CZhIWR(b5FAou9jYfnrL`+Mop z>$PRcQ&61D$|suP)YFiE9)^s+ZE*^P70>_cdp1Ug(mu>bh)g4!Umz77+5>WzSDuIj zI-n?!-^AIV%Y+h|%yWMuMM-fRqoRE%;+0969*{lNDQ^_nq0m_#?3z3*@7Wp`eT8z= zv&omK4?YoEl#nI8sZRN8ve^uEoAU|yV>Ka`v?z4>J_>LihgD{eq_Mq=R%}RnwDfP* zd}|{{z*LTpe?B_Zl%qnIprneVl0j=!!nBR%73CzBWdjXZ_}=YMlt7GLuaRsVRI|6g z+@B)Pylgk3Ix}tJSoX$n9rLMo>aO*uMQ_Mn+*x@1BNX-;WF=b^qZ0%Uoyq`{)||35jo-ma z2(Y2rY|M<8KTrV3TZ?bu!XjX8Et+I%2f$-&I`=%UY;%4V*9*NT8aikZ@`AhX#6X1- zt!1NvLdOLBsJ$U-LbQNIqIn9tG7R>iVCqPcEJ!p}(yOpQ3C1PxL+n@Mll+<+qLZoW zwP=oRWk}h}-%U{07{=G%YvhLkO6(}!(Qs6BnQqVc=PTKzh<{-gczb$^_!mw52ULi_ zpc%7yQk!L|1{kxz)z@yxsF_1Auo22YO-Qlx=4{O4Svkxaum*|~6z2w_opiE}ay*g@ zxRGY2mmh4+qszD49vOEk;RsJR#}v*{wm#7+*LELyDZ%T_ zkWOP}u;Y4*AImmf%fEIxzCTK8od$rW6{P|np_i>E_eiH?auX3(8(rn_RpU#Wc_EDA3Bv5_0fqUz)moatXHMzIS+K?KQom^#jl!g@+`=rk_ zn=36KQbs)30yq~;%hv|1&3@(ykHBAUg~<0q`N^YVu0&T-!gf8~py$K=rd^a|)Xqm9 z{b`_jk;g(|77g9Oi^IwTyQ>3kT@tMQ7q6b*0-8k$YyQVmhe&wd>(Jrj5Vwf|Wxq-4 z=n`#|uDr(DbMQlBMDnCpI5Y?b!G&Vf>N0CiM}^*x^8_eB9IQQ8I?NgO`Q)@QTa$Mm zZW}5-r_;B zr#Q3em2ASv_A_rA+Kmb|Kx;I>Z`GBVI!v&U3o77IBh7Z;r(dfaZu3&*rZeXosmv=8 z`6AE1uZl4tc$y`E#Wkq@U$+Z0T7J%y{vS&7K!-&lQd#UqJs`daCmJj@M8=hE`Iz$T zG+BQqJ#ke9d=h29m51crf`pm2ojCO!EbNY;>Z2%rXcyo)>i}x)f_y0cDmNN;mu#BH zH&^zT-+t{K5WBv11~#AY`S@8GtrC{wQ-G*n$@?2;^-6UJ;=O$Hb{eyQm_@JHOswzX zTK!tW$>(#4M}h+H;RuJsB#KY-N8f2=Hk?-d4}2c$)^eRRIRJ_DMDZN{yR${?nrkC& z-yqY;J{eb?Amq!|QcsBb)mSV?m*Oj+yrZ~B#C0}#%ql7c74CpPLUdi~1HROWiZCM0 zWNzhN?v=jE&j$m>0Bb$VcT;qu+dlk82NgMP2BBGnE+VBatF(nKR 
zvSsVO*!Ho#<8VEa-}FYIeH=Q~n=?#@0jCgcJAYpH{4A?w?D5oGubR!tJ?|u?wS8wp4>&vPYtO{5Ap3c5 z#(!g9WYt>I){Ra?^JE$Yvn?XB^xd~>dKf?PAf&DZJS7a7VmewuN#Lmi>LGKLz@0bA z^Zr{5v-e5U^2K!wEmhzO6|#g{_lmZcxt(CYj(%FnOv5q6twB}O(V+_`_O|jTDE^z>2uwz^a&$~np`~;=m`IpaYlYU6 zq$aVs|C``fI zyGuiHdFH-2QvX@w!OX-Z_!nQ4Crh69y!)hRmsRiHx|McK+q)JP{+Ue=yi$(^j`%tuKSnMZRenL zeP2&NM<(iSYSh_VCd7tQwcCUU^NI%wv^xna4!L(Ade zdHkkAFHg$Z6UVM>M9f7WrIa=5VH};G_AOb*2ef z&d;9*+_%O3Ohg5Zfu|6`gtrR{J*l-MmB0oH<%C~hWt8~Q3^f!v_UsLBs&s-c5>&&c zn;}ZHkP&cy?(X2jGc)lyC~FJk|K3P0@xOrND#l>Mt7&ZXolE+Y%qOrrLk&t!D+>*R z`rX!?y}b0uE(D=7M0#594Pnch;3%qIJshM+MF^|YmlZ1fjoN16~JgAV`M z>wM~ynR~I;K~b(!7m{oS3uEBtH<4JF4ub%z#`}u8z8tvbIR5qH(eG^3jGd!M)cRG% zju~>p!{%Nepoz)d|_wuQrii&f)uE8Z8W33zhO+bZW zwK2`agyL+uI$xij$6RWY35i0C>ig8_e6!1^1z65ezv-qQXA9}9&#-WfU=cXC{vx^Y z-4riLKR;*+C99a+L7k?)RFM}3AQ7z=4 z$=gXY8K#@E2aaE=D{_fKOYv*d8}bShd`#g}kaG3|h1}CqoVly9InyMY7=@T`fPE*l zr!5{GiD97jrg44(%?J|VOXJ6*6$Z1^x;6dr$;nBO4K&9jC2h@ir+d(u)h~jZ@tJU@ zv^h;CF1HY?=@Y2St^hBK*0$EXh6dK@N$XPyKyFe^(|+^t=^#A%g4V`Wdgl*@+&`{D z9k}=P!8nWKE-Fzgy_Oqb-$&@GohYx37vPNeEF5#iC+@(nL;KPV#5-MVorQbbFIG#^ z;-U1*#-Y&r+mtuqoawE9dBvIMCODDKP1L$`o~w(m^fq%9`8)!*{>`0wCOmx%P} zR?6m{18^gI+JE)ozmUod8A2D;88=AM(C4zgf)m^AbyQea`8z0zQpD=Z;NE8q{9`DL zhV^>4g1+*AWqau)yMPx!Zi}zNL3*aiOnr#QWkg-1>F-i*I-e9JvciuSPo^uBgKGBJ z7{A>U)9(7zfL~le?~>a14Y>nfR0Mf?p)Oy-!PSLgw5Ets#)?V zNHBAnoj(2{Lu#Asr)4=&g;3VTM{C#T{gcKq z6H~Chb%1&cY<^U*rSN9in=b+EyFP4jZpWk_5Tn3+R3LqnMUCU zLrnf&m)JbCLqurWJjDL>w#3}@oWCw*>wGg2TmUwZ*$?>haw%Rd}uMCK~Z@LylB?P1qSm~B-mQuQ<8>FRsX%Oj_ZjkQoR=S%7mRO~hMq;TY z-o2jZeXjfdyx;yi^P8D-&YTJKH~`Dc{`sZG9sA1cFYa2@{n9Fz=t?!24xM$xx|D7D zECK058b?@gaPNo^=$F6xj7X`iHX*0shZ&w56wwZgTpMdEBgC{)T~r_3Q{f zhLtYj=uk6e3qF5^Bs%{V&w=9=;RM41{YbF?#t+vc;_v3ChT4_LGRpuZ-wtm+Q+{t&tdeAgK-g+n} zk+%gdy96t&*hw5*balB*oa>!y{r>DJ_=En)fzp_*_gUmsKf>6x+@yQ}F?X6vu0_#v zjP8~*6UQMHuZ4C-zkx#y6V081;i8D86{O}+7^ufyM~!}4nzk187|i?-&`_iM()Di= zKx`{7sjWVk7~-e%;{b8jYaii%`H#bvqKdIqvUGi@aKcv!@uSq^GMVs_kFP<7^^sf~ zm1SS@mOd>5^oRQAbYKg}U3U)baEavnt7XEChvC&$y*Q{mJakF!hj--bHP7*o%1qoK zMz)n$HeYMjswfRy6J~oh06w zjh)ngo5(XYH&EYOpg=>+#UT0VP-!RL98A*K7yb|gXiVLxOqO9e|#z-X>cm8GwpC1lY05V3vQp5u^dtCO_>a^d#>-%NX z%fql@@rvy=R{4^UgYl^e)M_O9l|U2Yn-0*bO?d0wPoYm3Z2X|KQ1^Er;C#leXp$70 z9WR7me@#bm>BsxvilfQ8ARyWH%&WkwwrIhk^)=bRv!X7dF(!x{aV9Fw+jk%%$$%;7 z5prkqd;CoCv(ejHsN|s*{g8bVrn#^nA>MY1xte*da{WpXK@-~J%ygb}LSdJzAL6`_ zV8bQxliD8yT?~NEIynC&I&a#T?MiNcA4Hyn;Cl%np4-5vs%0`XpsG#{yg0>bY3Mq&X6VS?*@~qNb|jZ({$ZT%K}G6d76boK@-gNtyZteK!!6tJWKIEz%xw>P2Y!c z4T7k=i_WK^V%fsnPB%jBof_tzrpG^c|8?G#)f%3`w^ zzV$N5JQD?)+!f>w(-@I72s_oaK4HD`S<3dUqxh!pK1gOfCeORI z+7eVE4aP|2huU7yb*&Pg191;s<{o~XtbxHiOwm>=y)kr*`ah!mnj7ixLlh?JeZy_v zqb@FMQqZakH8;?zr2f3O#9wKYqRa_XN_dsFCmZOQB@%(6zEQHwX(toMpkc_1=8rhGCKR*W! 
zjLSw67hi5sZXCIY=+P~_R!n)_2$~Q_4ZV^u2WeAW;L0?v>2vJU`F?F+U(w~1Cj2Tb zpO1DJv!eAi!b4boX{S!>f3&i!0e_t*Ud9*oX?==XL;83#2%ThKAd@xGF5%53*t7sY zN^~b)7qVA%Ko)HGJQ8-IW-ua8qqD`BL2bGkCe>#5r_5#*Q8@zni9StZ;q*d?viX zWwY-+RN;L-WL}=A5P%)u>QK*7I>|;9-BN9aRU@tsy_@OGU7is>yLrA#S}Bor?}uVx zo^QZ=z;0Cm008+rOedV2Dx$w&}5l?R@5v zEgwep8fZxCdW|WUMe2LB)4kg>e*G#|3hho5Ty5zkD`!=o-DVk(`ssKCk+?4Z4S?y@ zn|C0}C2sv&tb*eWC!H?uG5|pZ!AcQJw#Wea|GzOKABcgPC=a)E2zH%HQ-T`tRvV&01-qSo$Kk0_-5C?PgdJWDjGQ;hxp&1L_G znE336Mqx>k^qg+wsOa#; z^on4XWoF1WEnB~wx-uV*fgy)U_l?{w>A|t!2iJ=Sac?E0N-|PL?`ja3mt-PIzZpxL z#z%(rGcvKeio|*;A1j7_tsB8Iub^=o^>u&}ny#sa{D;yLX|}Sk04S3_-l;e(J&)*MHEQE7vKAAQb~Q zY(!H|!1W7xPas&j;dGYH8yHNxO9r~Sy)!2vd%$)jWL7KS`r_G@Pa;Yr(-GrQ>V&PYk%MB9sM~T`KhAh z=XfC`gBpXzQESN!KN~L}DgT46+MJ+YZdE z;L0L3$A!8_aOI9x;$*Sr1#IQZ5N&3!oJ^s3pT@uA6AjB-#XI#r^b1&F9^c7y(-Cgm zitCoAF(%+)E=w#jW088ILnDaFp&C}F4rdqg&d@v6XcamBwvB{?dB+-&7@Ov=>*XsJ z=L44U&$DSgMr%wc#Ap~`R&vklx7_vyorvo_R69%_I{GOgpP+I;3x#$NSylUvz%}+N z&iB+d?7B~dT1YvDcGr$%Wh~~HcKlzrsf%~NguSaww3$BOBk!`rQmJ>l5XwCuRo!&9 zmMR-+eVq;s-XQ$8i>=bJ_@~DP!zU8Y_I7D49$Wqj7U>pS;;N&kyLqAHpU0bB;^>SE1OtJLD$jt4pP&C+>S#BdU_oW zO`%bg_~LYFA2FqRz5^xJIW*h`?CTyvU>bz;k6&+W$><U3^H%hoc^c;+xRB**I@;g%JXBXTN4v(nO37Bvd)@FhA)r3*el-+*PY zJbel!TwT)+7sM)W_>mcDq0fRRWkd-?8hkaaZ;X~iUcFziL+Usf%aXSnG|xG#Kh#V0 z?IIWd`<5X>2(Q0`qUSZpt4L5;w!)9fVsp0TNMFUX`4F;iM*k);7 z@oWgwMP3KbnR>f5tBF*lo3IWaiwCINbUhqljU(M}Lk=qbru&6$vuj1WxlFeN(~+f^ z4pVZHw^U4HVo);5ed#$~#T{w>l+XoQ^s{!^7`m=nXIsjT)ok|OdR}Wped-TK*%*AA z5qMpBCBQOp7|^KlMacS9N?e*@ugu2T(+Uvxnfw^{L*Tlos5k#4!)Mdyic>VyVK|C!%hLw3?0)cmU%$+JdD39iPW>#JK0X2bB$m zk7dWt0y`Laa`l(QN(EyWJ9m#=V~-yVHI~v#bTofeZG3#e(HW+vPHB&onwfl`p5oe) z$V}_?iz9*8+#r{GML9m}Xr3l@)Y*QXV6$W#D}tw94F=hRAc84DVXN^#fsElMkWn~iZiuj@!E0@VUbBnyNw6Yed%<1p41@EjL8HKhJi~im;i1uMC zZo2qdvrEHZ(9ge=>Z9}+_6+=UFHot2M z=ux{IAx~{4w&s1_c#Ji8{zq_4#z?R>HSW?0>5_y&Bi*LL7M`_}eY{6F%j-_M=SO6_ zcnpsa^whL4YP3mvR&pUzJnzwx-?ugA?XZQq%uZc3f5e)z2pN?nl%Xo)qTNgf4`Q0@ zD9jL-6A7q&S-MPm#gS#V;k$3bSfqXbd&e^~i+(2)d^DFvQvqNB2ke$!XXZg$kfvnt z7uX)n02+r{i?XkO;jQnS0=yAE7f%7VUstF(f|v6-DDSVFR3k^c(uKUALa%6sDy1Y& zlRlw?C2gai@}^&#_%yoXOwwH~0|xzCHJ?I-`QWWi?kxeq8P4hvAIf!fDD=y^MjN@X~uQ^c%OXOo3B4# zqrDljdNyG8Ke6)l88U2wHZkb*S7UZzxROs1+1f}yGP1$HhP*iiY%w;%gDdFf5S_p6 zSn=FWBxLD--8WzsWqbKNl3B-C??OL)$@}x0|)^oFT3=#%>eh@<#1G zSiMu}Vq3>vvo7NkflrUr1SObjEYs0YX!9=-*vK6|`mq~3$nR%oZN+UDb4ztnbu9Pp?LCkwc?rD zYR2~(XDqW|(@gDm`1i{Qb<_9GX)iwN84GGG=~5>^%%!O->v7Pi>arx@J^9}#P~ZH| z|Aw{{s^qPNB!{*_#oV5axE@EuKch_ z7ncy7>-pV#aCE$X@CEKeYiX&XmeyYt+KvJ+;XGkf{+lw?V9b<3!L%zv*>THL;n6Pm zVIWs#Cj0B^+sm-0?iIgV_;9-UfDv)%48INR@nO@p(*$z&H3Hwj$H-c#{#bL?W2eCH zu3rL;fXki|&s0GfPv4%L%)_5(aFOlvQu;pf9M7L_Z+)(NN)x{bA$dbp%G(WSc2g)z zRL_01K&EQ0$?ZZE1m?X@Nr`Rid#fTvqyqHyYET+mIbkAKZmU?^kw=8JJ##k>g}J(-WbUomO35- z676vvZP#_P(~59j+s5iOH-GCe&s5<$so}c)O}@&o0-%!&0*nrhTX5sG{dAR7&(o5s zW2R+Y(hKSPEeimhUndHW;j?ubN6hj@C|BZO2ORmwj6yZdQybqCPJi3dV=YM~e)0@% z7Ak!6MG*B2G4Q2LaR5US`MUpa#**UCXj0$3s=Lay-J?uUPlqtGH`qlk5GQHU>Ic7n zy~2L~oGY8CI3ILrTajB=5(={T5#$oJ@4(k8+Aw)U_@_2`#?7$1X#711$uj$^<(mAG4q98;&kbZeEZbFt)*J3F zscrQ&gxAZ69q+TR^n11)!pM9*%yif-fKh7VwU#}`l{0yc(>jWBIGi378wLGtYAp(= z4rLuA7#B52b--k=Qlp0#;! 
[GIT binary patch data (base85-encoded binary files) omitted]
+
+!
+"
+#
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+/
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+=
+>
+?
+@
+[
+\
+]
+^
+_
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+{
+|
+}
+~
+£
+¤
+¥
+§
+©
+«
+®
+°
+±
+²
+³
+µ
+·
+¹
+º
+»
+¼
+×
+ß
+æ
+÷
+ø
+đ
+ŋ
+ɔ
+ə
+ɡ
+ʰ
+ˇ
+ˈ
+ˊ
+ˋ
+ˍ
+ː
+˙
+˚
+ˢ
+α
+β
+γ
+δ
+ε
+η
+θ
+ι
+κ
+λ
+μ
+ν
+ο
+π
+ρ
+ς
+σ
+τ
+υ
+φ
+χ
+ψ
+ω
+а
+б
+в
+г
+д
+е
+ж
+з
+и
+к
+л
+м
+н
+о
+п
+р
+с
+т
+у
+ф
+х
+ц
+ч
+ш
+ы
+ь
+я
+і
+ا
+ب
+ة
+ت
+د
+ر
+س
+ع
+ل
+م
+ن
+ه
+و
+ي
+۩
+ก
+ง
+น
+ม
+ย
+ร
+อ
+า
+เ
+๑
+་
+ღ
+ᄀ
+ᄁ
+ᄂ
+ᄃ
+ᄅ
+ᄆ
+ᄇ
+ᄈ
+ᄉ
+ᄋ
+ᄌ
+ᄎ
+ᄏ
+ᄐ
+ᄑ
+ᄒ
+ᅡ
+ᅢ
+ᅣ
+ᅥ
+ᅦ
+ᅧ
+ᅨ
+ᅩ
+ᅪ
+ᅬ
+ᅭ
+ᅮ
+ᅯ
+ᅲ
+ᅳ
+ᅴ
+ᅵ
+ᆨ
+ᆫ
+ᆯ
+ᆷ
+ᆸ
+ᆺ
+ᆻ
+ᆼ
+ᗜ
+ᵃ
+ᵉ
+ᵍ
+ᵏ
+ᵐ
+ᵒ
+ᵘ
+‖
+„
+†
+•
+‥
+‧
+
 +‰ +′ +″ +‹ +› +※ +‿ +⁄ +ⁱ +⁺ +ⁿ +₁ +₂ +₃ +₄ +€ +℃ +№ +™ +ⅰ +ⅱ +ⅲ +ⅳ +ⅴ +← +↑ +→ +↓ +↔ +↗ +↘ +⇒ +∀ +− +∕ +∙ +√ +∞ +∟ +∠ +∣ +∥ +∩ +∮ +∶ +∼ +∽ +≈ +≒ +≡ +≤ +≥ +≦ +≧ +≪ +≫ +⊙ +⋅ +⋈ +⋯ +⌒ +① +② +③ +④ +⑤ +⑥ +⑦ +⑧ +⑨ +⑩ +⑴ +⑵ +⑶ +⑷ +⑸ +⒈ +⒉ +⒊ +⒋ +ⓒ +ⓔ +ⓘ +─ +━ +│ +┃ +┅ +┆ +┊ +┌ +└ +├ +┣ +═ +║ +╚ +╞ +╠ +╭ +╮ +╯ +╰ +╱ +╳ +▂ +▃ +▅ +▇ +█ +▉ +▋ +▌ +▍ +▎ +■ +□ +▪ +▫ +▬ +▲ +△ +▶ +► +▼ +▽ +◆ +◇ +○ +◎ +● +◕ +◠ +◢ +◤ +☀ +★ +☆ +☕ +☞ +☺ +☼ +♀ +♂ +♠ +♡ +♣ +♥ +♦ +♪ +♫ +♬ +✈ +✔ +✕ +✖ +✦ +✨ +✪ +✰ +✿ +❀ +❤ +➜ +➤ +⦿ +、 +。 +〃 +々 +〇 +〈 +〉 +《 +》 +「 +」 +『 +』 +【 +】 +〓 +〔 +〕 +〖 +〗 +〜 +〝 +〞 +ぁ +あ +ぃ +い +う +ぇ +え +お +か +き +く +け +こ +さ +し +す +せ +そ +た +ち +っ +つ +て +と +な +に +ぬ +ね +の +は +ひ +ふ +へ +ほ +ま +み +む +め +も +ゃ +や +ゅ +ゆ +ょ +よ +ら +り +る +れ +ろ +わ +を +ん +゜ +ゝ +ァ +ア +ィ +イ +ゥ +ウ +ェ +エ +ォ +オ +カ +キ +ク +ケ +コ +サ +シ +ス +セ +ソ +タ +チ +ッ +ツ +テ +ト +ナ +ニ +ヌ +ネ +ノ +ハ +ヒ +フ +ヘ +ホ +マ +ミ +ム +メ +モ +ャ +ヤ +ュ +ユ +ョ +ヨ +ラ +リ +ル +レ +ロ +ワ +ヲ +ン +ヶ +・ +ー +ヽ +ㄅ +ㄆ +ㄇ +ㄉ +ㄋ +ㄌ +ㄍ +ㄎ +ㄏ +ㄒ +ㄚ +ㄛ +ㄞ +ㄟ +ㄢ +ㄤ +ㄥ +ㄧ +ㄨ +ㆍ +㈦ +㊣ +㎡ +㗎 +一 +丁 +七 +万 +丈 +三 +上 +下 +不 +与 +丐 +丑 +专 +且 +丕 +世 +丘 +丙 +业 +丛 +东 +丝 +丞 +丟 +両 +丢 +两 +严 +並 +丧 +丨 +个 +丫 +中 +丰 +串 +临 +丶 +丸 +丹 +为 +主 +丼 +丽 +举 +丿 +乂 +乃 +久 +么 +义 +之 +乌 +乍 +乎 +乏 +乐 +乒 +乓 +乔 +乖 +乗 +乘 +乙 +乜 +九 +乞 +也 +习 +乡 +书 +乩 +买 +乱 +乳 +乾 +亀 +亂 +了 +予 +争 +事 +二 +于 +亏 +云 +互 +五 +井 +亘 +亙 +亚 +些 +亜 +亞 +亟 +亡 +亢 +交 +亥 +亦 +产 +亨 +亩 +享 +京 +亭 +亮 +亲 +亳 +亵 +人 +亿 +什 +仁 +仃 +仄 +仅 +仆 +仇 +今 +介 +仍 +从 +仏 +仑 +仓 +仔 +仕 +他 +仗 +付 +仙 +仝 +仞 +仟 +代 +令 +以 +仨 +仪 +们 +仮 +仰 +仲 +件 +价 +任 +份 +仿 +企 +伉 +伊 +伍 +伎 +伏 +伐 +休 +伕 +众 +优 +伙 +会 +伝 +伞 +伟 +传 +伢 +伤 +伦 +伪 +伫 +伯 +估 +伴 +伶 +伸 +伺 +似 +伽 +佃 +但 +佇 +佈 +位 +低 +住 +佐 +佑 +体 +佔 +何 +佗 +佘 +余 +佚 +佛 +作 +佝 +佞 +佟 +你 +佢 +佣 +佤 +佥 +佩 +佬 +佯 +佰 +佳 +併 +佶 +佻 +佼 +使 +侃 +侄 +來 +侈 +例 +侍 +侏 +侑 +侖 +侗 +供 +依 +侠 +価 +侣 +侥 +侦 +侧 +侨 +侬 +侮 +侯 +侵 +侶 +侷 +便 +係 +促 +俄 +俊 +俎 +俏 +俐 +俑 +俗 +俘 +俚 +保 +俞 +俟 +俠 +信 +俨 +俩 +俪 +俬 +俭 +修 +俯 +俱 +俳 +俸 +俺 +俾 +倆 +倉 +個 +倌 +倍 +倏 +們 +倒 +倔 +倖 +倘 +候 +倚 +倜 +借 +倡 +値 +倦 +倩 +倪 +倫 +倬 +倭 +倶 +债 +值 +倾 +偃 +假 +偈 +偉 +偌 +偎 +偏 +偕 +做 +停 +健 +側 +偵 +偶 +偷 +偻 +偽 +偿 +傀 +傅 +傍 +傑 +傘 +備 +傚 +傢 +傣 +傥 +储 +傩 +催 +傭 +傲 +傳 +債 +傷 +傻 +傾 +僅 +働 +像 +僑 +僕 +僖 +僚 +僥 +僧 +僭 +僮 +僱 +僵 +價 +僻 +儀 +儂 +億 +儆 +儉 +儋 +儒 +儕 +儘 +償 +儡 +優 +儲 +儷 +儼 +儿 +兀 +允 +元 +兄 +充 +兆 +兇 +先 +光 +克 +兌 +免 +児 +兑 +兒 +兔 +兖 +党 +兜 +兢 +入 +內 +全 +兩 +八 +公 +六 +兮 +兰 +共 +兲 +关 +兴 +兵 +其 +具 +典 +兹 +养 +兼 +兽 +冀 +内 +円 +冇 +冈 +冉 +冊 +册 +再 +冏 +冒 +冕 +冗 +写 +军 +农 +冠 +冢 +冤 +冥 +冨 +冪 +冬 +冯 +冰 +冲 +决 +况 +冶 +冷 +冻 +冼 +冽 +冾 +净 +凄 +准 +凇 +凈 +凉 +凋 +凌 +凍 +减 +凑 +凛 +凜 +凝 +几 +凡 +凤 +処 +凪 +凭 +凯 +凰 +凱 +凳 +凶 +凸 +凹 +出 +击 +函 +凿 +刀 +刁 +刃 +分 +切 +刈 +刊 +刍 +刎 +刑 +划 +列 +刘 +则 +刚 +创 +初 +删 +判 +別 +刨 +利 +刪 +别 +刮 +到 +制 +刷 +券 +刹 +刺 +刻 +刽 +剁 +剂 +剃 +則 +剉 +削 +剋 +剌 +前 +剎 +剐 +剑 +剔 +剖 +剛 +剜 +剝 +剣 +剤 +剥 +剧 +剩 +剪 +副 +割 +創 +剷 +剽 +剿 +劃 +劇 +劈 +劉 +劊 +劍 +劏 +劑 +力 +劝 +办 +功 +加 +务 +劣 +动 +助 +努 +劫 +劭 +励 +劲 +劳 +労 +劵 +効 +劾 +势 +勁 +勃 +勇 +勉 +勋 +勐 +勒 +動 +勖 +勘 +務 +勛 +勝 +勞 +募 +勢 +勤 +勧 +勳 +勵 +勸 +勺 +勻 +勾 +勿 +匀 +包 +匆 +匈 +匍 +匐 +匕 +化 +北 +匙 +匝 +匠 +匡 +匣 +匪 +匮 +匯 +匱 +匹 +区 +医 +匾 +匿 +區 +十 +千 +卅 +升 +午 +卉 +半 +卍 +华 +协 +卑 +卒 +卓 +協 +单 +卖 +南 +単 +博 +卜 +卞 +卟 +占 +卡 +卢 +卤 +卦 +卧 +卫 +卮 +卯 +印 +危 +即 +却 +卵 +卷 +卸 +卻 +卿 +厂 +厄 +厅 +历 +厉 +压 +厌 +厕 +厘 +厚 +厝 +原 +厢 +厥 +厦 +厨 +厩 +厭 +厮 +厲 +厳 +去 +县 +叁 +参 +參 +又 +叉 +及 +友 +双 +反 +収 +发 +叔 +取 +受 +变 +叙 +叛 +叟 +叠 +叡 +叢 +口 +古 +句 +另 +叨 +叩 +只 +叫 +召 +叭 +叮 +可 +台 +叱 +史 +右 +叵 +叶 +号 +司 +叹 +叻 +叼 +叽 +吁 +吃 +各 +吆 +合 +吉 +吊 +吋 +同 +名 +后 +吏 +吐 +向 +吒 +吓 +吕 +吖 +吗 +君 +吝 +吞 +吟 +吠 +吡 +否 +吧 +吨 +吩 +含 +听 +吭 +吮 +启 +吱 +吳 +吴 +吵 +吶 +吸 +吹 +吻 +吼 +吽 +吾 +呀 +呂 +呃 +呆 +呈 +告 +呋 +呎 +呐 +呓 +呕 +呗 +员 +呛 +呜 +呢 +呤 +呦 +周 +呱 +呲 +味 +呵 +呷 +呸 +呻 +呼 +命 +咀 +咁 +咂 +咄 +咆 +咋 +和 +咎 +咏 +咐 +咒 +咔 +咕 +咖 +咗 +咘 +咙 +咚 +咛 +咣 +咤 +咦 +咧 +咨 +咩 +咪 +咫 +咬 +咭 +咯 +咱 +咲 +咳 +咸 +咻 +咽 +咿 +哀 +品 +哂 +哄 +哆 +哇 +哈 +哉 +哋 +哌 +响 +哎 +哏 +哐 +哑 +哒 +哔 +哗 +哟 +員 +哥 +哦 +哧 +哨 +哩 +哪 +哭 +哮 
+哲 +哺 +哼 +哽 +唁 +唄 +唆 +唇 +唉 +唏 +唐 +唑 +唔 +唠 +唤 +唧 +唬 +售 +唯 +唰 +唱 +唳 +唷 +唸 +唾 +啃 +啄 +商 +啉 +啊 +問 +啓 +啕 +啖 +啜 +啞 +啟 +啡 +啤 +啥 +啦 +啧 +啪 +啫 +啬 +啮 +啰 +啱 +啲 +啵 +啶 +啷 +啸 +啻 +啼 +啾 +喀 +喂 +喃 +善 +喆 +喇 +喉 +喊 +喋 +喎 +喏 +喔 +喘 +喙 +喚 +喜 +喝 +喟 +喧 +喪 +喫 +喬 +單 +喰 +喱 +喲 +喳 +喵 +営 +喷 +喹 +喺 +喻 +喽 +嗅 +嗆 +嗇 +嗎 +嗑 +嗒 +嗓 +嗔 +嗖 +嗚 +嗜 +嗝 +嗟 +嗡 +嗣 +嗤 +嗦 +嗨 +嗪 +嗬 +嗯 +嗰 +嗲 +嗳 +嗶 +嗷 +嗽 +嘀 +嘅 +嘆 +嘈 +嘉 +嘌 +嘍 +嘎 +嘔 +嘖 +嘗 +嘘 +嘚 +嘛 +嘜 +嘞 +嘟 +嘢 +嘣 +嘤 +嘧 +嘩 +嘭 +嘮 +嘯 +嘰 +嘱 +嘲 +嘴 +嘶 +嘸 +嘹 +嘻 +嘿 +噁 +噌 +噎 +噓 +噔 +噗 +噙 +噜 +噠 +噢 +噤 +器 +噩 +噪 +噬 +噱 +噴 +噶 +噸 +噹 +噻 +噼 +嚀 +嚇 +嚎 +嚏 +嚐 +嚓 +嚕 +嚟 +嚣 +嚥 +嚨 +嚮 +嚴 +嚷 +嚼 +囂 +囉 +囊 +囍 +囑 +囔 +囗 +囚 +四 +囝 +回 +囟 +因 +囡 +团 +団 +囤 +囧 +囪 +囫 +园 +困 +囱 +囲 +図 +围 +囹 +固 +国 +图 +囿 +圃 +圄 +圆 +圈 +國 +圍 +圏 +園 +圓 +圖 +團 +圜 +土 +圣 +圧 +在 +圩 +圭 +地 +圳 +场 +圻 +圾 +址 +坂 +均 +坊 +坍 +坎 +坏 +坐 +坑 +块 +坚 +坛 +坝 +坞 +坟 +坠 +坡 +坤 +坦 +坨 +坪 +坯 +坳 +坵 +坷 +垂 +垃 +垄 +型 +垒 +垚 +垛 +垠 +垢 +垣 +垦 +垩 +垫 +垭 +垮 +垵 +埂 +埃 +埋 +城 +埔 +埕 +埗 +域 +埠 +埤 +埵 +執 +埸 +培 +基 +埼 +堀 +堂 +堃 +堅 +堆 +堇 +堑 +堕 +堙 +堡 +堤 +堪 +堯 +堰 +報 +場 +堵 +堺 +堿 +塊 +塌 +塑 +塔 +塗 +塘 +塚 +塞 +塢 +塩 +填 +塬 +塭 +塵 +塾 +墀 +境 +墅 +墉 +墊 +墒 +墓 +増 +墘 +墙 +墜 +增 +墟 +墨 +墩 +墮 +墳 +墻 +墾 +壁 +壅 +壆 +壇 +壊 +壑 +壓 +壕 +壘 +壞 +壟 +壢 +壤 +壩 +士 +壬 +壮 +壯 +声 +売 +壳 +壶 +壹 +壺 +壽 +处 +备 +変 +复 +夏 +夔 +夕 +外 +夙 +多 +夜 +够 +夠 +夢 +夥 +大 +天 +太 +夫 +夭 +央 +夯 +失 +头 +夷 +夸 +夹 +夺 +夾 +奂 +奄 +奇 +奈 +奉 +奋 +奎 +奏 +奐 +契 +奔 +奕 +奖 +套 +奘 +奚 +奠 +奢 +奥 +奧 +奪 +奬 +奮 +女 +奴 +奶 +奸 +她 +好 +如 +妃 +妄 +妆 +妇 +妈 +妊 +妍 +妒 +妓 +妖 +妘 +妙 +妝 +妞 +妣 +妤 +妥 +妨 +妩 +妪 +妮 +妲 +妳 +妹 +妻 +妾 +姆 +姉 +姊 +始 +姍 +姐 +姑 +姒 +姓 +委 +姗 +姚 +姜 +姝 +姣 +姥 +姦 +姨 +姪 +姫 +姬 +姹 +姻 +姿 +威 +娃 +娄 +娅 +娆 +娇 +娉 +娑 +娓 +娘 +娛 +娜 +娟 +娠 +娣 +娥 +娩 +娱 +娲 +娴 +娶 +娼 +婀 +婁 +婆 +婉 +婊 +婕 +婚 +婢 +婦 +婧 +婪 +婭 +婴 +婵 +婶 +婷 +婺 +婿 +媒 +媚 +媛 +媞 +媧 +媲 +媳 +媽 +媾 +嫁 +嫂 +嫉 +嫌 +嫑 +嫔 +嫖 +嫘 +嫚 +嫡 +嫣 +嫦 +嫩 +嫲 +嫵 +嫻 +嬅 +嬉 +嬌 +嬗 +嬛 +嬢 +嬤 +嬪 +嬰 +嬴 +嬷 +嬸 +嬿 +孀 +孃 +子 +孑 +孔 +孕 +孖 +字 +存 +孙 +孚 +孛 +孜 +孝 +孟 +孢 +季 +孤 +学 +孩 +孪 +孫 +孬 +孰 +孱 +孳 +孵 +學 +孺 +孽 +孿 +宁 +它 +宅 +宇 +守 +安 +宋 +完 +宏 +宓 +宕 +宗 +官 +宙 +定 +宛 +宜 +宝 +实 +実 +宠 +审 +客 +宣 +室 +宥 +宦 +宪 +宫 +宮 +宰 +害 +宴 +宵 +家 +宸 +容 +宽 +宾 +宿 +寂 +寄 +寅 +密 +寇 +富 +寐 +寒 +寓 +寛 +寝 +寞 +察 +寡 +寢 +寥 +實 +寧 +寨 +審 +寫 +寬 +寮 +寰 +寵 +寶 +寸 +对 +寺 +寻 +导 +対 +寿 +封 +専 +射 +将 +將 +專 +尉 +尊 +尋 +對 +導 +小 +少 +尔 +尕 +尖 +尘 +尚 +尝 +尤 +尧 +尬 +就 +尴 +尷 +尸 +尹 +尺 +尻 +尼 +尽 +尾 +尿 +局 +屁 +层 +屄 +居 +屆 +屈 +屉 +届 +屋 +屌 +屍 +屎 +屏 +屐 +屑 +展 +屜 +属 +屠 +屡 +屢 +層 +履 +屬 +屯 +山 +屹 +屿 +岀 +岁 +岂 +岌 +岐 +岑 +岔 +岖 +岗 +岘 +岙 +岚 +岛 +岡 +岩 +岫 +岬 +岭 +岱 +岳 +岷 +岸 +峇 +峋 +峒 +峙 +峡 +峤 +峥 +峦 +峨 +峪 +峭 +峯 +峰 +峴 +島 +峻 +峽 +崁 +崂 +崆 +崇 +崎 +崑 +崔 +崖 +崗 +崙 +崛 +崧 +崩 +崭 +崴 +崽 +嵇 +嵊 +嵋 +嵌 +嵐 +嵘 +嵩 +嵬 +嵯 +嶂 +嶄 +嶇 +嶋 +嶙 +嶺 +嶼 +嶽 +巅 +巍 +巒 +巔 +巖 +川 +州 +巡 +巢 +工 +左 +巧 +巨 +巩 +巫 +差 +己 +已 +巳 +巴 +巷 +巻 +巽 +巾 +巿 +币 +市 +布 +帅 +帆 +师 +希 +帐 +帑 +帕 +帖 +帘 +帚 +帛 +帜 +帝 +帥 +带 +帧 +師 +席 +帮 +帯 +帰 +帳 +帶 +帷 +常 +帼 +帽 +幀 +幂 +幄 +幅 +幌 +幔 +幕 +幟 +幡 +幢 +幣 +幫 +干 +平 +年 +并 +幸 +幹 +幺 +幻 +幼 +幽 +幾 +广 +庁 +広 +庄 +庆 +庇 +床 +序 +庐 +库 +应 +底 +庖 +店 +庙 +庚 +府 +庞 +废 +庠 +度 +座 +庫 +庭 +庵 +庶 +康 +庸 +庹 +庾 +廁 +廂 +廃 +廈 +廉 +廊 +廓 +廖 +廚 +廝 +廟 +廠 +廢 +廣 +廬 +廳 +延 +廷 +建 +廿 +开 +弁 +异 +弃 +弄 +弈 +弊 +弋 +式 +弑 +弒 +弓 +弔 +引 +弗 +弘 +弛 +弟 +张 +弥 +弦 +弧 +弩 +弭 +弯 +弱 +張 +強 +弹 +强 +弼 +弾 +彅 +彆 +彈 +彌 +彎 +归 +当 +录 +彗 +彙 +彝 +形 +彤 +彥 +彦 +彧 +彩 +彪 +彫 +彬 +彭 +彰 +影 +彷 +役 +彻 +彼 +彿 +往 +征 +径 +待 +徇 +很 +徉 +徊 +律 +後 +徐 +徑 +徒 +従 +徕 +得 +徘 +徙 +徜 +從 +徠 +御 +徨 +復 +循 +徬 +微 +徳 +徴 +徵 +德 +徹 +徼 +徽 +心 +必 +忆 +忌 +忍 +忏 +忐 +忑 +忒 +忖 +志 +忘 +忙 +応 +忠 +忡 +忤 +忧 +忪 +快 +忱 +念 +忻 +忽 +忿 +怀 +态 +怂 +怅 +怆 +怎 +怏 +怒 +怔 +怕 +怖 +怙 +怜 +思 +怠 +怡 +急 +怦 +性 +怨 +怪 +怯 +怵 +总 +怼 +恁 +恃 +恆 +恋 +恍 +恐 +恒 +恕 +恙 +恚 +恢 +恣 +恤 +恥 +恨 +恩 +恪 +恫 +恬 +恭 +息 +恰 +恳 +恵 +恶 +恸 +恺 +恻 +恼 +恿 +悄 +悅 +悉 +悌 +悍 +悔 +悖 +悚 +悟 +悠 +患 +悦 +您 +悩 +悪 +悬 +悯 +悱 +悲 +悴 +悵 +悶 +悸 +悻 +悼 +悽 +情 +惆 +惇 +惊 +惋 +惑 +惕 +惘 +惚 +惜 +惟 +惠 +惡 +惦 +惧 +惨 +惩 +惫 +惬 +惭 +惮 +惯 +惰 +惱 +想 +惴 +惶 +惹 +惺 +愁 +愆 +愈 +愉 +愍 +意 +愕 +愚 +愛 +愜 +感 +愣 +愤 +愧 +愫 +愷 +愿 +慄 +慈 +態 +慌 +慎 +慑 +慕 +慘 +慚 
+慟 +慢 +慣 +慧 +慨 +慫 +慮 +慰 +慳 +慵 +慶 +慷 +慾 +憂 +憊 +憋 +憎 +憐 +憑 +憔 +憚 +憤 +憧 +憨 +憩 +憫 +憬 +憲 +憶 +憾 +懂 +懇 +懈 +應 +懊 +懋 +懑 +懒 +懦 +懲 +懵 +懶 +懷 +懸 +懺 +懼 +懾 +懿 +戀 +戈 +戊 +戌 +戍 +戎 +戏 +成 +我 +戒 +戕 +或 +战 +戚 +戛 +戟 +戡 +戦 +截 +戬 +戮 +戰 +戲 +戳 +戴 +戶 +户 +戸 +戻 +戾 +房 +所 +扁 +扇 +扈 +扉 +手 +才 +扎 +扑 +扒 +打 +扔 +払 +托 +扛 +扣 +扦 +执 +扩 +扪 +扫 +扬 +扭 +扮 +扯 +扰 +扱 +扳 +扶 +批 +扼 +找 +承 +技 +抄 +抉 +把 +抑 +抒 +抓 +投 +抖 +抗 +折 +抚 +抛 +抜 +択 +抟 +抠 +抡 +抢 +护 +报 +抨 +披 +抬 +抱 +抵 +抹 +押 +抽 +抿 +拂 +拄 +担 +拆 +拇 +拈 +拉 +拋 +拌 +拍 +拎 +拐 +拒 +拓 +拔 +拖 +拗 +拘 +拙 +拚 +招 +拜 +拟 +拡 +拢 +拣 +拥 +拦 +拧 +拨 +择 +括 +拭 +拮 +拯 +拱 +拳 +拴 +拷 +拼 +拽 +拾 +拿 +持 +挂 +指 +挈 +按 +挎 +挑 +挖 +挙 +挚 +挛 +挝 +挞 +挟 +挠 +挡 +挣 +挤 +挥 +挨 +挪 +挫 +振 +挲 +挹 +挺 +挽 +挾 +捂 +捅 +捆 +捉 +捋 +捌 +捍 +捎 +捏 +捐 +捕 +捞 +损 +捡 +换 +捣 +捧 +捨 +捩 +据 +捱 +捲 +捶 +捷 +捺 +捻 +掀 +掂 +掃 +掇 +授 +掉 +掌 +掏 +掐 +排 +掖 +掘 +掙 +掛 +掠 +採 +探 +掣 +接 +控 +推 +掩 +措 +掬 +掰 +掲 +掳 +掴 +掷 +掸 +掺 +揀 +揃 +揄 +揆 +揉 +揍 +描 +提 +插 +揖 +揚 +換 +握 +揣 +揩 +揪 +揭 +揮 +援 +揶 +揸 +揹 +揽 +搀 +搁 +搂 +搅 +損 +搏 +搐 +搓 +搔 +搖 +搗 +搜 +搞 +搡 +搪 +搬 +搭 +搵 +搶 +携 +搽 +摀 +摁 +摄 +摆 +摇 +摈 +摊 +摒 +摔 +摘 +摞 +摟 +摧 +摩 +摯 +摳 +摸 +摹 +摺 +摻 +撂 +撃 +撅 +撇 +撈 +撐 +撑 +撒 +撓 +撕 +撚 +撞 +撤 +撥 +撩 +撫 +撬 +播 +撮 +撰 +撲 +撵 +撷 +撸 +撻 +撼 +撿 +擀 +擁 +擂 +擄 +擅 +擇 +擊 +擋 +操 +擎 +擒 +擔 +擘 +據 +擞 +擠 +擡 +擢 +擦 +擬 +擰 +擱 +擲 +擴 +擷 +擺 +擼 +擾 +攀 +攏 +攒 +攔 +攘 +攙 +攜 +攝 +攞 +攢 +攣 +攤 +攥 +攪 +攫 +攬 +支 +收 +攸 +改 +攻 +放 +政 +故 +效 +敌 +敍 +敎 +敏 +救 +敕 +敖 +敗 +敘 +教 +敛 +敝 +敞 +敢 +散 +敦 +敬 +数 +敲 +整 +敵 +敷 +數 +斂 +斃 +文 +斋 +斌 +斎 +斐 +斑 +斓 +斗 +料 +斛 +斜 +斟 +斡 +斤 +斥 +斧 +斩 +斫 +斬 +断 +斯 +新 +斷 +方 +於 +施 +旁 +旃 +旅 +旋 +旌 +旎 +族 +旖 +旗 +无 +既 +日 +旦 +旧 +旨 +早 +旬 +旭 +旮 +旱 +时 +旷 +旺 +旻 +昀 +昂 +昆 +昇 +昉 +昊 +昌 +明 +昏 +易 +昔 +昕 +昙 +星 +映 +春 +昧 +昨 +昭 +是 +昱 +昴 +昵 +昶 +昼 +显 +晁 +時 +晃 +晉 +晋 +晌 +晏 +晒 +晓 +晔 +晕 +晖 +晗 +晚 +晝 +晞 +晟 +晤 +晦 +晨 +晩 +普 +景 +晰 +晴 +晶 +晷 +智 +晾 +暂 +暄 +暇 +暈 +暉 +暌 +暐 +暑 +暖 +暗 +暝 +暢 +暧 +暨 +暫 +暮 +暱 +暴 +暸 +暹 +曄 +曆 +曇 +曉 +曖 +曙 +曜 +曝 +曠 +曦 +曬 +曰 +曲 +曳 +更 +書 +曹 +曼 +曾 +替 +最 +會 +月 +有 +朋 +服 +朐 +朔 +朕 +朗 +望 +朝 +期 +朦 +朧 +木 +未 +末 +本 +札 +朮 +术 +朱 +朴 +朵 +机 +朽 +杀 +杂 +权 +杆 +杈 +杉 +李 +杏 +材 +村 +杓 +杖 +杜 +杞 +束 +杠 +条 +来 +杨 +杭 +杯 +杰 +東 +杳 +杵 +杷 +杼 +松 +板 +极 +构 +枇 +枉 +枋 +析 +枕 +林 +枚 +果 +枝 +枢 +枣 +枪 +枫 +枭 +枯 +枰 +枱 +枳 +架 +枷 +枸 +柄 +柏 +某 +柑 +柒 +染 +柔 +柘 +柚 +柜 +柞 +柠 +柢 +查 +柩 +柬 +柯 +柱 +柳 +柴 +柵 +査 +柿 +栀 +栃 +栄 +栅 +标 +栈 +栉 +栋 +栎 +栏 +树 +栓 +栖 +栗 +校 +栩 +株 +样 +核 +根 +格 +栽 +栾 +桀 +桁 +桂 +桃 +桅 +框 +案 +桉 +桌 +桎 +桐 +桑 +桓 +桔 +桜 +桠 +桡 +桢 +档 +桥 +桦 +桧 +桨 +桩 +桶 +桿 +梁 +梅 +梆 +梏 +梓 +梗 +條 +梟 +梢 +梦 +梧 +梨 +梭 +梯 +械 +梳 +梵 +梶 +检 +棂 +棄 +棉 +棋 +棍 +棒 +棕 +棗 +棘 +棚 +棟 +棠 +棣 +棧 +森 +棱 +棲 +棵 +棹 +棺 +椁 +椅 +椋 +植 +椎 +椒 +検 +椪 +椭 +椰 +椹 +椽 +椿 +楂 +楊 +楓 +楔 +楚 +楝 +楞 +楠 +楣 +楨 +楫 +業 +楮 +極 +楷 +楸 +楹 +楼 +楽 +概 +榄 +榆 +榈 +榉 +榔 +榕 +榖 +榛 +榜 +榨 +榫 +榭 +榮 +榱 +榴 +榷 +榻 +槁 +槃 +構 +槌 +槍 +槎 +槐 +槓 +様 +槛 +槟 +槤 +槭 +槲 +槳 +槻 +槽 +槿 +樁 +樂 +樊 +樑 +樓 +標 +樞 +樟 +模 +樣 +権 +横 +樫 +樯 +樱 +樵 +樸 +樹 +樺 +樽 +樾 +橄 +橇 +橋 +橐 +橘 +橙 +機 +橡 +橢 +橫 +橱 +橹 +橼 +檀 +檄 +檎 +檐 +檔 +檗 +檜 +檢 +檬 +檯 +檳 +檸 +檻 +櫃 +櫚 +櫛 +櫥 +櫸 +櫻 +欄 +權 +欒 +欖 +欠 +次 +欢 +欣 +欧 +欲 +欸 +欺 +欽 +款 +歆 +歇 +歉 +歌 +歎 +歐 +歓 +歙 +歛 +歡 +止 +正 +此 +步 +武 +歧 +歩 +歪 +歯 +歲 +歳 +歴 +歷 +歸 +歹 +死 +歼 +殁 +殃 +殆 +殇 +殉 +殊 +残 +殒 +殓 +殖 +殘 +殞 +殡 +殤 +殭 +殯 +殲 +殴 +段 +殷 +殺 +殼 +殿 +毀 +毁 +毂 +毅 +毆 +毋 +母 +毎 +每 +毒 +毓 +比 +毕 +毗 +毘 +毙 +毛 +毡 +毫 +毯 +毽 +氈 +氏 +氐 +民 +氓 +气 +氖 +気 +氙 +氛 +氟 +氡 +氢 +氣 +氤 +氦 +氧 +氨 +氪 +氫 +氮 +氯 +氰 +氲 +水 +氷 +永 +氹 +氾 +汀 +汁 +求 +汆 +汇 +汉 +汎 +汐 +汕 +汗 +汙 +汛 +汝 +汞 +江 +池 +污 +汤 +汨 +汩 +汪 +汰 +汲 +汴 +汶 +汹 +決 +汽 +汾 +沁 +沂 +沃 +沅 +沈 +沉 +沌 +沏 +沐 +沒 +沓 +沖 +沙 +沛 +沟 +没 +沢 +沣 +沥 +沦 +沧 +沪 +沫 +沭 +沮 +沱 +河 +沸 +油 +治 +沼 +沽 +沾 +沿 +況 +泄 +泉 +泊 +泌 +泓 +法 +泗 +泛 +泞 +泠 +泡 +波 +泣 +泥 +注 +泪 +泫 +泮 +泯 +泰 +泱 +泳 +泵 +泷 +泸 +泻 +泼 +泽 +泾 +洁 +洄 +洋 +洒 +洗 +洙 +洛 +洞 +津 +洩 +洪 +洮 +洱 +洲 +洵 +洶 +洸 +洹 +活 +洼 +洽 +派 +流 +浃 +浄 +浅 +浆 +浇 +浊 +测 +济 +浏 +浑 +浒 +浓 +浔 +浙 +浚 +浜 +浣 +浦 +浩 +浪 +浬 +浮 +浯 +浴 +海 +浸 +涂 +涅 +涇 +消 +涉 +涌 +涎 +涓 +涔 +涕 +涙 +涛 +涝 +涞 +涟 +涠 +涡 +涣 +涤 +润 +涧 +涨 +涩 +涪 +涮 +涯 +液 +涵 +涸 +涼 +涿 +淀 +淄 +淅 
+淆 +淇 +淋 +淌 +淑 +淒 +淖 +淘 +淙 +淚 +淞 +淡 +淤 +淦 +淨 +淩 +淪 +淫 +淬 +淮 +深 +淳 +淵 +混 +淹 +淺 +添 +淼 +清 +済 +渉 +渊 +渋 +渍 +渎 +渐 +渔 +渗 +渙 +渚 +減 +渝 +渠 +渡 +渣 +渤 +渥 +渦 +温 +測 +渭 +港 +渲 +渴 +游 +渺 +渾 +湃 +湄 +湊 +湍 +湖 +湘 +湛 +湟 +湧 +湫 +湮 +湯 +湳 +湾 +湿 +満 +溃 +溅 +溉 +溏 +源 +準 +溜 +溝 +溟 +溢 +溥 +溧 +溪 +溫 +溯 +溱 +溴 +溶 +溺 +溼 +滁 +滂 +滄 +滅 +滇 +滋 +滌 +滑 +滓 +滔 +滕 +滙 +滚 +滝 +滞 +滟 +满 +滢 +滤 +滥 +滦 +滨 +滩 +滬 +滯 +滲 +滴 +滷 +滸 +滾 +滿 +漁 +漂 +漆 +漉 +漏 +漓 +演 +漕 +漠 +漢 +漣 +漩 +漪 +漫 +漬 +漯 +漱 +漲 +漳 +漸 +漾 +漿 +潆 +潇 +潋 +潍 +潑 +潔 +潘 +潛 +潜 +潞 +潟 +潢 +潤 +潦 +潧 +潭 +潮 +潰 +潴 +潸 +潺 +潼 +澀 +澄 +澆 +澈 +澍 +澎 +澗 +澜 +澡 +澤 +澧 +澱 +澳 +澹 +激 +濁 +濂 +濃 +濑 +濒 +濕 +濘 +濛 +濟 +濠 +濡 +濤 +濫 +濬 +濮 +濯 +濱 +濺 +濾 +瀅 +瀆 +瀉 +瀋 +瀏 +瀑 +瀕 +瀘 +瀚 +瀛 +瀝 +瀞 +瀟 +瀧 +瀨 +瀬 +瀰 +瀾 +灌 +灏 +灑 +灘 +灝 +灞 +灣 +火 +灬 +灭 +灯 +灰 +灵 +灶 +灸 +灼 +災 +灾 +灿 +炀 +炁 +炅 +炉 +炊 +炎 +炒 +炔 +炕 +炖 +炙 +炜 +炫 +炬 +炭 +炮 +炯 +炳 +炷 +炸 +点 +為 +炼 +炽 +烁 +烂 +烃 +烈 +烊 +烏 +烘 +烙 +烛 +烟 +烤 +烦 +烧 +烨 +烩 +烫 +烬 +热 +烯 +烷 +烹 +烽 +焉 +焊 +焕 +焖 +焗 +焘 +焙 +焚 +焜 +無 +焦 +焯 +焰 +焱 +然 +焼 +煅 +煉 +煊 +煌 +煎 +煒 +煖 +煙 +煜 +煞 +煤 +煥 +煦 +照 +煨 +煩 +煮 +煲 +煸 +煽 +熄 +熊 +熏 +熒 +熔 +熙 +熟 +熠 +熨 +熬 +熱 +熵 +熹 +熾 +燁 +燃 +燄 +燈 +燉 +燊 +燎 +燒 +燔 +燕 +燙 +燜 +營 +燥 +燦 +燧 +燭 +燮 +燴 +燻 +燼 +燿 +爆 +爍 +爐 +爛 +爪 +爬 +爭 +爰 +爱 +爲 +爵 +父 +爷 +爸 +爹 +爺 +爻 +爽 +爾 +牆 +片 +版 +牌 +牍 +牒 +牙 +牛 +牝 +牟 +牠 +牡 +牢 +牦 +牧 +物 +牯 +牲 +牴 +牵 +特 +牺 +牽 +犀 +犁 +犄 +犊 +犍 +犒 +犢 +犧 +犬 +犯 +状 +犷 +犸 +犹 +狀 +狂 +狄 +狈 +狎 +狐 +狒 +狗 +狙 +狞 +狠 +狡 +狩 +独 +狭 +狮 +狰 +狱 +狸 +狹 +狼 +狽 +猎 +猕 +猖 +猗 +猙 +猛 +猜 +猝 +猥 +猩 +猪 +猫 +猬 +献 +猴 +猶 +猷 +猾 +猿 +獄 +獅 +獎 +獐 +獒 +獗 +獠 +獣 +獨 +獭 +獰 +獲 +獵 +獷 +獸 +獺 +獻 +獼 +獾 +玄 +率 +玉 +王 +玑 +玖 +玛 +玟 +玠 +玥 +玩 +玫 +玮 +环 +现 +玲 +玳 +玷 +玺 +玻 +珀 +珂 +珅 +珈 +珉 +珊 +珍 +珏 +珐 +珑 +珙 +珞 +珠 +珣 +珥 +珩 +珪 +班 +珮 +珲 +珺 +現 +球 +琅 +理 +琇 +琉 +琊 +琍 +琏 +琐 +琛 +琢 +琥 +琦 +琨 +琪 +琬 +琮 +琰 +琲 +琳 +琴 +琵 +琶 +琺 +琼 +瑀 +瑁 +瑄 +瑋 +瑕 +瑗 +瑙 +瑚 +瑛 +瑜 +瑞 +瑟 +瑠 +瑣 +瑤 +瑩 +瑪 +瑯 +瑰 +瑶 +瑾 +璀 +璁 +璃 +璇 +璉 +璋 +璎 +璐 +璜 +璞 +璟 +璧 +璨 +環 +璽 +璿 +瓊 +瓏 +瓒 +瓜 +瓢 +瓣 +瓤 +瓦 +瓮 +瓯 +瓴 +瓶 +瓷 +甄 +甌 +甕 +甘 +甙 +甚 +甜 +生 +產 +産 +甥 +甦 +用 +甩 +甫 +甬 +甭 +甯 +田 +由 +甲 +申 +电 +男 +甸 +町 +画 +甾 +畀 +畅 +界 +畏 +畑 +畔 +留 +畜 +畝 +畢 +略 +畦 +番 +畫 +異 +畲 +畳 +畴 +當 +畸 +畹 +畿 +疆 +疇 +疊 +疏 +疑 +疔 +疖 +疗 +疙 +疚 +疝 +疟 +疡 +疣 +疤 +疥 +疫 +疮 +疯 +疱 +疲 +疳 +疵 +疸 +疹 +疼 +疽 +疾 +痂 +病 +症 +痈 +痉 +痊 +痍 +痒 +痔 +痕 +痘 +痙 +痛 +痞 +痠 +痢 +痣 +痤 +痧 +痨 +痪 +痫 +痰 +痱 +痴 +痹 +痺 +痼 +痿 +瘀 +瘁 +瘋 +瘍 +瘓 +瘘 +瘙 +瘟 +瘠 +瘡 +瘢 +瘤 +瘦 +瘧 +瘩 +瘪 +瘫 +瘴 +瘸 +瘾 +療 +癇 +癌 +癒 +癖 +癜 +癞 +癡 +癢 +癣 +癥 +癫 +癬 +癮 +癱 +癲 +癸 +発 +登 +發 +白 +百 +皂 +的 +皆 +皇 +皈 +皋 +皎 +皑 +皓 +皖 +皙 +皚 +皮 +皰 +皱 +皴 +皺 +皿 +盂 +盃 +盅 +盆 +盈 +益 +盎 +盏 +盐 +监 +盒 +盔 +盖 +盗 +盘 +盛 +盜 +盞 +盟 +盡 +監 +盤 +盥 +盧 +盪 +目 +盯 +盱 +盲 +直 +相 +盹 +盼 +盾 +省 +眈 +眉 +看 +県 +眙 +眞 +真 +眠 +眦 +眨 +眩 +眯 +眶 +眷 +眸 +眺 +眼 +眾 +着 +睁 +睇 +睏 +睐 +睑 +睛 +睜 +睞 +睡 +睢 +督 +睥 +睦 +睨 +睪 +睫 +睬 +睹 +睽 +睾 +睿 +瞄 +瞅 +瞇 +瞋 +瞌 +瞎 +瞑 +瞒 +瞓 +瞞 +瞟 +瞠 +瞥 +瞧 +瞩 +瞪 +瞬 +瞭 +瞰 +瞳 +瞻 +瞼 +瞿 +矇 +矍 +矗 +矚 +矛 +矜 +矢 +矣 +知 +矩 +矫 +短 +矮 +矯 +石 +矶 +矽 +矾 +矿 +码 +砂 +砌 +砍 +砒 +研 +砖 +砗 +砚 +砝 +砣 +砥 +砧 +砭 +砰 +砲 +破 +砷 +砸 +砺 +砼 +砾 +础 +硅 +硐 +硒 +硕 +硝 +硫 +硬 +确 +硯 +硼 +碁 +碇 +碉 +碌 +碍 +碎 +碑 +碓 +碗 +碘 +碚 +碛 +碟 +碣 +碧 +碩 +碰 +碱 +碳 +碴 +確 +碼 +碾 +磁 +磅 +磊 +磋 +磐 +磕 +磚 +磡 +磨 +磬 +磯 +磲 +磷 +磺 +礁 +礎 +礙 +礡 +礦 +礪 +礫 +礴 +示 +礼 +社 +祀 +祁 +祂 +祇 +祈 +祉 +祎 +祐 +祕 +祖 +祗 +祚 +祛 +祜 +祝 +神 +祟 +祠 +祢 +祥 +票 +祭 +祯 +祷 +祸 +祺 +祿 +禀 +禁 +禄 +禅 +禍 +禎 +福 +禛 +禦 +禧 +禪 +禮 +禱 +禹 +禺 +离 +禽 +禾 +禿 +秀 +私 +秃 +秆 +秉 +秋 +种 +科 +秒 +秘 +租 +秣 +秤 +秦 +秧 +秩 +秭 +积 +称 +秸 +移 +秽 +稀 +稅 +程 +稍 +税 +稔 +稗 +稚 +稜 +稞 +稟 +稠 +稣 +種 +稱 +稲 +稳 +稷 +稹 +稻 +稼 +稽 +稿 +穀 +穂 +穆 +穌 +積 +穎 +穗 +穢 +穩 +穫 +穴 +究 +穷 +穹 +空 +穿 +突 +窃 +窄 +窈 +窍 +窑 +窒 +窓 +窕 +窖 +窗 +窘 +窜 +窝 +窟 +窠 +窥 +窦 +窨 +窩 +窪 +窮 +窯 +窺 +窿 +竄 +竅 +竇 +竊 +立 +竖 +站 +竜 +竞 +竟 +章 +竣 +童 +竭 +端 +競 +竹 +竺 +竽 +竿 +笃 +笆 +笈 +笋 +笏 +笑 +笔 +笙 +笛 +笞 +笠 +符 +笨 +第 +笹 +笺 +笼 +筆 +等 +筊 +筋 +筍 +筏 +筐 +筑 +筒 +答 +策 +筛 +筝 +筠 +筱 +筲 +筵 +筷 +筹 +签 +简 +箇 +箋 +箍 +箏 +箐 +箔 +箕 +算 +箝 +管 +箩 +箫 +箭 +箱 +箴 +箸 +節 +篁 +範 +篆 +篇 +築 +篑 +篓 +篙 +篝 +篠 +篡 +篤 +篩 +篪 +篮 +篱 +篷 +簇 +簌 +簍 +簡 +簦 +簧 
+簪 +簫 +簷 +簸 +簽 +簾 +簿 +籁 +籃 +籌 +籍 +籐 +籟 +籠 +籤 +籬 +籮 +籲 +米 +类 +籼 +籽 +粄 +粉 +粑 +粒 +粕 +粗 +粘 +粟 +粤 +粥 +粧 +粪 +粮 +粱 +粲 +粳 +粵 +粹 +粼 +粽 +精 +粿 +糅 +糊 +糍 +糕 +糖 +糗 +糙 +糜 +糞 +糟 +糠 +糧 +糬 +糯 +糰 +糸 +系 +糾 +紀 +紂 +約 +紅 +紉 +紊 +紋 +納 +紐 +紓 +純 +紗 +紘 +紙 +級 +紛 +紜 +素 +紡 +索 +紧 +紫 +紮 +累 +細 +紳 +紹 +紺 +終 +絃 +組 +絆 +経 +結 +絕 +絞 +絡 +絢 +給 +絨 +絮 +統 +絲 +絳 +絵 +絶 +絹 +綁 +綏 +綑 +經 +継 +続 +綜 +綠 +綢 +綦 +綫 +綬 +維 +綱 +網 +綴 +綵 +綸 +綺 +綻 +綽 +綾 +綿 +緊 +緋 +総 +緑 +緒 +緘 +線 +緝 +緞 +締 +緣 +編 +緩 +緬 +緯 +練 +緹 +緻 +縁 +縄 +縈 +縛 +縝 +縣 +縫 +縮 +縱 +縴 +縷 +總 +績 +繁 +繃 +繆 +繇 +繋 +織 +繕 +繚 +繞 +繡 +繩 +繪 +繫 +繭 +繳 +繹 +繼 +繽 +纂 +續 +纍 +纏 +纓 +纔 +纖 +纜 +纠 +红 +纣 +纤 +约 +级 +纨 +纪 +纫 +纬 +纭 +纯 +纰 +纱 +纲 +纳 +纵 +纶 +纷 +纸 +纹 +纺 +纽 +纾 +线 +绀 +练 +组 +绅 +细 +织 +终 +绊 +绍 +绎 +经 +绑 +绒 +结 +绔 +绕 +绘 +给 +绚 +绛 +络 +绝 +绞 +统 +绡 +绢 +绣 +绥 +绦 +继 +绩 +绪 +绫 +续 +绮 +绯 +绰 +绳 +维 +绵 +绶 +绷 +绸 +绻 +综 +绽 +绾 +绿 +缀 +缄 +缅 +缆 +缇 +缈 +缉 +缎 +缓 +缔 +缕 +编 +缘 +缙 +缚 +缜 +缝 +缠 +缢 +缤 +缥 +缨 +缩 +缪 +缭 +缮 +缰 +缱 +缴 +缸 +缺 +缽 +罂 +罄 +罌 +罐 +网 +罔 +罕 +罗 +罚 +罡 +罢 +罩 +罪 +置 +罰 +署 +罵 +罷 +罹 +羁 +羅 +羈 +羊 +羌 +美 +羔 +羚 +羞 +羟 +羡 +羣 +群 +羥 +羧 +羨 +義 +羯 +羲 +羸 +羹 +羽 +羿 +翁 +翅 +翊 +翌 +翎 +習 +翔 +翘 +翟 +翠 +翡 +翦 +翩 +翰 +翱 +翳 +翹 +翻 +翼 +耀 +老 +考 +耄 +者 +耆 +耋 +而 +耍 +耐 +耒 +耕 +耗 +耘 +耙 +耦 +耨 +耳 +耶 +耷 +耸 +耻 +耽 +耿 +聂 +聆 +聊 +聋 +职 +聒 +联 +聖 +聘 +聚 +聞 +聪 +聯 +聰 +聲 +聳 +聴 +聶 +職 +聽 +聾 +聿 +肃 +肄 +肅 +肆 +肇 +肉 +肋 +肌 +肏 +肓 +肖 +肘 +肚 +肛 +肝 +肠 +股 +肢 +肤 +肥 +肩 +肪 +肮 +肯 +肱 +育 +肴 +肺 +肽 +肾 +肿 +胀 +胁 +胃 +胄 +胆 +背 +胍 +胎 +胖 +胚 +胛 +胜 +胝 +胞 +胡 +胤 +胥 +胧 +胫 +胭 +胯 +胰 +胱 +胳 +胴 +胶 +胸 +胺 +能 +脂 +脅 +脆 +脇 +脈 +脉 +脊 +脍 +脏 +脐 +脑 +脓 +脖 +脘 +脚 +脛 +脣 +脩 +脫 +脯 +脱 +脲 +脳 +脸 +脹 +脾 +腆 +腈 +腊 +腋 +腌 +腎 +腐 +腑 +腓 +腔 +腕 +腥 +腦 +腩 +腫 +腭 +腮 +腰 +腱 +腳 +腴 +腸 +腹 +腺 +腻 +腼 +腾 +腿 +膀 +膈 +膊 +膏 +膑 +膘 +膚 +膛 +膜 +膝 +膠 +膦 +膨 +膩 +膳 +膺 +膻 +膽 +膾 +膿 +臀 +臂 +臃 +臆 +臉 +臊 +臍 +臓 +臘 +臟 +臣 +臥 +臧 +臨 +自 +臬 +臭 +至 +致 +臺 +臻 +臼 +臾 +舀 +舂 +舅 +舆 +與 +興 +舉 +舊 +舌 +舍 +舎 +舐 +舒 +舔 +舖 +舗 +舛 +舜 +舞 +舟 +航 +舫 +般 +舰 +舱 +舵 +舶 +舷 +舸 +船 +舺 +舾 +艇 +艋 +艘 +艙 +艦 +艮 +良 +艰 +艱 +色 +艳 +艷 +艹 +艺 +艾 +节 +芃 +芈 +芊 +芋 +芍 +芎 +芒 +芙 +芜 +芝 +芡 +芥 +芦 +芩 +芪 +芫 +芬 +芭 +芮 +芯 +花 +芳 +芷 +芸 +芹 +芻 +芽 +芾 +苁 +苄 +苇 +苋 +苍 +苏 +苑 +苒 +苓 +苔 +苕 +苗 +苛 +苜 +苞 +苟 +苡 +苣 +若 +苦 +苫 +苯 +英 +苷 +苹 +苻 +茁 +茂 +范 +茄 +茅 +茉 +茎 +茏 +茗 +茜 +茧 +茨 +茫 +茬 +茭 +茯 +茱 +茲 +茴 +茵 +茶 +茸 +茹 +茼 +荀 +荃 +荆 +草 +荊 +荏 +荐 +荒 +荔 +荖 +荘 +荚 +荞 +荟 +荠 +荡 +荣 +荤 +荥 +荧 +荨 +荪 +荫 +药 +荳 +荷 +荸 +荻 +荼 +荽 +莅 +莆 +莉 +莊 +莎 +莒 +莓 +莖 +莘 +莞 +莠 +莢 +莧 +莪 +莫 +莱 +莲 +莴 +获 +莹 +莺 +莽 +莿 +菀 +菁 +菅 +菇 +菈 +菊 +菌 +菏 +菓 +菖 +菘 +菜 +菟 +菠 +菡 +菩 +華 +菱 +菲 +菸 +菽 +萁 +萃 +萄 +萊 +萋 +萌 +萍 +萎 +萘 +萝 +萤 +营 +萦 +萧 +萨 +萩 +萬 +萱 +萵 +萸 +萼 +落 +葆 +葉 +著 +葚 +葛 +葡 +董 +葦 +葩 +葫 +葬 +葭 +葯 +葱 +葳 +葵 +葷 +葺 +蒂 +蒋 +蒐 +蒔 +蒙 +蒜 +蒞 +蒟 +蒡 +蒨 +蒲 +蒸 +蒹 +蒻 +蒼 +蒿 +蓁 +蓄 +蓆 +蓉 +蓋 +蓑 +蓓 +蓖 +蓝 +蓟 +蓦 +蓬 +蓮 +蓼 +蓿 +蔑 +蔓 +蔔 +蔗 +蔘 +蔚 +蔡 +蔣 +蔥 +蔫 +蔬 +蔭 +蔵 +蔷 +蔺 +蔻 +蔼 +蔽 +蕁 +蕃 +蕈 +蕉 +蕊 +蕎 +蕙 +蕤 +蕨 +蕩 +蕪 +蕭 +蕲 +蕴 +蕻 +蕾 +薄 +薅 +薇 +薈 +薊 +薏 +薑 +薔 +薙 +薛 +薦 +薨 +薩 +薪 +薬 +薯 +薰 +薹 +藉 +藍 +藏 +藐 +藓 +藕 +藜 +藝 +藤 +藥 +藩 +藹 +藻 +藿 +蘆 +蘇 +蘊 +蘋 +蘑 +蘚 +蘭 +蘸 +蘼 +蘿 +虎 +虏 +虐 +虑 +虔 +處 +虚 +虛 +虜 +虞 +號 +虢 +虧 +虫 +虬 +虱 +虹 +虻 +虽 +虾 +蚀 +蚁 +蚂 +蚊 +蚌 +蚓 +蚕 +蚜 +蚝 +蚣 +蚤 +蚩 +蚪 +蚯 +蚱 +蚵 +蛀 +蛆 +蛇 +蛊 +蛋 +蛎 +蛐 +蛔 +蛙 +蛛 +蛟 +蛤 +蛭 +蛮 +蛰 +蛳 +蛹 +蛻 +蛾 +蜀 +蜂 +蜃 +蜆 +蜇 +蜈 +蜊 +蜍 +蜒 +蜓 +蜕 +蜗 +蜘 +蜚 +蜜 +蜡 +蜢 +蜥 +蜱 +蜴 +蜷 +蜻 +蜿 +蝇 +蝈 +蝉 +蝌 +蝎 +蝕 +蝗 +蝙 +蝟 +蝠 +蝦 +蝨 +蝴 +蝶 +蝸 +蝼 +螂 +螃 +融 +螞 +螢 +螨 +螯 +螳 +螺 +蟀 +蟄 +蟆 +蟋 +蟎 +蟑 +蟒 +蟠 +蟬 +蟲 +蟹 +蟻 +蟾 +蠅 +蠍 +蠔 +蠕 +蠛 +蠟 +蠡 +蠢 +蠣 +蠱 +蠶 +蠹 +蠻 +血 +衄 +衅 +衆 +行 +衍 +術 +衔 +街 +衙 +衛 +衝 +衞 +衡 +衢 +衣 +补 +表 +衩 +衫 +衬 +衮 +衰 +衲 +衷 +衹 +衾 +衿 +袁 +袂 +袄 +袅 +袈 +袋 +袍 +袒 +袖 +袜 +袞 +袤 +袪 +被 +袭 +袱 +裁 +裂 +装 +裆 +裊 +裏 +裔 +裕 +裘 +裙 +補 +裝 +裟 +裡 +裤 +裨 +裱 +裳 +裴 +裸 +裹 +製 +裾 +褂 +複 +褐 +褒 +褓 +褔 +褚 +褥 +褪 +褫 +褲 +褶 +褻 +襁 +襄 +襟 +襠 +襪 +襬 +襯 +襲 +西 +要 +覃 +覆 +覇 +見 +規 +覓 +視 +覚 +覦 +覧 +親 +覬 +観 +覷 +覺 +覽 +觀 +见 +观 +规 +觅 +视 +览 +觉 +觊 +觎 +觐 +觑 +角 +觞 +解 +觥 +触 +觸 +言 +訂 +計 +訊 +討 +訓 +訕 +訖 +託 +記 +訛 +訝 +訟 +訣 +訥 +訪 +設 +許 +訳 +訴 +訶 +診 +註 +証 +詆 +詐 +詔 
+評 +詛 +詞 +詠 +詡 +詢 +詣 +試 +詩 +詫 +詬 +詭 +詮 +詰 +話 +該 +詳 +詹 +詼 +誅 +誇 +誉 +誌 +認 +誓 +誕 +誘 +語 +誠 +誡 +誣 +誤 +誥 +誦 +誨 +說 +説 +読 +誰 +課 +誹 +誼 +調 +諄 +談 +請 +諏 +諒 +論 +諗 +諜 +諡 +諦 +諧 +諫 +諭 +諮 +諱 +諳 +諷 +諸 +諺 +諾 +謀 +謁 +謂 +謄 +謊 +謎 +謐 +謔 +謗 +謙 +講 +謝 +謠 +謨 +謬 +謹 +謾 +譁 +證 +譎 +譏 +識 +譙 +譚 +譜 +警 +譬 +譯 +議 +譲 +譴 +護 +譽 +讀 +變 +讓 +讚 +讞 +计 +订 +认 +讥 +讧 +讨 +让 +讪 +讫 +训 +议 +讯 +记 +讲 +讳 +讴 +讶 +讷 +许 +讹 +论 +讼 +讽 +设 +访 +诀 +证 +诃 +评 +诅 +识 +诈 +诉 +诊 +诋 +词 +诏 +译 +试 +诗 +诘 +诙 +诚 +诛 +话 +诞 +诟 +诠 +诡 +询 +诣 +诤 +该 +详 +诧 +诩 +诫 +诬 +语 +误 +诰 +诱 +诲 +说 +诵 +诶 +请 +诸 +诺 +读 +诽 +课 +诿 +谀 +谁 +调 +谄 +谅 +谆 +谈 +谊 +谋 +谌 +谍 +谎 +谏 +谐 +谑 +谒 +谓 +谔 +谕 +谗 +谘 +谙 +谚 +谛 +谜 +谟 +谢 +谣 +谤 +谥 +谦 +谧 +谨 +谩 +谪 +谬 +谭 +谯 +谱 +谲 +谴 +谶 +谷 +豁 +豆 +豇 +豈 +豉 +豊 +豌 +豎 +豐 +豔 +豚 +象 +豢 +豪 +豫 +豬 +豹 +豺 +貂 +貅 +貌 +貓 +貔 +貘 +貝 +貞 +負 +財 +貢 +貧 +貨 +販 +貪 +貫 +責 +貯 +貰 +貳 +貴 +貶 +買 +貸 +費 +貼 +貽 +貿 +賀 +賁 +賂 +賃 +賄 +資 +賈 +賊 +賑 +賓 +賜 +賞 +賠 +賡 +賢 +賣 +賤 +賦 +質 +賬 +賭 +賴 +賺 +購 +賽 +贅 +贈 +贊 +贍 +贏 +贓 +贖 +贛 +贝 +贞 +负 +贡 +财 +责 +贤 +败 +账 +货 +质 +贩 +贪 +贫 +贬 +购 +贮 +贯 +贰 +贱 +贲 +贴 +贵 +贷 +贸 +费 +贺 +贻 +贼 +贾 +贿 +赁 +赂 +赃 +资 +赅 +赈 +赊 +赋 +赌 +赎 +赏 +赐 +赓 +赔 +赖 +赘 +赚 +赛 +赝 +赞 +赠 +赡 +赢 +赣 +赤 +赦 +赧 +赫 +赭 +走 +赳 +赴 +赵 +赶 +起 +趁 +超 +越 +趋 +趕 +趙 +趟 +趣 +趨 +足 +趴 +趵 +趸 +趺 +趾 +跃 +跄 +跆 +跋 +跌 +跎 +跑 +跖 +跚 +跛 +距 +跟 +跡 +跤 +跨 +跩 +跪 +路 +跳 +践 +跷 +跹 +跺 +跻 +踉 +踊 +踌 +踏 +踐 +踝 +踞 +踟 +踢 +踩 +踪 +踮 +踱 +踴 +踵 +踹 +蹂 +蹄 +蹇 +蹈 +蹉 +蹊 +蹋 +蹑 +蹒 +蹙 +蹟 +蹣 +蹤 +蹦 +蹩 +蹬 +蹭 +蹲 +蹴 +蹶 +蹺 +蹼 +蹿 +躁 +躇 +躉 +躊 +躋 +躍 +躏 +躪 +身 +躬 +躯 +躲 +躺 +軀 +車 +軋 +軌 +軍 +軒 +軟 +転 +軸 +軼 +軽 +軾 +較 +載 +輒 +輓 +輔 +輕 +輛 +輝 +輟 +輩 +輪 +輯 +輸 +輻 +輾 +輿 +轄 +轅 +轆 +轉 +轍 +轎 +轟 +车 +轧 +轨 +轩 +转 +轭 +轮 +软 +轰 +轲 +轴 +轶 +轻 +轼 +载 +轿 +较 +辄 +辅 +辆 +辇 +辈 +辉 +辊 +辍 +辐 +辑 +输 +辕 +辖 +辗 +辘 +辙 +辛 +辜 +辞 +辟 +辣 +辦 +辨 +辩 +辫 +辭 +辮 +辯 +辰 +辱 +農 +边 +辺 +辻 +込 +辽 +达 +迁 +迂 +迄 +迅 +过 +迈 +迎 +运 +近 +返 +还 +这 +进 +远 +违 +连 +迟 +迢 +迤 +迥 +迦 +迩 +迪 +迫 +迭 +述 +迴 +迷 +迸 +迹 +迺 +追 +退 +送 +适 +逃 +逅 +逆 +选 +逊 +逍 +透 +逐 +递 +途 +逕 +逗 +這 +通 +逛 +逝 +逞 +速 +造 +逢 +連 +逮 +週 +進 +逵 +逶 +逸 +逻 +逼 +逾 +遁 +遂 +遅 +遇 +遊 +運 +遍 +過 +遏 +遐 +遑 +遒 +道 +達 +違 +遗 +遙 +遛 +遜 +遞 +遠 +遢 +遣 +遥 +遨 +適 +遭 +遮 +遲 +遴 +遵 +遶 +遷 +選 +遺 +遼 +遽 +避 +邀 +邁 +邂 +邃 +還 +邇 +邈 +邊 +邋 +邏 +邑 +邓 +邕 +邛 +邝 +邢 +那 +邦 +邨 +邪 +邬 +邮 +邯 +邰 +邱 +邳 +邵 +邸 +邹 +邺 +邻 +郁 +郅 +郊 +郎 +郑 +郜 +郝 +郡 +郢 +郤 +郦 +郧 +部 +郫 +郭 +郴 +郵 +郷 +郸 +都 +鄂 +鄉 +鄒 +鄔 +鄙 +鄞 +鄢 +鄧 +鄭 +鄰 +鄱 +鄲 +鄺 +酉 +酊 +酋 +酌 +配 +酐 +酒 +酗 +酚 +酝 +酢 +酣 +酥 +酩 +酪 +酬 +酮 +酯 +酰 +酱 +酵 +酶 +酷 +酸 +酿 +醃 +醇 +醉 +醋 +醍 +醐 +醒 +醚 +醛 +醜 +醞 +醣 +醪 +醫 +醬 +醮 +醯 +醴 +醺 +釀 +釁 +采 +釉 +释 +釋 +里 +重 +野 +量 +釐 +金 +釗 +釘 +釜 +針 +釣 +釦 +釧 +釵 +鈀 +鈉 +鈍 +鈎 +鈔 +鈕 +鈞 +鈣 +鈦 +鈪 +鈴 +鈺 +鈾 +鉀 +鉄 +鉅 +鉉 +鉑 +鉗 +鉚 +鉛 +鉤 +鉴 +鉻 +銀 +銃 +銅 +銑 +銓 +銖 +銘 +銜 +銬 +銭 +銮 +銳 +銷 +銹 +鋁 +鋅 +鋒 +鋤 +鋪 +鋰 +鋸 +鋼 +錄 +錐 +錘 +錚 +錠 +錢 +錦 +錨 +錫 +錮 +錯 +録 +錳 +錶 +鍊 +鍋 +鍍 +鍛 +鍥 +鍰 +鍵 +鍺 +鍾 +鎂 +鎊 +鎌 +鎏 +鎔 +鎖 +鎗 +鎚 +鎧 +鎬 +鎮 +鎳 +鏈 +鏖 +鏗 +鏘 +鏞 +鏟 +鏡 +鏢 +鏤 +鏽 +鐘 +鐮 +鐲 +鐳 +鐵 +鐸 +鐺 +鑄 +鑊 +鑑 +鑒 +鑣 +鑫 +鑰 +鑲 +鑼 +鑽 +鑾 +鑿 +针 +钉 +钊 +钎 +钏 +钒 +钓 +钗 +钙 +钛 +钜 +钝 +钞 +钟 +钠 +钡 +钢 +钣 +钤 +钥 +钦 +钧 +钨 +钩 +钮 +钯 +钰 +钱 +钳 +钴 +钵 +钺 +钻 +钼 +钾 +钿 +铀 +铁 +铂 +铃 +铄 +铅 +铆 +铉 +铎 +铐 +铛 +铜 +铝 +铠 +铡 +铢 +铣 +铤 +铨 +铩 +铬 +铭 +铮 +铰 +铲 +铵 +银 +铸 +铺 +链 +铿 +销 +锁 +锂 +锄 +锅 +锆 +锈 +锉 +锋 +锌 +锏 +锐 +锑 +错 +锚 +锟 +锡 +锢 +锣 +锤 +锥 +锦 +锭 +键 +锯 +锰 +锲 +锵 +锹 +锺 +锻 +镀 +镁 +镂 +镇 +镉 +镌 +镍 +镐 +镑 +镕 +镖 +镗 +镛 +镜 +镣 +镭 +镯 +镰 +镳 +镶 +長 +长 +門 +閃 +閉 +開 +閎 +閏 +閑 +閒 +間 +閔 +閘 +閡 +関 +閣 +閥 +閨 +閩 +閱 +閲 +閹 +閻 +閾 +闆 +闇 +闊 +闌 +闍 +闔 +闕 +闖 +闘 +關 +闡 +闢 +门 +闪 +闫 +闭 +问 +闯 +闰 +闲 +间 +闵 +闷 +闸 +闹 +闺 +闻 +闽 +闾 +阀 +阁 +阂 +阅 +阆 +阇 +阈 +阉 +阎 +阐 +阑 +阔 +阕 +阖 +阙 +阚 +阜 +队 +阡 +阪 +阮 +阱 +防 +阳 +阴 +阵 +阶 +阻 +阿 +陀 +陂 +附 +际 +陆 +陇 +陈 +陋 +陌 +降 +限 +陕 +陛 +陝 +陞 +陟 +陡 +院 +陣 +除 +陨 +险 +陪 +陰 +陲 +陳 +陵 +陶 +陷 +陸 +険 +陽 +隅 +隆 +隈 +隊 +隋 +隍 +階 +随 +隐 +隔 +隕 +隘 +隙 +際 +障 +隠 +隣 +隧 +隨 +險 +隱 +隴 +隶 +隸 +隻 +隼 +隽 +难 +雀 +雁 +雄 +雅 +集 +雇 +雉 +雋 +雌 +雍 +雎 +雏 +雑 +雒 +雕 +雖 +雙 +雛 +雜 +雞 +離 +難 +雨 +雪 +雯 +雰 +雲 +雳 +零 +雷 +雹 +電 +雾 +需 +霁 +霄 +霆 +震 +霈 +霉 +霊 +霍 
+霎 +霏 +霑 +霓 +霖 +霜 +霞 +霧 +霭 +霰 +露 +霸 +霹 +霽 +霾 +靂 +靄 +靈 +青 +靓 +靖 +静 +靚 +靛 +靜 +非 +靠 +靡 +面 +靥 +靦 +革 +靳 +靴 +靶 +靼 +鞅 +鞋 +鞍 +鞏 +鞑 +鞘 +鞠 +鞣 +鞦 +鞭 +韆 +韋 +韌 +韓 +韜 +韦 +韧 +韩 +韬 +韭 +音 +韵 +韶 +韻 +響 +頁 +頂 +頃 +項 +順 +須 +頌 +預 +頑 +頒 +頓 +頗 +領 +頜 +頡 +頤 +頫 +頭 +頰 +頷 +頸 +頹 +頻 +頼 +顆 +題 +額 +顎 +顏 +顔 +願 +顛 +類 +顧 +顫 +顯 +顱 +顴 +页 +顶 +顷 +项 +顺 +须 +顼 +顽 +顾 +顿 +颁 +颂 +预 +颅 +领 +颇 +颈 +颉 +颊 +颌 +颍 +颐 +频 +颓 +颔 +颖 +颗 +题 +颚 +颛 +颜 +额 +颞 +颠 +颡 +颢 +颤 +颦 +颧 +風 +颯 +颱 +颳 +颶 +颼 +飄 +飆 +风 +飒 +飓 +飕 +飘 +飙 +飚 +飛 +飞 +食 +飢 +飨 +飩 +飪 +飯 +飲 +飼 +飽 +飾 +餃 +餅 +餉 +養 +餌 +餐 +餒 +餓 +餘 +餚 +餛 +餞 +餡 +館 +餮 +餵 +餾 +饅 +饈 +饋 +饌 +饍 +饑 +饒 +饕 +饗 +饞 +饥 +饨 +饪 +饬 +饭 +饮 +饯 +饰 +饱 +饲 +饴 +饵 +饶 +饷 +饺 +饼 +饽 +饿 +馀 +馁 +馄 +馅 +馆 +馈 +馋 +馍 +馏 +馒 +馔 +首 +馗 +香 +馥 +馨 +馬 +馭 +馮 +馳 +馴 +駁 +駄 +駅 +駆 +駐 +駒 +駕 +駛 +駝 +駭 +駱 +駿 +騁 +騎 +騏 +験 +騙 +騨 +騰 +騷 +驀 +驅 +驊 +驍 +驒 +驕 +驗 +驚 +驛 +驟 +驢 +驥 +马 +驭 +驮 +驯 +驰 +驱 +驳 +驴 +驶 +驷 +驸 +驹 +驻 +驼 +驾 +驿 +骁 +骂 +骄 +骅 +骆 +骇 +骈 +骊 +骋 +验 +骏 +骐 +骑 +骗 +骚 +骛 +骜 +骞 +骠 +骡 +骤 +骥 +骧 +骨 +骯 +骰 +骶 +骷 +骸 +骼 +髂 +髅 +髋 +髏 +髒 +髓 +體 +髖 +高 +髦 +髪 +髮 +髯 +髻 +鬃 +鬆 +鬍 +鬓 +鬚 +鬟 +鬢 +鬣 +鬥 +鬧 +鬱 +鬼 +魁 +魂 +魄 +魅 +魇 +魍 +魏 +魔 +魘 +魚 +魯 +魷 +鮑 +鮨 +鮪 +鮭 +鮮 +鯉 +鯊 +鯖 +鯛 +鯨 +鯰 +鯽 +鰍 +鰓 +鰭 +鰲 +鰻 +鰾 +鱈 +鱉 +鱔 +鱗 +鱷 +鱸 +鱼 +鱿 +鲁 +鲈 +鲍 +鲑 +鲛 +鲜 +鲟 +鲢 +鲤 +鲨 +鲫 +鲱 +鲲 +鲶 +鲷 +鲸 +鳃 +鳄 +鳅 +鳌 +鳍 +鳕 +鳖 +鳗 +鳝 +鳞 +鳥 +鳩 +鳳 +鳴 +鳶 +鴉 +鴕 +鴛 +鴦 +鴨 +鴻 +鴿 +鵑 +鵜 +鵝 +鵡 +鵬 +鵰 +鵲 +鶘 +鶩 +鶯 +鶴 +鷗 +鷲 +鷹 +鷺 +鸚 +鸞 +鸟 +鸠 +鸡 +鸢 +鸣 +鸥 +鸦 +鸨 +鸪 +鸭 +鸯 +鸳 +鸵 +鸽 +鸾 +鸿 +鹂 +鹃 +鹄 +鹅 +鹈 +鹉 +鹊 +鹌 +鹏 +鹑 +鹕 +鹘 +鹜 +鹞 +鹤 +鹦 +鹧 +鹫 +鹭 +鹰 +鹳 +鹵 +鹹 +鹼 +鹽 +鹿 +麂 +麋 +麒 +麓 +麗 +麝 +麟 +麥 +麦 +麩 +麴 +麵 +麸 +麺 +麻 +麼 +麽 +麾 +黃 +黄 +黍 +黎 +黏 +黑 +黒 +黔 +默 +黛 +黜 +黝 +點 +黠 +黨 +黯 +黴 +鼋 +鼎 +鼐 +鼓 +鼠 +鼬 +鼹 +鼻 +鼾 +齁 +齊 +齋 +齐 +齒 +齡 +齢 +齣 +齦 +齿 +龄 +龅 +龈 +龊 +龋 +龌 +龍 +龐 +龔 +龕 +龙 +龚 +龛 +龜 +龟 +︰ +︱ +︶ +︿ +﹁ +﹂ +﹍ +﹏ +﹐ +﹑ +﹒ +﹔ +﹕ +﹖ +﹗ +﹙ +﹚ +﹝ +﹞ +﹡ +﹣ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +。 +「 +」 +、 +・ +ッ +ー +イ +ク +シ +ス +ト +ノ +フ +ラ +ル +ン +゙ +゚ + ̄ +¥ +👍 +🔥 +😂 +😎 +... 
+yam +10 +2017 +12 +11 +2016 +20 +30 +15 +06 +lofter +##s +2015 +by +16 +14 +18 +13 +24 +17 +2014 +21 +##0 +22 +19 +25 +23 +com +100 +00 +05 +2013 +##a +03 +09 +08 +28 +##2 +50 +01 +04 +##1 +27 +02 +2012 +##3 +26 +##e +07 +##8 +##5 +##6 +##4 +##9 +##7 +29 +2011 +40 +##t +2010 +##o +##d +##i +2009 +##n +app +www +the +##m +31 +##c +##l +##y +##r +##g +2008 +60 +http +200 +qq +##p +80 +##f +google +pixnet +90 +cookies +tripadvisor +500 +##er +##k +35 +##h +facebook +2007 +2000 +70 +##b +of +##x +##u +45 +300 +iphone +32 +1000 +2006 +48 +ip +36 +in +38 +3d +##w +##ing +55 +ctrip +##on +##v +33 +##の +to +34 +400 +id +2005 +it +37 +windows +llc +top +99 +42 +39 +000 +led +at +##an +41 +51 +52 +46 +49 +43 +53 +44 +##z +android +58 +and +59 +2004 +56 +vr +##か +5000 +2003 +47 +blogthis +twitter +54 +##le +150 +ok +2018 +57 +75 +cn +no +ios +##in +##mm +##00 +800 +on +te +3000 +65 +2001 +360 +95 +ig +lv +120 +##ng +##を +##us +##に +pc +てす +── +600 +##te +85 +2002 +88 +##ed +html +ncc +wifi +email +64 +blog +is +##10 +##て +mail +online +##al +dvd +##ic +studio +##は +##℃ +##ia +##と +line +vip +72 +##q +98 +##ce +##en +for +##is +##ra +##es +##j +usb +net +cp +1999 +asia +4g +##cm +diy +new +3c +##お +ta +66 +language +vs +apple +tw +86 +web +##ne +ipad +62 +you +##re +101 +68 +##tion +ps +de +bt +pony +atm +##2017 +1998 +67 +##ch +ceo +##or +go +##na +av +pro +cafe +96 +pinterest +97 +63 +pixstyleme3c +##ta +more +said +##2016 +1997 +mp3 +700 +##ll +nba +jun +##20 +92 +tv +1995 +pm +61 +76 +nbsp +250 +##ie +linux +##ma +cd +110 +hd +##17 +78 +##ion +77 +6000 +am +##th +##st +94 +##se +##et +69 +180 +gdp +my +105 +81 +abc +89 +flash +79 +one +93 +1990 +1996 +##ck +gps +##も +##ly +web885 +106 +2020 +91 +##ge +4000 +1500 +xd +boss +isbn +1994 +org +##ry +me +love +##11 +0fork +73 +##12 +3g +##ter +##ar +71 +82 +##la +hotel +130 +1970 +pk +83 +87 +140 +ie +##os +##30 +##el +74 +##50 +seo +cpu +##ml +p2p +84 +may +##る +sun +tue +internet +cc +posted +youtube +##at +##ン +##man +ii +##ル +##15 +abs +nt +pdf +yahoo +ago +1980 +##it +news +mac +104 +##てす +##me +##り +java +1992 +spa +##de +##nt +hk +all +plus +la +1993 +##mb +##16 +##ve +west +##da +160 +air +##い +##ps +から +##to +1989 +logo +htc +php +https +fi +momo +##son +sat +##ke +##80 +ebd +suv +wi +day +apk +##88 +##um +mv +galaxy +wiki +or +brake +##ス +1200 +する +this +1991 +mon +##こ +❤2017 +po +##ない +javascript +life +home +june +##ss +system +900 +##ー +##0 +pp +1988 +world +fb +4k +br +##as +ic +ai +leonardo +safari +##60 +live +free +xx +wed +win7 +kiehl +##co +lg +o2o +##go +us +235 +1949 +mm +しい +vfm +kanye +##90 +##2015 +##id +jr +##ey +123 +rss +##sa +##ro +##am +##no +thu +fri +350 +##sh +##ki +103 +comments +name +##のて +##pe +##ine +max +1987 +8000 +uber +##mi +##ton +wordpress +office +1986 +1985 +##ment +107 +bd +win10 +##ld +##li +gmail +bb +dior +##rs +##ri +##rd +##ます +up +cad +##® +dr +して +read +##21 +をお +##io +##99 +url +1984 +pvc +paypal +show +policy +##40 +##ty +##18 +with +##★ +##01 +txt +102 +##ba +dna +from +post +mini +ar +taiwan +john +##ga +privacy +agoda +##13 +##ny +word +##24 +##22 +##by +##ur +##hz +1982 +##ang +265 +cookie +netscape +108 +##ka +##~ +##ad +house +share +note +ibm +code +hello +nike +sim +survey +##016 +1979 +1950 +wikia +##32 +##017 +5g +cbc +##tor +##kg +1983 +##rt +##14 +campaign +store +2500 +os +##ct +##ts +##° +170 +api +##ns +365 +excel +##な +##ao +##ら +##し +~~ +##nd +university +163 +には +518 +##70 +##ya +##il +##25 +pierre +ipo +0020 +897 +##23 +hotels +##ian +のお +125 +years +6606 +##ers +##26 +high 
+##day +time +##ay +bug +##line +##く +##す +##be +xp +talk2yam +yamservice +10000 +coco +##dy +sony +##ies +1978 +microsoft +david +people +##ha +1960 +instagram +intel +その +##ot +iso +1981 +##va +115 +##mo +##land +xxx +man +co +ltxsw +##ation +baby +220 +##pa +##ol +1945 +7000 +tag +450 +##ue +msn +##31 +oppo +##ト +##ca +control +##om +st +chrome +##ure +##ん +be +##き +lol +##19 +した +##bo +240 +lady +##100 +##way +##から +4600 +##ko +##do +##un +4s +corporation +168 +##ni +herme +##28 +cp +978 +##up +##06 +ui +##ds +ppt +admin +three +します +bbc +re +128 +##48 +ca +##015 +##35 +hp +##ee +tpp +##た +##ive +×× +root +##cc +##ました +##ble +##ity +adobe +park +114 +et +oled +city +##ex +##ler +##ap +china +##book +20000 +view +##ice +global +##km +your +hong +##mg +out +##ms +ng +ebay +##29 +menu +ubuntu +##cy +rom +##view +open +ktv +do +server +##lo +if +english +##ね +##5 +##oo +1600 +##02 +step1 +kong +club +135 +july +inc +1976 +mr +hi +##net +touch +##ls +##ii +michael +lcd +##05 +##33 +phone +james +step2 +1300 +ios9 +##box +dc +##2 +##ley +samsung +111 +280 +pokemon +css +##ent +##les +いいえ +##1 +s8 +atom +play +bmw +##said +sa +etf +ctrl +♥yoyo♥ +##55 +2025 +##2014 +##66 +adidas +amazon +1958 +##ber +##ner +visa +##77 +##der +1800 +connectivity +##hi +firefox +109 +118 +hr +so +style +mark +pop +ol +skip +1975 +as +##27 +##ir +##61 +190 +mba +##う +##ai +le +##ver +1900 +cafe2017 +lte +super +113 +129 +##ron +amd +like +##☆ +are +##ster +we +##sk +paul +data +international +##ft +longchamp +ssd +good +##ート +##ti +reply +##my +↓↓↓ +apr +star +##ker +source +136 +js +112 +get +force +photo +##one +126 +##2013 +##ow +link +bbs +1972 +goods +##lin +python +119 +##ip +game +##ics +##ません +blue +##● +520 +##45 +page +itunes +##03 +1955 +260 +1968 +gt +gif +618 +##ff +##47 +group +くたさい +about +bar +ganji +##nce +music +lee +not +1977 +1971 +1973 +##per +an +faq +comment +##って +days +##ock +116 +##bs +1974 +1969 +v1 +player +1956 +xbox +sql +fm +f1 +139 +##ah +210 +##lv +##mp +##000 +melody +1957 +##3 +550 +17life +199 +1966 +xml +market +##au +##71 +999 +##04 +what +gl +##95 +##age +tips +##68 +book +##ting +mysql +can +1959 +230 +##ung +wonderland +watch +10℃ +##ction +9000 +mar +mobile +1946 +1962 +article +##db +part +▲top +party +って +1967 +1964 +1948 +##07 +##ore +##op +この +dj +##78 +##38 +010 +main +225 +1965 +##ong +art +320 +ad +134 +020 +##73 +117 +pm2 +japan +228 +##08 +ts +1963 +##ica +der +sm +##36 +2019 +##wa +ct +##7 +##や +##64 +1937 +homemesh +search +##85 +##れは +##tv +##di +macbook +##9 +##くたさい +service +##♥ +type +った +750 +##ier +##si +##75 +##います +##ok +best +##ット +goris +lock +##った +cf +3m +big +##ut +ftp +carol +##vi +10 +1961 +happy +sd +##ac +122 +anti +pe +cnn +iii +1920 +138 +##ラ +1940 +esp +jan +tags +##98 +##51 +august +vol +##86 +154 +##™ +##fs +##れ +##sion +design +ac +##ム +press +jordan +ppp +that +key +check +##6 +##tt +##㎡ +1080p +##lt +power +##42 +1952 +##bc +vivi +##ック +he +133 +121 +jpg +##rry +201 +175 +3500 +1947 +nb +##ted +##rn +しています +1954 +usd +##t00 +master +##ンク +001 +model +##58 +al +##09 +1953 +##34 +ram +goo +ても +##ui +127 +1930 +red +##ary +rpg +item +##pm +##41 +270 +##za +project +##2012 +hot +td +blogabstract +##ger +##62 +650 +##44 +gr2 +##します +##m +black +electronic +nfc +year +asus +また +html5 +cindy +##hd +m3 +132 +esc +##od +booking +##53 +fed +tvb +##81 +##ina +mit +165 +##いる +chan +192 +distribution +next +になる +peter +bios +steam +cm +1941 +にも +pk10 +##ix +##65 +##91 +dec +nasa +##ana +icecat +00z +b1 +will +##46 +li +se +##ji +##み +##ard +oct 
+##ain +jp +##ze +##bi +cio +##56 +smart +h5 +##39 +##port +curve +vpn +##nm +##dia +utc +##あり +12345678910 +##52 +rmvb +chanel +a4 +miss +##and +##im +media +who +##63 +she +girl +5s +124 +vera +##して +class +vivo +king +##フ +##ei +national +ab +1951 +5cm +888 +145 +ipod +ap +1100 +5mm +211 +ms +2756 +##69 +mp4 +msci +##po +##89 +131 +mg +index +380 +##bit +##out +##zz +##97 +##67 +158 +apec +##8 +photoshop +opec +¥799 +ては +##96 +##tes +##ast +2g +○○ +##ール +¥2899 +##ling +##よ +##ory +1938 +##ical +kitty +content +##43 +step3 +##cn +win8 +155 +vc +1400 +iphone7 +robert +##した +tcl +137 +beauty +##87 +en +dollars +##ys +##oc +step +pay +yy +a1 +##2011 +##lly +##ks +##♪ +1939 +188 +download +1944 +sep +exe +ph +います +school +gb +center +pr +street +##board +uv +##37 +##lan +winrar +##que +##ua +##com +1942 +1936 +480 +gpu +##4 +ettoday +fu +tom +##54 +##ren +##via +149 +##72 +b2b +144 +##79 +##tch +rose +arm +mb +##49 +##ial +##nn +nvidia +step4 +mvp +00㎡ +york +156 +##イ +how +cpi +591 +2765 +gov +kg +joe +##xx +mandy +pa +##ser +copyright +fashion +1935 +don +##け +ecu +##ist +##art +erp +wap +have +##lm +talk +##ek +##ning +##if +ch +##ite +video +1943 +cs +san +iot +look +##84 +##2010 +##ku +october +##ux +trump +##hs +##ide +box +141 +first +##ins +april +##ight +##83 +185 +angel +protected +aa +151 +162 +x1 +m2 +##fe +##× +##ho +size +143 +min +ofo +fun +gomaji +ex +hdmi +food +dns +march +chris +kevin +##のか +##lla +##pp +##ec +ag +ems +6s +720p +##rm +##ham +off +##92 +asp +team +fandom +ed +299 +▌♥ +##ell +info +されています +##82 +sina +4066 +161 +##able +##ctor +330 +399 +315 +dll +rights +ltd +idc +jul +3kg +1927 +142 +ma +surface +##76 +##ク +~~~ +304 +mall +eps +146 +green +##59 +map +space +donald +v2 +sodu +##light +1931 +148 +1700 +まて +310 +reserved +htm +##han +##57 +2d +178 +mod +##ise +##tions +152 +ti +##shi +doc +1933 +icp +055 +wang +##ram +shopping +aug +##pi +##well +now +wam +b2 +からお +##hu +236 +1928 +##gb +266 +f2 +##93 +153 +mix +##ef +##uan +bwl +##plus +##res +core +##ess +tea +5℃ +hktvmall +nhk +##ate +list +##ese +301 +feb +4m +inn +ての +nov +159 +12345 +daniel +##ci +pass +##bet +##nk +coffee +202 +ssl +airbnb +##ute +fbi +woshipm +skype +ea +cg +sp +##fc +##www +yes +edge +alt +007 +##94 +fpga +##ght +##gs +iso9001 +さい +##ile +##wood +##uo +image +lin +icon +american +##em +1932 +set +says +##king +##tive +blogger +##74 +なと +256 +147 +##ox +##zy +##red +##ium +##lf +nokia +claire +##リ +##ding +november +lohas +##500 +##tic +##マ +##cs +##ある +##che +##ire +##gy +##ult +db +january +win +##カ +166 +road +ptt +##ま +##つ +198 +##fa +##mer +anna +pchome +はい +udn +ef +420 +##time +##tte +2030 +##ア +g20 +white +かかります +1929 +308 +garden +eleven +di +##おります +chen +309b +777 +172 +young +cosplay +ちてない +4500 +bat +##123 +##tra +##ては +kindle +npc +steve +etc +##ern +##| +call +xperia +ces +travel +sk +s7 +##ous +1934 +##int +みいたたけます +183 +edu +file +cho +qr +##car +##our +186 +##ant +##d +eric +1914 +rends +##jo +##する +mastercard +##2000 +kb +##min +290 +##ino +vista +##ris +##ud +jack +2400 +##set +169 +pos +1912 +##her +##ou +taipei +しく +205 +beta +##ませんか +232 +##fi +express +255 +body +##ill +aphojoy +user +december +meiki +##ick +tweet +richard +##av +##ᆫ +iphone6 +##dd +ちてすか +views +##mark +321 +pd +##00 +times +##▲ +level +##ash +10g +point +5l +##ome +208 +koreanmall +##ak +george +q2 +206 +wma +tcp +##200 +スタッフ +full +mlb +##lle +##watch +tm +run +179 +911 +smith +business +##und +1919 +color +##tal +222 +171 +##less +moon +4399 +##rl +update +pcb +shop +499 +157 +little +なし 
+end +##mhz +van +dsp +easy +660 +##house +##key +history +##o +oh +##001 +##hy +##web +oem +let +was +##2009 +##gg +review +##wan +182 +##°c +203 +uc +title +##val +united +233 +2021 +##ons +doi +trivago +overdope +sbs +##ance +##ち +grand +special +573032185 +imf +216 +wx17house +##so +##ーム +audi +##he +london +william +##rp +##ake +science +beach +cfa +amp +ps4 +880 +##800 +##link +##hp +crm +ferragamo +bell +make +##eng +195 +under +zh +photos +2300 +##style +##ント +via +176 +da +##gi +company +i7 +##ray +thomas +370 +ufo +i5 +##max +plc +ben +back +research +8g +173 +mike +##pc +##ッフ +september +189 +##ace +vps +february +167 +pantos +wp +lisa +1921 +★★ +jquery +night +long +offer +##berg +##news +1911 +##いて +ray +fks +wto +せます +over +164 +340 +##all +##rus +1924 +##888 +##works +blogtitle +loftpermalink +##→ +187 +martin +test +ling +km +##め +15000 +fda +v3 +##ja +##ロ +wedding +かある +outlet +family +##ea +をこ +##top +story +##ness +salvatore +##lu +204 +swift +215 +room +している +oracle +##ul +1925 +sam +b2c +week +pi +rock +##のは +##a +##けと +##ean +##300 +##gle +cctv +after +chinese +##back +powered +x2 +##tan +1918 +##nes +##イン +canon +only +181 +##zi +##las +say +##oe +184 +##sd +221 +##bot +##world +##zo +sky +made +top100 +just +1926 +pmi +802 +234 +gap +##vr +177 +les +174 +▲topoct +ball +vogue +vi +ing +ofweek +cos +##list +##ort +▲topmay +##なら +##lon +として +last +##tc +##of +##bus +##gen +real +eva +##コ +a3 +nas +##lie +##ria +##coin +##bt +▲topapr +his +212 +cat +nata +vive +health +⋯⋯ +drive +sir +▲topmar +du +cup +##カー +##ook +##よう +##sy +alex +msg +tour +しました +3ce +##word +193 +ebooks +r8 +block +318 +##より +2200 +nice +pvp +207 +months +1905 +rewards +##ther +1917 +0800 +##xi +##チ +##sc +micro +850 +gg +blogfp +op +1922 +daily +m1 +264 +true +##bb +ml +##tar +##のお +##ky +anthony +196 +253 +##yo +state +218 +##ara +##aa +##rc +##tz +##ston +より +gear +##eo +##ade +ge +see +1923 +##win +##ura +ss +heart +##den +##ita +down +##sm +el +png +2100 +610 +rakuten +whatsapp +bay +dream +add +##use +680 +311 +pad +gucci +mpv +##ode +##fo +island +▲topjun +##▼ +223 +jason +214 +chicago +##❤ +しの +##hone +io +##れる +##ことか +sogo +be2 +##ology +990 +cloud +vcd +##con +2~3 +##ford +##joy +##kb +##こさいます +##rade +but +##ach +docker +##ful +rfid +ul +##ase +hit +ford +##star +580 +##○ +11 +a2 +sdk +reading +edited +##are +cmos +##mc +238 +siri +light +##ella +##ため +bloomberg +##read +pizza +##ison +jimmy +##vm +college +node +journal +ba +18k +##play +245 +##cer +20 +magic +##yu +191 +jump +288 +tt +##ings +asr +##lia +3200 +step5 +network +##cd +mc +いします +1234 +pixstyleme +273 +##600 +2800 +money +★★★★★ +1280 +12 +430 +bl +みの +act +##tus +tokyo +##rial +##life +emba +##ae +saas +tcs +##rk +##wang +summer +##sp +ko +##ving +390 +premium +##その +netflix +##ヒ +uk +mt +##lton +right +frank +two +209 +える +##ple +##cal +021 +##んな +##sen +##ville +hold +nexus +dd +##ius +てお +##mah +##なく +tila +zero +820 +ce +##tin +resort +##ws +charles +old +p10 +5d +report +##360 +##ru +##には +bus +vans +lt +##est +pv +##レ +links +rebecca +##ツ +##dm +azure +##365 +きな +limited +bit +4gb +##mon +1910 +moto +##eam +213 +1913 +var +eos +なとの +226 +blogspot +された +699 +e3 +dos +dm +fc +##ments +##ik +##kw +boy +##bin +##ata +960 +er +##せ +219 +##vin +##tu +##ula +194 +##∥ +station +##ろ +##ature +835 +files +zara +hdr +top10 +nature +950 +magazine +s6 +marriott +##シ +avira +case +##っと +tab +##ran +tony +##home +oculus +im +##ral +jean +saint +cry +307 +rosie +##force +##ini +ice +##bert +のある +##nder +##mber +pet +2600 +##◆ +plurk 
+▲topdec +##sis +00kg +▲topnov +720 +##ence +tim +##ω +##nc +##ても +##name +log +ips +great +ikea +malaysia +unix +##イト +3600 +##ncy +##nie +12000 +akb48 +##ye +##oid +404 +##chi +##いた +oa +xuehai +##1000 +##orm +##rf +275 +さん +##ware +##リー +980 +ho +##pro +text +##era +560 +bob +227 +##ub +##2008 +8891 +scp +avi +##zen +2022 +mi +wu +museum +qvod +apache +lake +jcb +▲topaug +★★★ +ni +##hr +hill +302 +ne +weibo +490 +ruby +##ーシ +##ヶ +##row +4d +▲topjul +iv +##ish +github +306 +mate +312 +##スト +##lot +##ane +andrew +のハイト +##tina +t1 +rf +ed2k +##vel +##900 +way +final +りの +ns +5a +705 +197 +##メ +sweet +bytes +##ene +▲topjan +231 +##cker +##2007 +##px +100g +topapp +229 +helpapp +rs +low +14k +g4g +care +630 +ldquo +あり +##fork +leave +rm +edition +##gan +##zon +##qq +▲topsep +##google +##ism +gold +224 +explorer +##zer +toyota +category +select +visual +##labels +restaurant +##md +posts +s1 +##ico +もっと +angelababy +123456 +217 +sports +s3 +mbc +1915 +してくたさい +shell +x86 +candy +##new +kbs +face +xl +470 +##here +4a +swissinfo +v8 +▲topfeb +dram +##ual +##vice +3a +##wer +sport +q1 +ios10 +public +int +card +##c +ep +au +rt +##れた +1080 +bill +##mll +kim +30 +460 +wan +##uk +##ミ +x3 +298 +0t +scott +##ming +239 +e5 +##3d +h7n9 +worldcat +brown +##あります +##vo +##led +##580 +##ax +249 +410 +##ert +paris +##~6 +polo +925 +##lr +599 +##ナ +capital +##hing +bank +cv +1g +##chat +##s +##たい +adc +##ule +2m +##e +digital +hotmail +268 +##pad +870 +bbq +quot +##ring +before +wali +##まて +mcu +2k +2b +という +costco +316 +north +333 +switch +##city +##p +philips +##mann +management +panasonic +##cl +##vd +##ping +##rge +alice +##lk +##ましょう +css3 +##ney +vision +alpha +##ular +##400 +##tter +lz +にお +##ありません +mode +gre +1916 +pci +##tm +237 +1~2 +##yan +##そ +について +##let +##キ +work +war +coach +ah +mary +##ᅵ +huang +##pt +a8 +pt +follow +##berry +1895 +##ew +a5 +ghost +##ション +##wn +##og +south +##code +girls +##rid +action +villa +git +r11 +table +games +##cket +error +##anonymoussaid +##ag +here +##ame +##gc +qa +##■ +##lis +gmp +##gin +vmalife +##cher +yu +wedding +##tis +demo +dragon +530 +soho +social +bye +##rant +river +orz +acer +325 +##↑ +##ース +##ats +261 +del +##ven +440 +ups +##ように +##ター +305 +value +macd +yougou +##dn +661 +##ano +ll +##urt +##rent +continue +script +##wen +##ect +paper +263 +319 +shift +##chel +##フト +##cat +258 +x5 +fox +243 +##さん +car +aaa +##blog +loading +##yn +##tp +kuso +799 +si +sns +イカせるテンマ +ヒンクテンマ3 +rmb +vdc +forest +central +prime +help +ultra +##rmb +##ような +241 +square +688 +##しい +のないフロクに +##field +##reen +##ors +##ju +c1 +start +510 +##air +##map +cdn +##wo +cba +stephen +m8 +100km +##get +opera +##base +##ood +vsa +com™ +##aw +##ail +251 +なのて +count +t2 +##ᅡ +##een +2700 +hop +##gp +vsc +tree +##eg +##ose +816 +285 +##ories +##shop +alphago +v4 +1909 +simon +##ᆼ +fluke62max +zip +スホンサー +##sta +louis +cr +bas +##~10 +bc +##yer +hadoop +##ube +##wi +1906 +0755 +hola +##low +place +centre +5v +d3 +##fer +252 +##750 +##media +281 +540 +0l +exchange +262 +series +##ハー +##san +eb +##bank +##k +q3 +##nge +##mail +take +##lp +259 +1888 +client +east +cache +event +vincent +##ールを +きを +##nse +sui +855 +adchoice +##и +##stry +##なたの +246 +##zone +ga +apps +sea +##ab +248 +cisco +##タ +##rner +kymco +##care +dha +##pu +##yi +minkoff +royal +p1 +への +annie +269 +collection +kpi +playstation +257 +になります +866 +bh +##bar +queen +505 +radio +1904 +andy +armani +##xy +manager +iherb +##ery +##share +spring +raid +johnson +1908 +##ob +volvo +hall +##ball +v6 +our +taylor +##hk +bi +242 +##cp 
+kate +bo +water +technology +##rie +サイトは +277 +##ona +##sl +hpv +303 +gtx +hip +rdquo +jayz +stone +##lex +##rum +namespace +##やり +620 +##ale +##atic +des +##erson +##ql +##ves +##type +enter +##この +##てきます +d2 +##168 +##mix +##bian +との +a9 +jj +ky +##lc +access +movie +##hc +リストに +tower +##ration +##mit +ます +##nch +ua +tel +prefix +##o2 +1907 +##point +1901 +ott +~10 +##http +##ury +baidu +##ink +member +##logy +bigbang +nownews +##js +##shot +##tb +##こと +247 +eba +##tics +##lus +ける +v5 +spark +##ama +there +##ions +god +##lls +##down +hiv +##ress +burberry +day2 +##kv +◆◆ +jeff +related +film +edit +joseph +283 +##ark +cx +32gb +order +g9 +30000 +##ans +##tty +s5 +##bee +かあります +thread +xr +buy +sh +005 +land +spotify +mx +##ari +276 +##verse +×email +sf +why +##ことて +244 +7headlines +nego +sunny +dom +exo +401 +666 +positioning +fit +rgb +##tton +278 +kiss +alexa +adam +lp +みリストを +##g +mp +##ties +##llow +amy +##du +np +002 +institute +271 +##rth +##lar +2345 +590 +##des +sidebar +15 +imax +site +##cky +##kit +##ime +##009 +season +323 +##fun +##ンター +##ひ +gogoro +a7 +pu +lily +fire +twd600 +##ッセーシを +いて +##vis +30ml +##cture +##をお +information +##オ +close +friday +##くれる +yi +nick +てすか +##tta +##tel +6500 +##lock +cbd +economy +254 +かお +267 +tinker +double +375 +8gb +voice +##app +oops +channel +today +985 +##right +raw +xyz +##+ +jim +edm +##cent +7500 +supreme +814 +ds +##its +##asia +dropbox +##てすか +##tti +books +272 +100ml +##tle +##ller +##ken +##more +##boy +sex +309 +##dom +t3 +##ider +##なります +##unch +1903 +810 +feel +5500 +##かった +##put +により +s2 +mo +##gh +men +ka +amoled +div +##tr +##n1 +port +howard +##tags +ken +dnf +##nus +adsense +##а +ide +##へ +buff +thunder +##town +##ique +has +##body +auto +pin +##erry +tee +てした +295 +number +##the +##013 +object +psp +cool +udnbkk +16gb +##mic +miui +##tro +most +r2 +##alk +##nity +1880 +±0 +##いました +428 +s4 +law +version +##oa +n1 +sgs +docomo +##tf +##ack +henry +fc2 +##ded +##sco +##014 +##rite +286 +0mm +linkedin +##ada +##now +wii +##ndy +ucbug +##◎ +sputniknews +legalminer +##ika +##xp +2gb +##bu +q10 +oo +b6 +come +##rman +cheese +ming +maker +##gm +nikon +##fig +ppi +kelly +##ります +jchere +てきます +ted +md +003 +fgo +tech +##tto +dan +soc +##gl +##len +hair +earth +640 +521 +img +##pper +##a1 +##てきる +##ロク +acca +##ition +##ference +suite +##ig +outlook +##mond +##cation +398 +##pr +279 +101vip +358 +##999 +282 +64gb +3800 +345 +airport +##over +284 +##おり +jones +##ith +lab +##su +##いるのて +co2 +town +piece +##llo +no1 +vmware +24h +##qi +focus +reader +##admin +##ora +tb +false +##log +1898 +know +lan +838 +##ces +f4 +##ume +motel +stop +##oper +na +flickr +netcomponents +##af +##─ +pose +williams +local +##ound +##cg +##site +##iko +いお +274 +5m +gsm +con +##ath +1902 +friends +##hip +cell +317 +##rey +780 +cream +##cks +012 +##dp +facebooktwitterpinterestgoogle +sso +324 +shtml +song +swiss +##mw +##キンク +lumia +xdd +string +tiffany +522 +marc +られた +insee +russell +sc +dell +##ations +ok +camera +289 +##vs +##flow +##late +classic +287 +##nter +stay +g1 +mtv +512 +##ever +##lab +##nger +qe +sata +ryan +d1 +50ml +cms +##cing +su +292 +3300 +editor +296 +##nap +security +sunday +association +##ens +##700 +##bra +acg +##かり +sofascore +とは +mkv +##ign +jonathan +gary +build +labels +##oto +tesla +moba +qi +gohappy +general +ajax +1024 +##かる +サイト +society +##test +##urs +wps +fedora +##ich +mozilla +328 +##480 +##dr +usa +urn +##lina +##r +grace +##die +##try +##ader +1250 +##なり +elle +570 +##chen +##ᆯ +price +##ten +uhz +##ough +eq +##hen 
+states +push +session +balance +wow +506 +##cus +##py +when +##ward +##ep +34e +wong +library +prada +##サイト +##cle +running +##ree +313 +ck +date +q4 +##ctive +##ool +##> +mk +##ira +##163 +388 +die +secret +rq +dota +buffet +は1ヶ +e6 +##ez +pan +368 +ha +##card +##cha +2a +##さ +alan +day3 +eye +f3 +##end +france +keep +adi +rna +tvbs +##ala +solo +nova +##え +##tail +##ょう +support +##ries +##なる +##ved +base +copy +iis +fps +##ways +hero +hgih +profile +fish +mu +ssh +entertainment +chang +##wd +click +cake +##ond +pre +##tom +kic +pixel +##ov +##fl +product +6a +##pd +dear +##gate +es +yumi +audio +##² +##sky +echo +bin +where +##ture +329 +##ape +find +sap +isis +##なと +nand +##101 +##load +##ream +band +a6 +525 +never +##post +festival +50cm +##we +555 +guide +314 +zenfone +##ike +335 +gd +forum +jessica +strong +alexander +##ould +software +allen +##ious +program +360° +else +lohasthree +##gar +することかてきます +please +##れます +rc +##ggle +##ric +bim +50000 +##own +eclipse +355 +brian +3ds +##side +061 +361 +##other +##ける +##tech +##ator +485 +engine +##ged +##t +plaza +##fit +cia +ngo +westbrook +shi +tbs +50mm +##みませんか +sci +291 +reuters +##ily +contextlink +##hn +af +##cil +bridge +very +##cel +1890 +cambridge +##ize +15g +##aid +##data +790 +frm +##head +award +butler +##sun +meta +##mar +america +ps3 +puma +pmid +##すか +lc +670 +kitchen +##lic +オーフン5 +きなしソフトサーヒス +そして +day1 +future +★★★★ +##text +##page +##rris +pm1 +##ket +fans +##っています +1001 +christian +bot +kids +trackback +##hai +c3 +display +##hl +n2 +1896 +idea +さんも +##sent +airmail +##ug +##men +pwm +けます +028 +##lution +369 +852 +awards +schemas +354 +asics +wikipedia +font +##tional +##vy +c2 +293 +##れている +##dget +##ein +っている +contact +pepper +スキル +339 +##~5 +294 +##uel +##ument +730 +##hang +みてす +q5 +##sue +rain +##ndi +wei +swatch +##cept +わせ +331 +popular +##ste +##tag +p2 +501 +trc +1899 +##west +##live +justin +honda +ping +messenger +##rap +v9 +543 +##とは +unity +appqq +はすへて +025 +leo +##tone +##テ +##ass +uniqlo +##010 +502 +her +jane +memory +moneydj +##tical +human +12306 +していると +##m2 +coc +miacare +##mn +tmt +##core +vim +kk +##may +fan +target +use +too +338 +435 +2050 +867 +737 +fast +##2c +services +##ope +omega +energy +##わ +pinkoi +1a +##なから +##rain +jackson +##ement +##シャンルの +374 +366 +そんな +p9 +rd +##ᆨ +1111 +##tier +##vic +zone +##│ +385 +690 +dl +isofix +cpa +m4 +322 +kimi +めて +davis +##lay +lulu +##uck +050 +weeks +qs +##hop +920 +##n +ae +##ear +~5 +eia +405 +##fly +korea +jpeg +boost +##ship +small +##リア +1860 +eur +297 +425 +valley +##iel +simple +##ude +rn +k2 +##ena +されます +non +patrick +しているから +##ナー +feed +5757 +30g +process +well +qqmei +##thing +they +aws +lu +pink +##ters +##kin +または +board +##vertisement +wine +##ien +unicode +##dge +r1 +359 +##tant +いを +##twitter +##3c +cool1 +される +##れて +##l +isp +##012 +standard +45㎡2 +402 +##150 +matt +##fu +326 +##iner +googlemsn +pixnetfacebookyahoo +##ラン +x7 +886 +##uce +メーカー +sao +##ev +##きました +##file +9678 +403 +xddd +shirt +6l +##rio +##hat +3mm +givenchy +ya +bang +##lio +monday +crystal +ロクイン +##abc +336 +head +890 +ubuntuforumwikilinuxpastechat +##vc +##~20 +##rity +cnc +7866 +ipv6 +null +1897 +##ost +yang +imsean +tiger +##fet +##ンス +352 +##= +dji +327 +ji +maria +##come +##んて +foundation +3100 +##beth +##なった +1m +601 +active +##aft +##don +3p +sr +349 +emma +##khz +living +415 +353 +1889 +341 +709 +457 +sas +x6 +##face +pptv +x4 +##mate +han +sophie +##jing +337 +fifa +##mand +other +sale +inwedding +##gn +てきちゃいます +##mmy +##pmlast +bad +nana +nbc +してみてくたさいね 
+なとはお +##wu +##かあります +##あ +note7 +single +##340 +せからこ +してくたさい♪この +しにはとんとんワークケートを +するとあなたにもっとマッチした +ならワークケートへ +もみつかっちゃうかも +ワークケートの +##bel +window +##dio +##ht +union +age +382 +14 +##ivity +##y +コメント +domain +neo +##isa +##lter +5k +f5 +steven +##cts +powerpoint +tft +self +g2 +ft +##テル +zol +##act +mwc +381 +343 +もう +nbapop +408 +てある +eds +ace +##room +previous +author +tomtom +il +##ets +hu +financial +☆☆☆ +っています +bp +5t +chi +1gb +##hg +fairmont +cross +008 +gay +h2 +function +##けて +356 +also +1b +625 +##ータ +##raph +1894 +3~5 +##ils +i3 +334 +avenue +##host +による +##bon +##tsu +message +navigation +50g +fintech +h6 +##ことを +8cm +##ject +##vas +##firm +credit +##wf +xxxx +form +##nor +##space +huawei +plan +json +sbl +##dc +machine +921 +392 +wish +##120 +##sol +windows7 +edward +##ために +development +washington +##nsis +lo +818 +##sio +##ym +##bor +planet +##~8 +##wt +ieee +gpa +##めて +camp +ann +gm +##tw +##oka +connect +##rss +##work +##atus +wall +chicken +soul +2mm +##times +fa +##ather +##cord +009 +##eep +hitachi +gui +harry +##pan +e1 +disney +##press +##ーション +wind +386 +frigidaire +##tl +liu +hsu +332 +basic +von +ev +いた +てきる +スホンサーサイト +learning +##ull +expedia +archives +change +##wei +santa +cut +ins +6gb +turbo +brand +cf1 +508 +004 +return +747 +##rip +h1 +##nis +##をこ +128gb +##にお +3t +application +しており +emc +rx +##oon +384 +quick +412 +15058 +wilson +wing +chapter +##bug +beyond +##cms +##dar +##oh +zoom +e2 +trip +sb +##nba +rcep +342 +aspx +ci +080 +gc +gnu +める +##count +advanced +dance +dv +##url +##ging +367 +8591 +am09 +shadow +battle +346 +##i +##cia +##という +emily +##のてす +##tation +host +ff +techorz +sars +##mini +##mporary +##ering +nc +4200 +798 +##next +cma +##mbps +##gas +##ift +##dot +##ィ +455 +##~17 +amana +##りの +426 +##ros +ir +00㎡1 +##eet +##ible +##↓ +710 +ˋ▽ˊ +##aka +dcs +iq +##v +l1 +##lor +maggie +##011 +##iu +588 +##~1 +830 +##gt +1tb +articles +create +##burg +##iki +database +fantasy +##rex +##cam +dlc +dean +##you +hard +path +gaming +victoria +maps +cb +##lee +##itor +overchicstoretvhome +systems +##xt +416 +p3 +sarah +760 +##nan +407 +486 +x9 +install +second +626 +##ann +##ph +##rcle +##nic +860 +##nar +ec +##とう +768 +metro +chocolate +##rian +~4 +##table +##しています +skin +##sn +395 +mountain +##0mm +inparadise +6m +7x24 +ib +4800 +##jia +eeworld +creative +g5 +g3 +357 +parker +ecfa +village +からの +18000 +sylvia +サーヒス +hbl +##ques +##onsored +##x2 +##きます +##v4 +##tein +ie6 +383 +##stack +389 +ver +##ads +##baby +sound +bbe +##110 +##lone +##uid +ads +022 +gundam +351 +thinkpad +006 +scrum +match +##ave +mems +##470 +##oy +##なりました +##talk +glass +lamigo +span +##eme +job +##a5 +jay +wade +kde +498 +##lace +ocean +tvg +##covery +##r3 +##ners +##rea +junior +think +##aine +cover +##ision +##sia +↓↓ +##bow +msi +413 +458 +406 +##love +711 +801 +soft +z2 +##pl +456 +1840 +mobil +mind +##uy +427 +nginx +##oi +めた +##rr +6221 +##mple +##sson +##ーシてす +371 +##nts +91tv +comhd +crv3000 +##uard +1868 +397 +deep +lost +field +gallery +##bia +rate +spf +redis +traction +930 +icloud +011 +なら +fe +jose +372 +##tory +into +sohu +fx +899 +379 +kicstart2 +##hia +すく +##~3 +##sit +ra +24 +##walk +##xure +500g +##pact +pacific +xa +natural +carlo +##250 +##walker +1850 +##can +cto +gigi +516 +##サー +pen +##hoo +ob +matlab +##b +##yy +13913459 +##iti +mango +##bbs +sense +c5 +oxford +##ニア +walker +jennifer +##ola +course +##bre +701 +##pus +##rder +lucky +075 +##ぁ +ivy +なお +##nia +sotheby +side +##ugh +joy +##orage +##ush +##bat +##dt +364 +r9 +##2d +##gio +511 +country +wear 
+##lax +##~7 +##moon +393 +seven +study +411 +348 +lonzo +8k +##ェ +evolution +##イフ +##kk +gs +kd +##レス +arduino +344 +b12 +##lux +arpg +##rdon +cook +##x5 +dark +five +##als +##ida +とても +sign +362 +##ちの +something +20mm +##nda +387 +##posted +fresh +tf +1870 +422 +cam +##mine +##skip +##form +##ssion +education +394 +##tee +dyson +stage +##jie +want +##night +epson +pack +あります +##ppy +テリヘル +##█ +wd +##eh +##rence +left +##lvin +golden +mhz +discovery +##trix +##n2 +loft +##uch +##dra +##sse +speed +~1 +1mdb +sorry +welcome +##urn +wave +gaga +##lmer +teddy +##160 +トラックハック +せよ +611 +##f2016 +378 +rp +##sha +rar +##あなたに +##きた +840 +holiday +##ュー +373 +074 +##vg +##nos +##rail +gartner +gi +6p +##dium +kit +488 +b3 +eco +##ろう +20g +sean +##stone +autocad +nu +##np +f16 +write +029 +m5 +##ias +images +atp +##dk +fsm +504 +1350 +ve +52kb +##xxx +##のに +##cake +414 +unit +lim +ru +1v +##ification +published +angela +16g +analytics +ak +##q +##nel +gmt +##icon +again +##₂ +##bby +ios11 +445 +かこさいます +waze +いてす +##ハ +9985 +##ust +##ティー +framework +##007 +iptv +delete +52sykb +cl +wwdc +027 +30cm +##fw +##ての +1389 +##xon +brandt +##ses +##dragon +tc +vetements +anne +monte +modern +official +##へて +##ere +##nne +##oud +もちろん +50 +etnews +##a2 +##graphy +421 +863 +##ちゃん +444 +##rtex +##てお +l2 +##gma +mount +ccd +たと +archive +morning +tan +ddos +e7 +##ホ +day4 +##ウ +gis +453 +its +495 +factory +bruce +pg +##ito +ってくたさい +guest +cdma +##lling +536 +n3 +しかし +3~4 +mega +eyes +ro +13 +women +dac +church +##jun +singapore +##facebook +6991 +starbucks +##tos +##stin +##shine +zen +##mu +tina +20℃ +1893 +##たけて +503 +465 +request +##gence +qt +##っ +1886 +347 +363 +q7 +##zzi +diary +##tore +409 +##ead +468 +cst +##osa +canada +agent +va +##jiang +##ちは +##ーク +##lam +sg +##nix +##sday +##よって +g6 +##master +bing +##zl +charlie +16 +8mm +nb40 +##ーン +thai +##ルフ +ln284ct +##itz +##2f +bonnie +##food +##lent +originals +##stro +##lts +418 +∟∣ +##bscribe +children +ntd +yesstyle +##かも +hmv +##tment +d5 +2cm +arts +sms +##pn +##я +##いい +topios9 +539 +lifestyle +virtual +##ague +xz +##deo +muji +024 +unt +##nnis +##ᅩ +faq1 +1884 +396 +##ette +fly +64㎡ +はしめまして +441 +curry +##pop +のこ +release +##← +##◆◆ +##cast +073 +ありな +500ml +##ews +5c +##stle +ios7 +##ima +787 +dog +lenovo +##r4 +roger +013 +cbs +vornado +100m +417 +##desk +##クok +##ald +1867 +9595 +2900 +##van +oil +##x +some +break +common +##jy +##lines +g7 +twice +419 +ella +nano +belle +にこ +##mes +##self +##note +jb +##ことかてきます +benz +##との +##ova +451 +save +##wing +##ますのて +kai +りは +##hua +##rect +rainer +##unge +448 +##0m +adsl +##かな +guestname +##uma +##kins +##zu +tokichoi +##price +county +##med +##mus +rmk +391 +address +vm +えて +openload +##group +##hin +##iginal +amg +urban +##oz +jobs +emi +##public +beautiful +##sch +album +##dden +##bell +jerry +works +hostel +miller +##drive +##rmin +##10 +376 +boot +828 +##370 +##fx +##cm~ +1885 +##nome +##ctionary +##oman +##lish +##cr +##hm +433 +##how +432 +francis +xi +c919 +b5 +evernote +##uc +vga +##3000 +coupe +##urg +##cca +##uality +019 +6g +れる +multi +##また +##ett +em +hey +##ani +##tax +##rma +inside +than +740 +leonnhurt +##jin +ict +れた +bird +notes +200mm +くの +##dical +##lli +result +442 +iu +ee +438 +smap +gopro +##last +yin +pure +998 +32g +けた +5kg +##dan +##rame +mama +##oot +bean +marketing +##hur +2l +bella +sync +xuite +##ground +515 +discuz +##getrelax +##ince +##bay +##5s +cj +##イス +gmat +apt +##pass +jing +##rix +c4 +rich +##とても +niusnews +##ello +bag +770 +##eting +##mobile +18 +culture +015 +##のてすか 
+377 +1020 +area +##ience +616 +details +gp +universal +silver +dit +はお +private +ddd +u11 +kanshu +##ified +fung +##nny +dx +##520 +tai +475 +023 +##fr +##lean +3s +##pin +429 +##rin +25000 +ly +rick +##bility +usb3 +banner +##baru +##gion +metal +dt +vdf +1871 +karl +qualcomm +bear +1010 +oldid +ian +jo +##tors +population +##ernel +1882 +mmorpg +##mv +##bike +603 +##© +ww +friend +##ager +exhibition +##del +##pods +fpx +structure +##free +##tings +kl +##rley +##copyright +##mma +california +3400 +orange +yoga +4l +canmake +honey +##anda +##コメント +595 +nikkie +##ルハイト +dhl +publishing +##mall +##gnet +20cm +513 +##クセス +##┅ +e88 +970 +##dog +fishbase +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##+ +##, +##- +##. +##/ +##: +##; +##< +##= +##> +##? +##@ +##[ +##\ +##] +##^ +##_ +##{ +##| +##} +##~ +##£ +##¤ +##¥ +##§ +##« +##± +##³ +##µ +##· +##¹ +##º +##» +##¼ +##ß +##æ +##÷ +##ø +##đ +##ŋ +##ɔ +##ə +##ɡ +##ʰ +##ˇ +##ˈ +##ˊ +##ˋ +##ˍ +##ː +##˙ +##˚ +##ˢ +##α +##β +##γ +##δ +##ε +##η +##θ +##ι +##κ +##λ +##μ +##ν +##ο +##π +##ρ +##ς +##σ +##τ +##υ +##φ +##χ +##ψ +##б +##в +##г +##д +##е +##ж +##з +##к +##л +##м +##н +##о +##п +##р +##с +##т +##у +##ф +##х +##ц +##ч +##ш +##ы +##ь +##і +##ا +##ب +##ة +##ت +##د +##ر +##س +##ع +##ل +##م +##ن +##ه +##و +##ي +##۩ +##ก +##ง +##น +##ม +##ย +##ร +##อ +##า +##เ +##๑ +##་ +##ღ +##ᄀ +##ᄁ +##ᄂ +##ᄃ +##ᄅ +##ᄆ +##ᄇ +##ᄈ +##ᄉ +##ᄋ +##ᄌ +##ᄎ +##ᄏ +##ᄐ +##ᄑ +##ᄒ +##ᅢ +##ᅣ +##ᅥ +##ᅦ +##ᅧ +##ᅨ +##ᅪ +##ᅬ +##ᅭ +##ᅮ +##ᅯ +##ᅲ +##ᅳ +##ᅴ +##ᆷ +##ᆸ +##ᆺ +##ᆻ +##ᗜ +##ᵃ +##ᵉ +##ᵍ +##ᵏ +##ᵐ +##ᵒ +##ᵘ +##‖ +##„ +##† +##• +##‥ +##‧ +##
 +##‰ +##′ +##″ +##‹ +##› +##※ +##‿ +##⁄ +##ⁱ +##⁺ +##ⁿ +##₁ +##₃ +##₄ +##€ +##№ +##ⅰ +##ⅱ +##ⅲ +##ⅳ +##ⅴ +##↔ +##↗ +##↘ +##⇒ +##∀ +##− +##∕ +##∙ +##√ +##∞ +##∟ +##∠ +##∣ +##∩ +##∮ +##∶ +##∼ +##∽ +##≈ +##≒ +##≡ +##≤ +##≥ +##≦ +##≧ +##≪ +##≫ +##⊙ +##⋅ +##⋈ +##⋯ +##⌒ +##① +##② +##③ +##④ +##⑤ +##⑥ +##⑦ +##⑧ +##⑨ +##⑩ +##⑴ +##⑵ +##⑶ +##⑷ +##⑸ +##⒈ +##⒉ +##⒊ +##⒋ +##ⓒ +##ⓔ +##ⓘ +##━ +##┃ +##┆ +##┊ +##┌ +##└ +##├ +##┣ +##═ +##║ +##╚ +##╞ +##╠ +##╭ +##╮ +##╯ +##╰ +##╱ +##╳ +##▂ +##▃ +##▅ +##▇ +##▉ +##▋ +##▌ +##▍ +##▎ +##□ +##▪ +##▫ +##▬ +##△ +##▶ +##► +##▽ +##◇ +##◕ +##◠ +##◢ +##◤ +##☀ +##☕ +##☞ +##☺ +##☼ +##♀ +##♂ +##♠ +##♡ +##♣ +##♦ +##♫ +##♬ +##✈ +##✔ +##✕ +##✖ +##✦ +##✨ +##✪ +##✰ +##✿ +##❀ +##➜ +##➤ +##⦿ +##、 +##。 +##〃 +##々 +##〇 +##〈 +##〉 +##《 +##》 +##「 +##」 +##『 +##』 +##【 +##】 +##〓 +##〔 +##〕 +##〖 +##〗 +##〜 +##〝 +##〞 +##ぃ +##ぇ +##ぬ +##ふ +##ほ +##む +##ゃ +##ゅ +##ゆ +##ょ +##゜ +##ゝ +##ァ +##ゥ +##エ +##ォ +##ケ +##サ +##セ +##ソ +##ッ +##ニ +##ヌ +##ネ +##ノ +##ヘ +##モ +##ャ +##ヤ +##ュ +##ユ +##ョ +##ヨ +##ワ +##ヲ +##・ +##ヽ +##ㄅ +##ㄆ +##ㄇ +##ㄉ +##ㄋ +##ㄌ +##ㄍ +##ㄎ +##ㄏ +##ㄒ +##ㄚ +##ㄛ +##ㄞ +##ㄟ +##ㄢ +##ㄤ +##ㄥ +##ㄧ +##ㄨ +##ㆍ +##㈦ +##㊣ +##㗎 +##一 +##丁 +##七 +##万 +##丈 +##三 +##上 +##下 +##不 +##与 +##丐 +##丑 +##专 +##且 +##丕 +##世 +##丘 +##丙 +##业 +##丛 +##东 +##丝 +##丞 +##丟 +##両 +##丢 +##两 +##严 +##並 +##丧 +##丨 +##个 +##丫 +##中 +##丰 +##串 +##临 +##丶 +##丸 +##丹 +##为 +##主 +##丼 +##丽 +##举 +##丿 +##乂 +##乃 +##久 +##么 +##义 +##之 +##乌 +##乍 +##乎 +##乏 +##乐 +##乒 +##乓 +##乔 +##乖 +##乗 +##乘 +##乙 +##乜 +##九 +##乞 +##也 +##习 +##乡 +##书 +##乩 +##买 +##乱 +##乳 +##乾 +##亀 +##亂 +##了 +##予 +##争 +##事 +##二 +##于 +##亏 +##云 +##互 +##五 +##井 +##亘 +##亙 +##亚 +##些 +##亜 +##亞 +##亟 +##亡 +##亢 +##交 +##亥 +##亦 +##产 +##亨 +##亩 +##享 +##京 +##亭 +##亮 +##亲 +##亳 +##亵 +##人 +##亿 +##什 +##仁 +##仃 +##仄 +##仅 +##仆 +##仇 +##今 +##介 +##仍 +##从 +##仏 +##仑 +##仓 +##仔 +##仕 +##他 +##仗 +##付 +##仙 +##仝 +##仞 +##仟 +##代 +##令 +##以 +##仨 +##仪 +##们 +##仮 +##仰 +##仲 +##件 +##价 +##任 +##份 +##仿 +##企 +##伉 +##伊 +##伍 +##伎 +##伏 +##伐 +##休 +##伕 +##众 +##优 +##伙 +##会 +##伝 +##伞 +##伟 +##传 +##伢 +##伤 +##伦 +##伪 +##伫 +##伯 +##估 +##伴 +##伶 +##伸 +##伺 +##似 +##伽 +##佃 +##但 +##佇 +##佈 +##位 +##低 +##住 +##佐 +##佑 +##体 +##佔 +##何 +##佗 +##佘 +##余 +##佚 +##佛 +##作 +##佝 +##佞 +##佟 +##你 +##佢 +##佣 +##佤 +##佥 +##佩 +##佬 +##佯 +##佰 +##佳 +##併 +##佶 +##佻 +##佼 +##使 +##侃 +##侄 +##來 +##侈 +##例 +##侍 +##侏 +##侑 +##侖 +##侗 +##供 +##依 +##侠 +##価 +##侣 +##侥 +##侦 +##侧 +##侨 +##侬 +##侮 +##侯 +##侵 +##侶 +##侷 +##便 +##係 +##促 +##俄 +##俊 +##俎 +##俏 +##俐 +##俑 +##俗 +##俘 +##俚 +##保 +##俞 +##俟 +##俠 +##信 +##俨 +##俩 +##俪 +##俬 +##俭 +##修 +##俯 +##俱 +##俳 +##俸 +##俺 +##俾 +##倆 +##倉 +##個 +##倌 +##倍 +##倏 +##們 +##倒 +##倔 +##倖 +##倘 +##候 +##倚 +##倜 +##借 +##倡 +##値 +##倦 +##倩 +##倪 +##倫 +##倬 +##倭 +##倶 +##债 +##值 +##倾 +##偃 +##假 +##偈 +##偉 +##偌 +##偎 +##偏 +##偕 +##做 +##停 +##健 +##側 +##偵 +##偶 +##偷 +##偻 +##偽 +##偿 +##傀 +##傅 +##傍 +##傑 +##傘 +##備 +##傚 +##傢 +##傣 +##傥 +##储 +##傩 +##催 +##傭 +##傲 +##傳 +##債 +##傷 +##傻 +##傾 +##僅 +##働 +##像 +##僑 +##僕 +##僖 +##僚 +##僥 +##僧 +##僭 +##僮 +##僱 +##僵 +##價 +##僻 +##儀 +##儂 +##億 +##儆 +##儉 +##儋 +##儒 +##儕 +##儘 +##償 +##儡 +##優 +##儲 +##儷 +##儼 +##儿 +##兀 +##允 +##元 +##兄 +##充 +##兆 +##兇 +##先 +##光 +##克 +##兌 +##免 +##児 +##兑 +##兒 +##兔 +##兖 +##党 +##兜 +##兢 +##入 +##內 +##全 +##兩 +##八 +##公 +##六 +##兮 +##兰 +##共 +##兲 +##关 +##兴 +##兵 +##其 +##具 +##典 +##兹 +##养 +##兼 +##兽 +##冀 +##内 +##円 +##冇 +##冈 +##冉 +##冊 +##册 +##再 +##冏 +##冒 +##冕 +##冗 +##写 +##军 +##农 +##冠 +##冢 +##冤 +##冥 +##冨 +##冪 +##冬 +##冯 +##冰 +##冲 +##决 +##况 +##冶 +##冷 +##冻 +##冼 +##冽 +##冾 +##净 +##凄 +##准 +##凇 +##凈 +##凉 +##凋 +##凌 +##凍 +##减 +##凑 +##凛 +##凜 +##凝 +##几 +##凡 +##凤 +##処 +##凪 +##凭 +##凯 +##凰 +##凱 +##凳 +##凶 +##凸 +##凹 +##出 +##击 +##函 +##凿 +##刀 +##刁 +##刃 +##分 +##切 +##刈 +##刊 +##刍 +##刎 +##刑 +##划 +##列 +##刘 
+##则 +##刚 +##创 +##初 +##删 +##判 +##別 +##刨 +##利 +##刪 +##别 +##刮 +##到 +##制 +##刷 +##券 +##刹 +##刺 +##刻 +##刽 +##剁 +##剂 +##剃 +##則 +##剉 +##削 +##剋 +##剌 +##前 +##剎 +##剐 +##剑 +##剔 +##剖 +##剛 +##剜 +##剝 +##剣 +##剤 +##剥 +##剧 +##剩 +##剪 +##副 +##割 +##創 +##剷 +##剽 +##剿 +##劃 +##劇 +##劈 +##劉 +##劊 +##劍 +##劏 +##劑 +##力 +##劝 +##办 +##功 +##加 +##务 +##劣 +##动 +##助 +##努 +##劫 +##劭 +##励 +##劲 +##劳 +##労 +##劵 +##効 +##劾 +##势 +##勁 +##勃 +##勇 +##勉 +##勋 +##勐 +##勒 +##動 +##勖 +##勘 +##務 +##勛 +##勝 +##勞 +##募 +##勢 +##勤 +##勧 +##勳 +##勵 +##勸 +##勺 +##勻 +##勾 +##勿 +##匀 +##包 +##匆 +##匈 +##匍 +##匐 +##匕 +##化 +##北 +##匙 +##匝 +##匠 +##匡 +##匣 +##匪 +##匮 +##匯 +##匱 +##匹 +##区 +##医 +##匾 +##匿 +##區 +##十 +##千 +##卅 +##升 +##午 +##卉 +##半 +##卍 +##华 +##协 +##卑 +##卒 +##卓 +##協 +##单 +##卖 +##南 +##単 +##博 +##卜 +##卞 +##卟 +##占 +##卡 +##卢 +##卤 +##卦 +##卧 +##卫 +##卮 +##卯 +##印 +##危 +##即 +##却 +##卵 +##卷 +##卸 +##卻 +##卿 +##厂 +##厄 +##厅 +##历 +##厉 +##压 +##厌 +##厕 +##厘 +##厚 +##厝 +##原 +##厢 +##厥 +##厦 +##厨 +##厩 +##厭 +##厮 +##厲 +##厳 +##去 +##县 +##叁 +##参 +##參 +##又 +##叉 +##及 +##友 +##双 +##反 +##収 +##发 +##叔 +##取 +##受 +##变 +##叙 +##叛 +##叟 +##叠 +##叡 +##叢 +##口 +##古 +##句 +##另 +##叨 +##叩 +##只 +##叫 +##召 +##叭 +##叮 +##可 +##台 +##叱 +##史 +##右 +##叵 +##叶 +##号 +##司 +##叹 +##叻 +##叼 +##叽 +##吁 +##吃 +##各 +##吆 +##合 +##吉 +##吊 +##吋 +##同 +##名 +##后 +##吏 +##吐 +##向 +##吒 +##吓 +##吕 +##吖 +##吗 +##君 +##吝 +##吞 +##吟 +##吠 +##吡 +##否 +##吧 +##吨 +##吩 +##含 +##听 +##吭 +##吮 +##启 +##吱 +##吳 +##吴 +##吵 +##吶 +##吸 +##吹 +##吻 +##吼 +##吽 +##吾 +##呀 +##呂 +##呃 +##呆 +##呈 +##告 +##呋 +##呎 +##呐 +##呓 +##呕 +##呗 +##员 +##呛 +##呜 +##呢 +##呤 +##呦 +##周 +##呱 +##呲 +##味 +##呵 +##呷 +##呸 +##呻 +##呼 +##命 +##咀 +##咁 +##咂 +##咄 +##咆 +##咋 +##和 +##咎 +##咏 +##咐 +##咒 +##咔 +##咕 +##咖 +##咗 +##咘 +##咙 +##咚 +##咛 +##咣 +##咤 +##咦 +##咧 +##咨 +##咩 +##咪 +##咫 +##咬 +##咭 +##咯 +##咱 +##咲 +##咳 +##咸 +##咻 +##咽 +##咿 +##哀 +##品 +##哂 +##哄 +##哆 +##哇 +##哈 +##哉 +##哋 +##哌 +##响 +##哎 +##哏 +##哐 +##哑 +##哒 +##哔 +##哗 +##哟 +##員 +##哥 +##哦 +##哧 +##哨 +##哩 +##哪 +##哭 +##哮 +##哲 +##哺 +##哼 +##哽 +##唁 +##唄 +##唆 +##唇 +##唉 +##唏 +##唐 +##唑 +##唔 +##唠 +##唤 +##唧 +##唬 +##售 +##唯 +##唰 +##唱 +##唳 +##唷 +##唸 +##唾 +##啃 +##啄 +##商 +##啉 +##啊 +##問 +##啓 +##啕 +##啖 +##啜 +##啞 +##啟 +##啡 +##啤 +##啥 +##啦 +##啧 +##啪 +##啫 +##啬 +##啮 +##啰 +##啱 +##啲 +##啵 +##啶 +##啷 +##啸 +##啻 +##啼 +##啾 +##喀 +##喂 +##喃 +##善 +##喆 +##喇 +##喉 +##喊 +##喋 +##喎 +##喏 +##喔 +##喘 +##喙 +##喚 +##喜 +##喝 +##喟 +##喧 +##喪 +##喫 +##喬 +##單 +##喰 +##喱 +##喲 +##喳 +##喵 +##営 +##喷 +##喹 +##喺 +##喻 +##喽 +##嗅 +##嗆 +##嗇 +##嗎 +##嗑 +##嗒 +##嗓 +##嗔 +##嗖 +##嗚 +##嗜 +##嗝 +##嗟 +##嗡 +##嗣 +##嗤 +##嗦 +##嗨 +##嗪 +##嗬 +##嗯 +##嗰 +##嗲 +##嗳 +##嗶 +##嗷 +##嗽 +##嘀 +##嘅 +##嘆 +##嘈 +##嘉 +##嘌 +##嘍 +##嘎 +##嘔 +##嘖 +##嘗 +##嘘 +##嘚 +##嘛 +##嘜 +##嘞 +##嘟 +##嘢 +##嘣 +##嘤 +##嘧 +##嘩 +##嘭 +##嘮 +##嘯 +##嘰 +##嘱 +##嘲 +##嘴 +##嘶 +##嘸 +##嘹 +##嘻 +##嘿 +##噁 +##噌 +##噎 +##噓 +##噔 +##噗 +##噙 +##噜 +##噠 +##噢 +##噤 +##器 +##噩 +##噪 +##噬 +##噱 +##噴 +##噶 +##噸 +##噹 +##噻 +##噼 +##嚀 +##嚇 +##嚎 +##嚏 +##嚐 +##嚓 +##嚕 +##嚟 +##嚣 +##嚥 +##嚨 +##嚮 +##嚴 +##嚷 +##嚼 +##囂 +##囉 +##囊 +##囍 +##囑 +##囔 +##囗 +##囚 +##四 +##囝 +##回 +##囟 +##因 +##囡 +##团 +##団 +##囤 +##囧 +##囪 +##囫 +##园 +##困 +##囱 +##囲 +##図 +##围 +##囹 +##固 +##国 +##图 +##囿 +##圃 +##圄 +##圆 +##圈 +##國 +##圍 +##圏 +##園 +##圓 +##圖 +##團 +##圜 +##土 +##圣 +##圧 +##在 +##圩 +##圭 +##地 +##圳 +##场 +##圻 +##圾 +##址 +##坂 +##均 +##坊 +##坍 +##坎 +##坏 +##坐 +##坑 +##块 +##坚 +##坛 +##坝 +##坞 +##坟 +##坠 +##坡 +##坤 +##坦 +##坨 +##坪 +##坯 +##坳 +##坵 +##坷 +##垂 +##垃 +##垄 +##型 +##垒 +##垚 +##垛 +##垠 +##垢 +##垣 +##垦 +##垩 +##垫 +##垭 +##垮 +##垵 +##埂 +##埃 +##埋 +##城 +##埔 +##埕 +##埗 +##域 +##埠 +##埤 +##埵 +##執 +##埸 +##培 +##基 +##埼 +##堀 +##堂 +##堃 +##堅 +##堆 +##堇 +##堑 +##堕 +##堙 +##堡 +##堤 +##堪 +##堯 +##堰 +##報 +##場 +##堵 +##堺 +##堿 +##塊 +##塌 +##塑 +##塔 +##塗 +##塘 +##塚 +##塞 +##塢 +##塩 +##填 +##塬 +##塭 +##塵 +##塾 +##墀 +##境 +##墅 +##墉 +##墊 +##墒 
+##墓 +##増 +##墘 +##墙 +##墜 +##增 +##墟 +##墨 +##墩 +##墮 +##墳 +##墻 +##墾 +##壁 +##壅 +##壆 +##壇 +##壊 +##壑 +##壓 +##壕 +##壘 +##壞 +##壟 +##壢 +##壤 +##壩 +##士 +##壬 +##壮 +##壯 +##声 +##売 +##壳 +##壶 +##壹 +##壺 +##壽 +##处 +##备 +##変 +##复 +##夏 +##夔 +##夕 +##外 +##夙 +##多 +##夜 +##够 +##夠 +##夢 +##夥 +##大 +##天 +##太 +##夫 +##夭 +##央 +##夯 +##失 +##头 +##夷 +##夸 +##夹 +##夺 +##夾 +##奂 +##奄 +##奇 +##奈 +##奉 +##奋 +##奎 +##奏 +##奐 +##契 +##奔 +##奕 +##奖 +##套 +##奘 +##奚 +##奠 +##奢 +##奥 +##奧 +##奪 +##奬 +##奮 +##女 +##奴 +##奶 +##奸 +##她 +##好 +##如 +##妃 +##妄 +##妆 +##妇 +##妈 +##妊 +##妍 +##妒 +##妓 +##妖 +##妘 +##妙 +##妝 +##妞 +##妣 +##妤 +##妥 +##妨 +##妩 +##妪 +##妮 +##妲 +##妳 +##妹 +##妻 +##妾 +##姆 +##姉 +##姊 +##始 +##姍 +##姐 +##姑 +##姒 +##姓 +##委 +##姗 +##姚 +##姜 +##姝 +##姣 +##姥 +##姦 +##姨 +##姪 +##姫 +##姬 +##姹 +##姻 +##姿 +##威 +##娃 +##娄 +##娅 +##娆 +##娇 +##娉 +##娑 +##娓 +##娘 +##娛 +##娜 +##娟 +##娠 +##娣 +##娥 +##娩 +##娱 +##娲 +##娴 +##娶 +##娼 +##婀 +##婁 +##婆 +##婉 +##婊 +##婕 +##婚 +##婢 +##婦 +##婧 +##婪 +##婭 +##婴 +##婵 +##婶 +##婷 +##婺 +##婿 +##媒 +##媚 +##媛 +##媞 +##媧 +##媲 +##媳 +##媽 +##媾 +##嫁 +##嫂 +##嫉 +##嫌 +##嫑 +##嫔 +##嫖 +##嫘 +##嫚 +##嫡 +##嫣 +##嫦 +##嫩 +##嫲 +##嫵 +##嫻 +##嬅 +##嬉 +##嬌 +##嬗 +##嬛 +##嬢 +##嬤 +##嬪 +##嬰 +##嬴 +##嬷 +##嬸 +##嬿 +##孀 +##孃 +##子 +##孑 +##孔 +##孕 +##孖 +##字 +##存 +##孙 +##孚 +##孛 +##孜 +##孝 +##孟 +##孢 +##季 +##孤 +##学 +##孩 +##孪 +##孫 +##孬 +##孰 +##孱 +##孳 +##孵 +##學 +##孺 +##孽 +##孿 +##宁 +##它 +##宅 +##宇 +##守 +##安 +##宋 +##完 +##宏 +##宓 +##宕 +##宗 +##官 +##宙 +##定 +##宛 +##宜 +##宝 +##实 +##実 +##宠 +##审 +##客 +##宣 +##室 +##宥 +##宦 +##宪 +##宫 +##宮 +##宰 +##害 +##宴 +##宵 +##家 +##宸 +##容 +##宽 +##宾 +##宿 +##寂 +##寄 +##寅 +##密 +##寇 +##富 +##寐 +##寒 +##寓 +##寛 +##寝 +##寞 +##察 +##寡 +##寢 +##寥 +##實 +##寧 +##寨 +##審 +##寫 +##寬 +##寮 +##寰 +##寵 +##寶 +##寸 +##对 +##寺 +##寻 +##导 +##対 +##寿 +##封 +##専 +##射 +##将 +##將 +##專 +##尉 +##尊 +##尋 +##對 +##導 +##小 +##少 +##尔 +##尕 +##尖 +##尘 +##尚 +##尝 +##尤 +##尧 +##尬 +##就 +##尴 +##尷 +##尸 +##尹 +##尺 +##尻 +##尼 +##尽 +##尾 +##尿 +##局 +##屁 +##层 +##屄 +##居 +##屆 +##屈 +##屉 +##届 +##屋 +##屌 +##屍 +##屎 +##屏 +##屐 +##屑 +##展 +##屜 +##属 +##屠 +##屡 +##屢 +##層 +##履 +##屬 +##屯 +##山 +##屹 +##屿 +##岀 +##岁 +##岂 +##岌 +##岐 +##岑 +##岔 +##岖 +##岗 +##岘 +##岙 +##岚 +##岛 +##岡 +##岩 +##岫 +##岬 +##岭 +##岱 +##岳 +##岷 +##岸 +##峇 +##峋 +##峒 +##峙 +##峡 +##峤 +##峥 +##峦 +##峨 +##峪 +##峭 +##峯 +##峰 +##峴 +##島 +##峻 +##峽 +##崁 +##崂 +##崆 +##崇 +##崎 +##崑 +##崔 +##崖 +##崗 +##崙 +##崛 +##崧 +##崩 +##崭 +##崴 +##崽 +##嵇 +##嵊 +##嵋 +##嵌 +##嵐 +##嵘 +##嵩 +##嵬 +##嵯 +##嶂 +##嶄 +##嶇 +##嶋 +##嶙 +##嶺 +##嶼 +##嶽 +##巅 +##巍 +##巒 +##巔 +##巖 +##川 +##州 +##巡 +##巢 +##工 +##左 +##巧 +##巨 +##巩 +##巫 +##差 +##己 +##已 +##巳 +##巴 +##巷 +##巻 +##巽 +##巾 +##巿 +##币 +##市 +##布 +##帅 +##帆 +##师 +##希 +##帐 +##帑 +##帕 +##帖 +##帘 +##帚 +##帛 +##帜 +##帝 +##帥 +##带 +##帧 +##師 +##席 +##帮 +##帯 +##帰 +##帳 +##帶 +##帷 +##常 +##帼 +##帽 +##幀 +##幂 +##幄 +##幅 +##幌 +##幔 +##幕 +##幟 +##幡 +##幢 +##幣 +##幫 +##干 +##平 +##年 +##并 +##幸 +##幹 +##幺 +##幻 +##幼 +##幽 +##幾 +##广 +##庁 +##広 +##庄 +##庆 +##庇 +##床 +##序 +##庐 +##库 +##应 +##底 +##庖 +##店 +##庙 +##庚 +##府 +##庞 +##废 +##庠 +##度 +##座 +##庫 +##庭 +##庵 +##庶 +##康 +##庸 +##庹 +##庾 +##廁 +##廂 +##廃 +##廈 +##廉 +##廊 +##廓 +##廖 +##廚 +##廝 +##廟 +##廠 +##廢 +##廣 +##廬 +##廳 +##延 +##廷 +##建 +##廿 +##开 +##弁 +##异 +##弃 +##弄 +##弈 +##弊 +##弋 +##式 +##弑 +##弒 +##弓 +##弔 +##引 +##弗 +##弘 +##弛 +##弟 +##张 +##弥 +##弦 +##弧 +##弩 +##弭 +##弯 +##弱 +##張 +##強 +##弹 +##强 +##弼 +##弾 +##彅 +##彆 +##彈 +##彌 +##彎 +##归 +##当 +##录 +##彗 +##彙 +##彝 +##形 +##彤 +##彥 +##彦 +##彧 +##彩 +##彪 +##彫 +##彬 +##彭 +##彰 +##影 +##彷 +##役 +##彻 +##彼 +##彿 +##往 +##征 +##径 +##待 +##徇 +##很 +##徉 +##徊 +##律 +##後 +##徐 +##徑 +##徒 +##従 +##徕 +##得 +##徘 +##徙 +##徜 +##從 +##徠 +##御 +##徨 +##復 +##循 +##徬 +##微 +##徳 +##徴 +##徵 +##德 +##徹 +##徼 +##徽 +##心 +##必 +##忆 +##忌 +##忍 +##忏 +##忐 +##忑 +##忒 +##忖 +##志 +##忘 +##忙 +##応 +##忠 +##忡 +##忤 +##忧 +##忪 +##快 +##忱 +##念 +##忻 +##忽 +##忿 +##怀 
+##态 +##怂 +##怅 +##怆 +##怎 +##怏 +##怒 +##怔 +##怕 +##怖 +##怙 +##怜 +##思 +##怠 +##怡 +##急 +##怦 +##性 +##怨 +##怪 +##怯 +##怵 +##总 +##怼 +##恁 +##恃 +##恆 +##恋 +##恍 +##恐 +##恒 +##恕 +##恙 +##恚 +##恢 +##恣 +##恤 +##恥 +##恨 +##恩 +##恪 +##恫 +##恬 +##恭 +##息 +##恰 +##恳 +##恵 +##恶 +##恸 +##恺 +##恻 +##恼 +##恿 +##悄 +##悅 +##悉 +##悌 +##悍 +##悔 +##悖 +##悚 +##悟 +##悠 +##患 +##悦 +##您 +##悩 +##悪 +##悬 +##悯 +##悱 +##悲 +##悴 +##悵 +##悶 +##悸 +##悻 +##悼 +##悽 +##情 +##惆 +##惇 +##惊 +##惋 +##惑 +##惕 +##惘 +##惚 +##惜 +##惟 +##惠 +##惡 +##惦 +##惧 +##惨 +##惩 +##惫 +##惬 +##惭 +##惮 +##惯 +##惰 +##惱 +##想 +##惴 +##惶 +##惹 +##惺 +##愁 +##愆 +##愈 +##愉 +##愍 +##意 +##愕 +##愚 +##愛 +##愜 +##感 +##愣 +##愤 +##愧 +##愫 +##愷 +##愿 +##慄 +##慈 +##態 +##慌 +##慎 +##慑 +##慕 +##慘 +##慚 +##慟 +##慢 +##慣 +##慧 +##慨 +##慫 +##慮 +##慰 +##慳 +##慵 +##慶 +##慷 +##慾 +##憂 +##憊 +##憋 +##憎 +##憐 +##憑 +##憔 +##憚 +##憤 +##憧 +##憨 +##憩 +##憫 +##憬 +##憲 +##憶 +##憾 +##懂 +##懇 +##懈 +##應 +##懊 +##懋 +##懑 +##懒 +##懦 +##懲 +##懵 +##懶 +##懷 +##懸 +##懺 +##懼 +##懾 +##懿 +##戀 +##戈 +##戊 +##戌 +##戍 +##戎 +##戏 +##成 +##我 +##戒 +##戕 +##或 +##战 +##戚 +##戛 +##戟 +##戡 +##戦 +##截 +##戬 +##戮 +##戰 +##戲 +##戳 +##戴 +##戶 +##户 +##戸 +##戻 +##戾 +##房 +##所 +##扁 +##扇 +##扈 +##扉 +##手 +##才 +##扎 +##扑 +##扒 +##打 +##扔 +##払 +##托 +##扛 +##扣 +##扦 +##执 +##扩 +##扪 +##扫 +##扬 +##扭 +##扮 +##扯 +##扰 +##扱 +##扳 +##扶 +##批 +##扼 +##找 +##承 +##技 +##抄 +##抉 +##把 +##抑 +##抒 +##抓 +##投 +##抖 +##抗 +##折 +##抚 +##抛 +##抜 +##択 +##抟 +##抠 +##抡 +##抢 +##护 +##报 +##抨 +##披 +##抬 +##抱 +##抵 +##抹 +##押 +##抽 +##抿 +##拂 +##拄 +##担 +##拆 +##拇 +##拈 +##拉 +##拋 +##拌 +##拍 +##拎 +##拐 +##拒 +##拓 +##拔 +##拖 +##拗 +##拘 +##拙 +##拚 +##招 +##拜 +##拟 +##拡 +##拢 +##拣 +##拥 +##拦 +##拧 +##拨 +##择 +##括 +##拭 +##拮 +##拯 +##拱 +##拳 +##拴 +##拷 +##拼 +##拽 +##拾 +##拿 +##持 +##挂 +##指 +##挈 +##按 +##挎 +##挑 +##挖 +##挙 +##挚 +##挛 +##挝 +##挞 +##挟 +##挠 +##挡 +##挣 +##挤 +##挥 +##挨 +##挪 +##挫 +##振 +##挲 +##挹 +##挺 +##挽 +##挾 +##捂 +##捅 +##捆 +##捉 +##捋 +##捌 +##捍 +##捎 +##捏 +##捐 +##捕 +##捞 +##损 +##捡 +##换 +##捣 +##捧 +##捨 +##捩 +##据 +##捱 +##捲 +##捶 +##捷 +##捺 +##捻 +##掀 +##掂 +##掃 +##掇 +##授 +##掉 +##掌 +##掏 +##掐 +##排 +##掖 +##掘 +##掙 +##掛 +##掠 +##採 +##探 +##掣 +##接 +##控 +##推 +##掩 +##措 +##掬 +##掰 +##掲 +##掳 +##掴 +##掷 +##掸 +##掺 +##揀 +##揃 +##揄 +##揆 +##揉 +##揍 +##描 +##提 +##插 +##揖 +##揚 +##換 +##握 +##揣 +##揩 +##揪 +##揭 +##揮 +##援 +##揶 +##揸 +##揹 +##揽 +##搀 +##搁 +##搂 +##搅 +##損 +##搏 +##搐 +##搓 +##搔 +##搖 +##搗 +##搜 +##搞 +##搡 +##搪 +##搬 +##搭 +##搵 +##搶 +##携 +##搽 +##摀 +##摁 +##摄 +##摆 +##摇 +##摈 +##摊 +##摒 +##摔 +##摘 +##摞 +##摟 +##摧 +##摩 +##摯 +##摳 +##摸 +##摹 +##摺 +##摻 +##撂 +##撃 +##撅 +##撇 +##撈 +##撐 +##撑 +##撒 +##撓 +##撕 +##撚 +##撞 +##撤 +##撥 +##撩 +##撫 +##撬 +##播 +##撮 +##撰 +##撲 +##撵 +##撷 +##撸 +##撻 +##撼 +##撿 +##擀 +##擁 +##擂 +##擄 +##擅 +##擇 +##擊 +##擋 +##操 +##擎 +##擒 +##擔 +##擘 +##據 +##擞 +##擠 +##擡 +##擢 +##擦 +##擬 +##擰 +##擱 +##擲 +##擴 +##擷 +##擺 +##擼 +##擾 +##攀 +##攏 +##攒 +##攔 +##攘 +##攙 +##攜 +##攝 +##攞 +##攢 +##攣 +##攤 +##攥 +##攪 +##攫 +##攬 +##支 +##收 +##攸 +##改 +##攻 +##放 +##政 +##故 +##效 +##敌 +##敍 +##敎 +##敏 +##救 +##敕 +##敖 +##敗 +##敘 +##教 +##敛 +##敝 +##敞 +##敢 +##散 +##敦 +##敬 +##数 +##敲 +##整 +##敵 +##敷 +##數 +##斂 +##斃 +##文 +##斋 +##斌 +##斎 +##斐 +##斑 +##斓 +##斗 +##料 +##斛 +##斜 +##斟 +##斡 +##斤 +##斥 +##斧 +##斩 +##斫 +##斬 +##断 +##斯 +##新 +##斷 +##方 +##於 +##施 +##旁 +##旃 +##旅 +##旋 +##旌 +##旎 +##族 +##旖 +##旗 +##无 +##既 +##日 +##旦 +##旧 +##旨 +##早 +##旬 +##旭 +##旮 +##旱 +##时 +##旷 +##旺 +##旻 +##昀 +##昂 +##昆 +##昇 +##昉 +##昊 +##昌 +##明 +##昏 +##易 +##昔 +##昕 +##昙 +##星 +##映 +##春 +##昧 +##昨 +##昭 +##是 +##昱 +##昴 +##昵 +##昶 +##昼 +##显 +##晁 +##時 +##晃 +##晉 +##晋 +##晌 +##晏 +##晒 +##晓 +##晔 +##晕 +##晖 +##晗 +##晚 +##晝 +##晞 +##晟 +##晤 +##晦 +##晨 +##晩 +##普 +##景 +##晰 +##晴 +##晶 +##晷 +##智 +##晾 +##暂 +##暄 +##暇 +##暈 +##暉 +##暌 +##暐 +##暑 +##暖 +##暗 +##暝 +##暢 +##暧 +##暨 +##暫 +##暮 +##暱 +##暴 +##暸 +##暹 +##曄 +##曆 +##曇 +##曉 +##曖 +##曙 +##曜 +##曝 +##曠 +##曦 +##曬 +##曰 
+##曲 +##曳 +##更 +##書 +##曹 +##曼 +##曾 +##替 +##最 +##會 +##月 +##有 +##朋 +##服 +##朐 +##朔 +##朕 +##朗 +##望 +##朝 +##期 +##朦 +##朧 +##木 +##未 +##末 +##本 +##札 +##朮 +##术 +##朱 +##朴 +##朵 +##机 +##朽 +##杀 +##杂 +##权 +##杆 +##杈 +##杉 +##李 +##杏 +##材 +##村 +##杓 +##杖 +##杜 +##杞 +##束 +##杠 +##条 +##来 +##杨 +##杭 +##杯 +##杰 +##東 +##杳 +##杵 +##杷 +##杼 +##松 +##板 +##极 +##构 +##枇 +##枉 +##枋 +##析 +##枕 +##林 +##枚 +##果 +##枝 +##枢 +##枣 +##枪 +##枫 +##枭 +##枯 +##枰 +##枱 +##枳 +##架 +##枷 +##枸 +##柄 +##柏 +##某 +##柑 +##柒 +##染 +##柔 +##柘 +##柚 +##柜 +##柞 +##柠 +##柢 +##查 +##柩 +##柬 +##柯 +##柱 +##柳 +##柴 +##柵 +##査 +##柿 +##栀 +##栃 +##栄 +##栅 +##标 +##栈 +##栉 +##栋 +##栎 +##栏 +##树 +##栓 +##栖 +##栗 +##校 +##栩 +##株 +##样 +##核 +##根 +##格 +##栽 +##栾 +##桀 +##桁 +##桂 +##桃 +##桅 +##框 +##案 +##桉 +##桌 +##桎 +##桐 +##桑 +##桓 +##桔 +##桜 +##桠 +##桡 +##桢 +##档 +##桥 +##桦 +##桧 +##桨 +##桩 +##桶 +##桿 +##梁 +##梅 +##梆 +##梏 +##梓 +##梗 +##條 +##梟 +##梢 +##梦 +##梧 +##梨 +##梭 +##梯 +##械 +##梳 +##梵 +##梶 +##检 +##棂 +##棄 +##棉 +##棋 +##棍 +##棒 +##棕 +##棗 +##棘 +##棚 +##棟 +##棠 +##棣 +##棧 +##森 +##棱 +##棲 +##棵 +##棹 +##棺 +##椁 +##椅 +##椋 +##植 +##椎 +##椒 +##検 +##椪 +##椭 +##椰 +##椹 +##椽 +##椿 +##楂 +##楊 +##楓 +##楔 +##楚 +##楝 +##楞 +##楠 +##楣 +##楨 +##楫 +##業 +##楮 +##極 +##楷 +##楸 +##楹 +##楼 +##楽 +##概 +##榄 +##榆 +##榈 +##榉 +##榔 +##榕 +##榖 +##榛 +##榜 +##榨 +##榫 +##榭 +##榮 +##榱 +##榴 +##榷 +##榻 +##槁 +##槃 +##構 +##槌 +##槍 +##槎 +##槐 +##槓 +##様 +##槛 +##槟 +##槤 +##槭 +##槲 +##槳 +##槻 +##槽 +##槿 +##樁 +##樂 +##樊 +##樑 +##樓 +##標 +##樞 +##樟 +##模 +##樣 +##権 +##横 +##樫 +##樯 +##樱 +##樵 +##樸 +##樹 +##樺 +##樽 +##樾 +##橄 +##橇 +##橋 +##橐 +##橘 +##橙 +##機 +##橡 +##橢 +##橫 +##橱 +##橹 +##橼 +##檀 +##檄 +##檎 +##檐 +##檔 +##檗 +##檜 +##檢 +##檬 +##檯 +##檳 +##檸 +##檻 +##櫃 +##櫚 +##櫛 +##櫥 +##櫸 +##櫻 +##欄 +##權 +##欒 +##欖 +##欠 +##次 +##欢 +##欣 +##欧 +##欲 +##欸 +##欺 +##欽 +##款 +##歆 +##歇 +##歉 +##歌 +##歎 +##歐 +##歓 +##歙 +##歛 +##歡 +##止 +##正 +##此 +##步 +##武 +##歧 +##歩 +##歪 +##歯 +##歲 +##歳 +##歴 +##歷 +##歸 +##歹 +##死 +##歼 +##殁 +##殃 +##殆 +##殇 +##殉 +##殊 +##残 +##殒 +##殓 +##殖 +##殘 +##殞 +##殡 +##殤 +##殭 +##殯 +##殲 +##殴 +##段 +##殷 +##殺 +##殼 +##殿 +##毀 +##毁 +##毂 +##毅 +##毆 +##毋 +##母 +##毎 +##每 +##毒 +##毓 +##比 +##毕 +##毗 +##毘 +##毙 +##毛 +##毡 +##毫 +##毯 +##毽 +##氈 +##氏 +##氐 +##民 +##氓 +##气 +##氖 +##気 +##氙 +##氛 +##氟 +##氡 +##氢 +##氣 +##氤 +##氦 +##氧 +##氨 +##氪 +##氫 +##氮 +##氯 +##氰 +##氲 +##水 +##氷 +##永 +##氹 +##氾 +##汀 +##汁 +##求 +##汆 +##汇 +##汉 +##汎 +##汐 +##汕 +##汗 +##汙 +##汛 +##汝 +##汞 +##江 +##池 +##污 +##汤 +##汨 +##汩 +##汪 +##汰 +##汲 +##汴 +##汶 +##汹 +##決 +##汽 +##汾 +##沁 +##沂 +##沃 +##沅 +##沈 +##沉 +##沌 +##沏 +##沐 +##沒 +##沓 +##沖 +##沙 +##沛 +##沟 +##没 +##沢 +##沣 +##沥 +##沦 +##沧 +##沪 +##沫 +##沭 +##沮 +##沱 +##河 +##沸 +##油 +##治 +##沼 +##沽 +##沾 +##沿 +##況 +##泄 +##泉 +##泊 +##泌 +##泓 +##法 +##泗 +##泛 +##泞 +##泠 +##泡 +##波 +##泣 +##泥 +##注 +##泪 +##泫 +##泮 +##泯 +##泰 +##泱 +##泳 +##泵 +##泷 +##泸 +##泻 +##泼 +##泽 +##泾 +##洁 +##洄 +##洋 +##洒 +##洗 +##洙 +##洛 +##洞 +##津 +##洩 +##洪 +##洮 +##洱 +##洲 +##洵 +##洶 +##洸 +##洹 +##活 +##洼 +##洽 +##派 +##流 +##浃 +##浄 +##浅 +##浆 +##浇 +##浊 +##测 +##济 +##浏 +##浑 +##浒 +##浓 +##浔 +##浙 +##浚 +##浜 +##浣 +##浦 +##浩 +##浪 +##浬 +##浮 +##浯 +##浴 +##海 +##浸 +##涂 +##涅 +##涇 +##消 +##涉 +##涌 +##涎 +##涓 +##涔 +##涕 +##涙 +##涛 +##涝 +##涞 +##涟 +##涠 +##涡 +##涣 +##涤 +##润 +##涧 +##涨 +##涩 +##涪 +##涮 +##涯 +##液 +##涵 +##涸 +##涼 +##涿 +##淀 +##淄 +##淅 +##淆 +##淇 +##淋 +##淌 +##淑 +##淒 +##淖 +##淘 +##淙 +##淚 +##淞 +##淡 +##淤 +##淦 +##淨 +##淩 +##淪 +##淫 +##淬 +##淮 +##深 +##淳 +##淵 +##混 +##淹 +##淺 +##添 +##淼 +##清 +##済 +##渉 +##渊 +##渋 +##渍 +##渎 +##渐 +##渔 +##渗 +##渙 +##渚 +##減 +##渝 +##渠 +##渡 +##渣 +##渤 +##渥 +##渦 +##温 +##測 +##渭 +##港 +##渲 +##渴 +##游 +##渺 +##渾 +##湃 +##湄 +##湊 +##湍 +##湖 +##湘 +##湛 +##湟 +##湧 +##湫 +##湮 +##湯 +##湳 +##湾 +##湿 +##満 +##溃 +##溅 +##溉 +##溏 +##源 +##準 +##溜 +##溝 +##溟 +##溢 +##溥 +##溧 +##溪 +##溫 +##溯 +##溱 +##溴 +##溶 +##溺 +##溼 +##滁 +##滂 +##滄 +##滅 +##滇 +##滋 +##滌 +##滑 +##滓 
+##滔 +##滕 +##滙 +##滚 +##滝 +##滞 +##滟 +##满 +##滢 +##滤 +##滥 +##滦 +##滨 +##滩 +##滬 +##滯 +##滲 +##滴 +##滷 +##滸 +##滾 +##滿 +##漁 +##漂 +##漆 +##漉 +##漏 +##漓 +##演 +##漕 +##漠 +##漢 +##漣 +##漩 +##漪 +##漫 +##漬 +##漯 +##漱 +##漲 +##漳 +##漸 +##漾 +##漿 +##潆 +##潇 +##潋 +##潍 +##潑 +##潔 +##潘 +##潛 +##潜 +##潞 +##潟 +##潢 +##潤 +##潦 +##潧 +##潭 +##潮 +##潰 +##潴 +##潸 +##潺 +##潼 +##澀 +##澄 +##澆 +##澈 +##澍 +##澎 +##澗 +##澜 +##澡 +##澤 +##澧 +##澱 +##澳 +##澹 +##激 +##濁 +##濂 +##濃 +##濑 +##濒 +##濕 +##濘 +##濛 +##濟 +##濠 +##濡 +##濤 +##濫 +##濬 +##濮 +##濯 +##濱 +##濺 +##濾 +##瀅 +##瀆 +##瀉 +##瀋 +##瀏 +##瀑 +##瀕 +##瀘 +##瀚 +##瀛 +##瀝 +##瀞 +##瀟 +##瀧 +##瀨 +##瀬 +##瀰 +##瀾 +##灌 +##灏 +##灑 +##灘 +##灝 +##灞 +##灣 +##火 +##灬 +##灭 +##灯 +##灰 +##灵 +##灶 +##灸 +##灼 +##災 +##灾 +##灿 +##炀 +##炁 +##炅 +##炉 +##炊 +##炎 +##炒 +##炔 +##炕 +##炖 +##炙 +##炜 +##炫 +##炬 +##炭 +##炮 +##炯 +##炳 +##炷 +##炸 +##点 +##為 +##炼 +##炽 +##烁 +##烂 +##烃 +##烈 +##烊 +##烏 +##烘 +##烙 +##烛 +##烟 +##烤 +##烦 +##烧 +##烨 +##烩 +##烫 +##烬 +##热 +##烯 +##烷 +##烹 +##烽 +##焉 +##焊 +##焕 +##焖 +##焗 +##焘 +##焙 +##焚 +##焜 +##無 +##焦 +##焯 +##焰 +##焱 +##然 +##焼 +##煅 +##煉 +##煊 +##煌 +##煎 +##煒 +##煖 +##煙 +##煜 +##煞 +##煤 +##煥 +##煦 +##照 +##煨 +##煩 +##煮 +##煲 +##煸 +##煽 +##熄 +##熊 +##熏 +##熒 +##熔 +##熙 +##熟 +##熠 +##熨 +##熬 +##熱 +##熵 +##熹 +##熾 +##燁 +##燃 +##燄 +##燈 +##燉 +##燊 +##燎 +##燒 +##燔 +##燕 +##燙 +##燜 +##營 +##燥 +##燦 +##燧 +##燭 +##燮 +##燴 +##燻 +##燼 +##燿 +##爆 +##爍 +##爐 +##爛 +##爪 +##爬 +##爭 +##爰 +##爱 +##爲 +##爵 +##父 +##爷 +##爸 +##爹 +##爺 +##爻 +##爽 +##爾 +##牆 +##片 +##版 +##牌 +##牍 +##牒 +##牙 +##牛 +##牝 +##牟 +##牠 +##牡 +##牢 +##牦 +##牧 +##物 +##牯 +##牲 +##牴 +##牵 +##特 +##牺 +##牽 +##犀 +##犁 +##犄 +##犊 +##犍 +##犒 +##犢 +##犧 +##犬 +##犯 +##状 +##犷 +##犸 +##犹 +##狀 +##狂 +##狄 +##狈 +##狎 +##狐 +##狒 +##狗 +##狙 +##狞 +##狠 +##狡 +##狩 +##独 +##狭 +##狮 +##狰 +##狱 +##狸 +##狹 +##狼 +##狽 +##猎 +##猕 +##猖 +##猗 +##猙 +##猛 +##猜 +##猝 +##猥 +##猩 +##猪 +##猫 +##猬 +##献 +##猴 +##猶 +##猷 +##猾 +##猿 +##獄 +##獅 +##獎 +##獐 +##獒 +##獗 +##獠 +##獣 +##獨 +##獭 +##獰 +##獲 +##獵 +##獷 +##獸 +##獺 +##獻 +##獼 +##獾 +##玄 +##率 +##玉 +##王 +##玑 +##玖 +##玛 +##玟 +##玠 +##玥 +##玩 +##玫 +##玮 +##环 +##现 +##玲 +##玳 +##玷 +##玺 +##玻 +##珀 +##珂 +##珅 +##珈 +##珉 +##珊 +##珍 +##珏 +##珐 +##珑 +##珙 +##珞 +##珠 +##珣 +##珥 +##珩 +##珪 +##班 +##珮 +##珲 +##珺 +##現 +##球 +##琅 +##理 +##琇 +##琉 +##琊 +##琍 +##琏 +##琐 +##琛 +##琢 +##琥 +##琦 +##琨 +##琪 +##琬 +##琮 +##琰 +##琲 +##琳 +##琴 +##琵 +##琶 +##琺 +##琼 +##瑀 +##瑁 +##瑄 +##瑋 +##瑕 +##瑗 +##瑙 +##瑚 +##瑛 +##瑜 +##瑞 +##瑟 +##瑠 +##瑣 +##瑤 +##瑩 +##瑪 +##瑯 +##瑰 +##瑶 +##瑾 +##璀 +##璁 +##璃 +##璇 +##璉 +##璋 +##璎 +##璐 +##璜 +##璞 +##璟 +##璧 +##璨 +##環 +##璽 +##璿 +##瓊 +##瓏 +##瓒 +##瓜 +##瓢 +##瓣 +##瓤 +##瓦 +##瓮 +##瓯 +##瓴 +##瓶 +##瓷 +##甄 +##甌 +##甕 +##甘 +##甙 +##甚 +##甜 +##生 +##產 +##産 +##甥 +##甦 +##用 +##甩 +##甫 +##甬 +##甭 +##甯 +##田 +##由 +##甲 +##申 +##电 +##男 +##甸 +##町 +##画 +##甾 +##畀 +##畅 +##界 +##畏 +##畑 +##畔 +##留 +##畜 +##畝 +##畢 +##略 +##畦 +##番 +##畫 +##異 +##畲 +##畳 +##畴 +##當 +##畸 +##畹 +##畿 +##疆 +##疇 +##疊 +##疏 +##疑 +##疔 +##疖 +##疗 +##疙 +##疚 +##疝 +##疟 +##疡 +##疣 +##疤 +##疥 +##疫 +##疮 +##疯 +##疱 +##疲 +##疳 +##疵 +##疸 +##疹 +##疼 +##疽 +##疾 +##痂 +##病 +##症 +##痈 +##痉 +##痊 +##痍 +##痒 +##痔 +##痕 +##痘 +##痙 +##痛 +##痞 +##痠 +##痢 +##痣 +##痤 +##痧 +##痨 +##痪 +##痫 +##痰 +##痱 +##痴 +##痹 +##痺 +##痼 +##痿 +##瘀 +##瘁 +##瘋 +##瘍 +##瘓 +##瘘 +##瘙 +##瘟 +##瘠 +##瘡 +##瘢 +##瘤 +##瘦 +##瘧 +##瘩 +##瘪 +##瘫 +##瘴 +##瘸 +##瘾 +##療 +##癇 +##癌 +##癒 +##癖 +##癜 +##癞 +##癡 +##癢 +##癣 +##癥 +##癫 +##癬 +##癮 +##癱 +##癲 +##癸 +##発 +##登 +##發 +##白 +##百 +##皂 +##的 +##皆 +##皇 +##皈 +##皋 +##皎 +##皑 +##皓 +##皖 +##皙 +##皚 +##皮 +##皰 +##皱 +##皴 +##皺 +##皿 +##盂 +##盃 +##盅 +##盆 +##盈 +##益 +##盎 +##盏 +##盐 +##监 +##盒 +##盔 +##盖 +##盗 +##盘 +##盛 +##盜 +##盞 +##盟 +##盡 +##監 +##盤 +##盥 +##盧 +##盪 +##目 +##盯 +##盱 +##盲 +##直 +##相 +##盹 +##盼 +##盾 +##省 +##眈 +##眉 +##看 +##県 +##眙 +##眞 +##真 +##眠 +##眦 +##眨 +##眩 +##眯 +##眶 +##眷 +##眸 +##眺 +##眼 +##眾 +##着 +##睁 +##睇 
+##睏 +##睐 +##睑 +##睛 +##睜 +##睞 +##睡 +##睢 +##督 +##睥 +##睦 +##睨 +##睪 +##睫 +##睬 +##睹 +##睽 +##睾 +##睿 +##瞄 +##瞅 +##瞇 +##瞋 +##瞌 +##瞎 +##瞑 +##瞒 +##瞓 +##瞞 +##瞟 +##瞠 +##瞥 +##瞧 +##瞩 +##瞪 +##瞬 +##瞭 +##瞰 +##瞳 +##瞻 +##瞼 +##瞿 +##矇 +##矍 +##矗 +##矚 +##矛 +##矜 +##矢 +##矣 +##知 +##矩 +##矫 +##短 +##矮 +##矯 +##石 +##矶 +##矽 +##矾 +##矿 +##码 +##砂 +##砌 +##砍 +##砒 +##研 +##砖 +##砗 +##砚 +##砝 +##砣 +##砥 +##砧 +##砭 +##砰 +##砲 +##破 +##砷 +##砸 +##砺 +##砼 +##砾 +##础 +##硅 +##硐 +##硒 +##硕 +##硝 +##硫 +##硬 +##确 +##硯 +##硼 +##碁 +##碇 +##碉 +##碌 +##碍 +##碎 +##碑 +##碓 +##碗 +##碘 +##碚 +##碛 +##碟 +##碣 +##碧 +##碩 +##碰 +##碱 +##碳 +##碴 +##確 +##碼 +##碾 +##磁 +##磅 +##磊 +##磋 +##磐 +##磕 +##磚 +##磡 +##磨 +##磬 +##磯 +##磲 +##磷 +##磺 +##礁 +##礎 +##礙 +##礡 +##礦 +##礪 +##礫 +##礴 +##示 +##礼 +##社 +##祀 +##祁 +##祂 +##祇 +##祈 +##祉 +##祎 +##祐 +##祕 +##祖 +##祗 +##祚 +##祛 +##祜 +##祝 +##神 +##祟 +##祠 +##祢 +##祥 +##票 +##祭 +##祯 +##祷 +##祸 +##祺 +##祿 +##禀 +##禁 +##禄 +##禅 +##禍 +##禎 +##福 +##禛 +##禦 +##禧 +##禪 +##禮 +##禱 +##禹 +##禺 +##离 +##禽 +##禾 +##禿 +##秀 +##私 +##秃 +##秆 +##秉 +##秋 +##种 +##科 +##秒 +##秘 +##租 +##秣 +##秤 +##秦 +##秧 +##秩 +##秭 +##积 +##称 +##秸 +##移 +##秽 +##稀 +##稅 +##程 +##稍 +##税 +##稔 +##稗 +##稚 +##稜 +##稞 +##稟 +##稠 +##稣 +##種 +##稱 +##稲 +##稳 +##稷 +##稹 +##稻 +##稼 +##稽 +##稿 +##穀 +##穂 +##穆 +##穌 +##積 +##穎 +##穗 +##穢 +##穩 +##穫 +##穴 +##究 +##穷 +##穹 +##空 +##穿 +##突 +##窃 +##窄 +##窈 +##窍 +##窑 +##窒 +##窓 +##窕 +##窖 +##窗 +##窘 +##窜 +##窝 +##窟 +##窠 +##窥 +##窦 +##窨 +##窩 +##窪 +##窮 +##窯 +##窺 +##窿 +##竄 +##竅 +##竇 +##竊 +##立 +##竖 +##站 +##竜 +##竞 +##竟 +##章 +##竣 +##童 +##竭 +##端 +##競 +##竹 +##竺 +##竽 +##竿 +##笃 +##笆 +##笈 +##笋 +##笏 +##笑 +##笔 +##笙 +##笛 +##笞 +##笠 +##符 +##笨 +##第 +##笹 +##笺 +##笼 +##筆 +##等 +##筊 +##筋 +##筍 +##筏 +##筐 +##筑 +##筒 +##答 +##策 +##筛 +##筝 +##筠 +##筱 +##筲 +##筵 +##筷 +##筹 +##签 +##简 +##箇 +##箋 +##箍 +##箏 +##箐 +##箔 +##箕 +##算 +##箝 +##管 +##箩 +##箫 +##箭 +##箱 +##箴 +##箸 +##節 +##篁 +##範 +##篆 +##篇 +##築 +##篑 +##篓 +##篙 +##篝 +##篠 +##篡 +##篤 +##篩 +##篪 +##篮 +##篱 +##篷 +##簇 +##簌 +##簍 +##簡 +##簦 +##簧 +##簪 +##簫 +##簷 +##簸 +##簽 +##簾 +##簿 +##籁 +##籃 +##籌 +##籍 +##籐 +##籟 +##籠 +##籤 +##籬 +##籮 +##籲 +##米 +##类 +##籼 +##籽 +##粄 +##粉 +##粑 +##粒 +##粕 +##粗 +##粘 +##粟 +##粤 +##粥 +##粧 +##粪 +##粮 +##粱 +##粲 +##粳 +##粵 +##粹 +##粼 +##粽 +##精 +##粿 +##糅 +##糊 +##糍 +##糕 +##糖 +##糗 +##糙 +##糜 +##糞 +##糟 +##糠 +##糧 +##糬 +##糯 +##糰 +##糸 +##系 +##糾 +##紀 +##紂 +##約 +##紅 +##紉 +##紊 +##紋 +##納 +##紐 +##紓 +##純 +##紗 +##紘 +##紙 +##級 +##紛 +##紜 +##素 +##紡 +##索 +##紧 +##紫 +##紮 +##累 +##細 +##紳 +##紹 +##紺 +##終 +##絃 +##組 +##絆 +##経 +##結 +##絕 +##絞 +##絡 +##絢 +##給 +##絨 +##絮 +##統 +##絲 +##絳 +##絵 +##絶 +##絹 +##綁 +##綏 +##綑 +##經 +##継 +##続 +##綜 +##綠 +##綢 +##綦 +##綫 +##綬 +##維 +##綱 +##網 +##綴 +##綵 +##綸 +##綺 +##綻 +##綽 +##綾 +##綿 +##緊 +##緋 +##総 +##緑 +##緒 +##緘 +##線 +##緝 +##緞 +##締 +##緣 +##編 +##緩 +##緬 +##緯 +##練 +##緹 +##緻 +##縁 +##縄 +##縈 +##縛 +##縝 +##縣 +##縫 +##縮 +##縱 +##縴 +##縷 +##總 +##績 +##繁 +##繃 +##繆 +##繇 +##繋 +##織 +##繕 +##繚 +##繞 +##繡 +##繩 +##繪 +##繫 +##繭 +##繳 +##繹 +##繼 +##繽 +##纂 +##續 +##纍 +##纏 +##纓 +##纔 +##纖 +##纜 +##纠 +##红 +##纣 +##纤 +##约 +##级 +##纨 +##纪 +##纫 +##纬 +##纭 +##纯 +##纰 +##纱 +##纲 +##纳 +##纵 +##纶 +##纷 +##纸 +##纹 +##纺 +##纽 +##纾 +##线 +##绀 +##练 +##组 +##绅 +##细 +##织 +##终 +##绊 +##绍 +##绎 +##经 +##绑 +##绒 +##结 +##绔 +##绕 +##绘 +##给 +##绚 +##绛 +##络 +##绝 +##绞 +##统 +##绡 +##绢 +##绣 +##绥 +##绦 +##继 +##绩 +##绪 +##绫 +##续 +##绮 +##绯 +##绰 +##绳 +##维 +##绵 +##绶 +##绷 +##绸 +##绻 +##综 +##绽 +##绾 +##绿 +##缀 +##缄 +##缅 +##缆 +##缇 +##缈 +##缉 +##缎 +##缓 +##缔 +##缕 +##编 +##缘 +##缙 +##缚 +##缜 +##缝 +##缠 +##缢 +##缤 +##缥 +##缨 +##缩 +##缪 +##缭 +##缮 +##缰 +##缱 +##缴 +##缸 +##缺 +##缽 +##罂 +##罄 +##罌 +##罐 +##网 +##罔 +##罕 +##罗 +##罚 +##罡 +##罢 +##罩 +##罪 +##置 +##罰 +##署 +##罵 +##罷 +##罹 +##羁 +##羅 +##羈 +##羊 +##羌 +##美 +##羔 +##羚 +##羞 +##羟 +##羡 +##羣 +##群 +##羥 +##羧 +##羨 +##義 +##羯 +##羲 +##羸 +##羹 +##羽 +##羿 +##翁 +##翅 +##翊 
+##翌 +##翎 +##習 +##翔 +##翘 +##翟 +##翠 +##翡 +##翦 +##翩 +##翰 +##翱 +##翳 +##翹 +##翻 +##翼 +##耀 +##老 +##考 +##耄 +##者 +##耆 +##耋 +##而 +##耍 +##耐 +##耒 +##耕 +##耗 +##耘 +##耙 +##耦 +##耨 +##耳 +##耶 +##耷 +##耸 +##耻 +##耽 +##耿 +##聂 +##聆 +##聊 +##聋 +##职 +##聒 +##联 +##聖 +##聘 +##聚 +##聞 +##聪 +##聯 +##聰 +##聲 +##聳 +##聴 +##聶 +##職 +##聽 +##聾 +##聿 +##肃 +##肄 +##肅 +##肆 +##肇 +##肉 +##肋 +##肌 +##肏 +##肓 +##肖 +##肘 +##肚 +##肛 +##肝 +##肠 +##股 +##肢 +##肤 +##肥 +##肩 +##肪 +##肮 +##肯 +##肱 +##育 +##肴 +##肺 +##肽 +##肾 +##肿 +##胀 +##胁 +##胃 +##胄 +##胆 +##背 +##胍 +##胎 +##胖 +##胚 +##胛 +##胜 +##胝 +##胞 +##胡 +##胤 +##胥 +##胧 +##胫 +##胭 +##胯 +##胰 +##胱 +##胳 +##胴 +##胶 +##胸 +##胺 +##能 +##脂 +##脅 +##脆 +##脇 +##脈 +##脉 +##脊 +##脍 +##脏 +##脐 +##脑 +##脓 +##脖 +##脘 +##脚 +##脛 +##脣 +##脩 +##脫 +##脯 +##脱 +##脲 +##脳 +##脸 +##脹 +##脾 +##腆 +##腈 +##腊 +##腋 +##腌 +##腎 +##腐 +##腑 +##腓 +##腔 +##腕 +##腥 +##腦 +##腩 +##腫 +##腭 +##腮 +##腰 +##腱 +##腳 +##腴 +##腸 +##腹 +##腺 +##腻 +##腼 +##腾 +##腿 +##膀 +##膈 +##膊 +##膏 +##膑 +##膘 +##膚 +##膛 +##膜 +##膝 +##膠 +##膦 +##膨 +##膩 +##膳 +##膺 +##膻 +##膽 +##膾 +##膿 +##臀 +##臂 +##臃 +##臆 +##臉 +##臊 +##臍 +##臓 +##臘 +##臟 +##臣 +##臥 +##臧 +##臨 +##自 +##臬 +##臭 +##至 +##致 +##臺 +##臻 +##臼 +##臾 +##舀 +##舂 +##舅 +##舆 +##與 +##興 +##舉 +##舊 +##舌 +##舍 +##舎 +##舐 +##舒 +##舔 +##舖 +##舗 +##舛 +##舜 +##舞 +##舟 +##航 +##舫 +##般 +##舰 +##舱 +##舵 +##舶 +##舷 +##舸 +##船 +##舺 +##舾 +##艇 +##艋 +##艘 +##艙 +##艦 +##艮 +##良 +##艰 +##艱 +##色 +##艳 +##艷 +##艹 +##艺 +##艾 +##节 +##芃 +##芈 +##芊 +##芋 +##芍 +##芎 +##芒 +##芙 +##芜 +##芝 +##芡 +##芥 +##芦 +##芩 +##芪 +##芫 +##芬 +##芭 +##芮 +##芯 +##花 +##芳 +##芷 +##芸 +##芹 +##芻 +##芽 +##芾 +##苁 +##苄 +##苇 +##苋 +##苍 +##苏 +##苑 +##苒 +##苓 +##苔 +##苕 +##苗 +##苛 +##苜 +##苞 +##苟 +##苡 +##苣 +##若 +##苦 +##苫 +##苯 +##英 +##苷 +##苹 +##苻 +##茁 +##茂 +##范 +##茄 +##茅 +##茉 +##茎 +##茏 +##茗 +##茜 +##茧 +##茨 +##茫 +##茬 +##茭 +##茯 +##茱 +##茲 +##茴 +##茵 +##茶 +##茸 +##茹 +##茼 +##荀 +##荃 +##荆 +##草 +##荊 +##荏 +##荐 +##荒 +##荔 +##荖 +##荘 +##荚 +##荞 +##荟 +##荠 +##荡 +##荣 +##荤 +##荥 +##荧 +##荨 +##荪 +##荫 +##药 +##荳 +##荷 +##荸 +##荻 +##荼 +##荽 +##莅 +##莆 +##莉 +##莊 +##莎 +##莒 +##莓 +##莖 +##莘 +##莞 +##莠 +##莢 +##莧 +##莪 +##莫 +##莱 +##莲 +##莴 +##获 +##莹 +##莺 +##莽 +##莿 +##菀 +##菁 +##菅 +##菇 +##菈 +##菊 +##菌 +##菏 +##菓 +##菖 +##菘 +##菜 +##菟 +##菠 +##菡 +##菩 +##華 +##菱 +##菲 +##菸 +##菽 +##萁 +##萃 +##萄 +##萊 +##萋 +##萌 +##萍 +##萎 +##萘 +##萝 +##萤 +##营 +##萦 +##萧 +##萨 +##萩 +##萬 +##萱 +##萵 +##萸 +##萼 +##落 +##葆 +##葉 +##著 +##葚 +##葛 +##葡 +##董 +##葦 +##葩 +##葫 +##葬 +##葭 +##葯 +##葱 +##葳 +##葵 +##葷 +##葺 +##蒂 +##蒋 +##蒐 +##蒔 +##蒙 +##蒜 +##蒞 +##蒟 +##蒡 +##蒨 +##蒲 +##蒸 +##蒹 +##蒻 +##蒼 +##蒿 +##蓁 +##蓄 +##蓆 +##蓉 +##蓋 +##蓑 +##蓓 +##蓖 +##蓝 +##蓟 +##蓦 +##蓬 +##蓮 +##蓼 +##蓿 +##蔑 +##蔓 +##蔔 +##蔗 +##蔘 +##蔚 +##蔡 +##蔣 +##蔥 +##蔫 +##蔬 +##蔭 +##蔵 +##蔷 +##蔺 +##蔻 +##蔼 +##蔽 +##蕁 +##蕃 +##蕈 +##蕉 +##蕊 +##蕎 +##蕙 +##蕤 +##蕨 +##蕩 +##蕪 +##蕭 +##蕲 +##蕴 +##蕻 +##蕾 +##薄 +##薅 +##薇 +##薈 +##薊 +##薏 +##薑 +##薔 +##薙 +##薛 +##薦 +##薨 +##薩 +##薪 +##薬 +##薯 +##薰 +##薹 +##藉 +##藍 +##藏 +##藐 +##藓 +##藕 +##藜 +##藝 +##藤 +##藥 +##藩 +##藹 +##藻 +##藿 +##蘆 +##蘇 +##蘊 +##蘋 +##蘑 +##蘚 +##蘭 +##蘸 +##蘼 +##蘿 +##虎 +##虏 +##虐 +##虑 +##虔 +##處 +##虚 +##虛 +##虜 +##虞 +##號 +##虢 +##虧 +##虫 +##虬 +##虱 +##虹 +##虻 +##虽 +##虾 +##蚀 +##蚁 +##蚂 +##蚊 +##蚌 +##蚓 +##蚕 +##蚜 +##蚝 +##蚣 +##蚤 +##蚩 +##蚪 +##蚯 +##蚱 +##蚵 +##蛀 +##蛆 +##蛇 +##蛊 +##蛋 +##蛎 +##蛐 +##蛔 +##蛙 +##蛛 +##蛟 +##蛤 +##蛭 +##蛮 +##蛰 +##蛳 +##蛹 +##蛻 +##蛾 +##蜀 +##蜂 +##蜃 +##蜆 +##蜇 +##蜈 +##蜊 +##蜍 +##蜒 +##蜓 +##蜕 +##蜗 +##蜘 +##蜚 +##蜜 +##蜡 +##蜢 +##蜥 +##蜱 +##蜴 +##蜷 +##蜻 +##蜿 +##蝇 +##蝈 +##蝉 +##蝌 +##蝎 +##蝕 +##蝗 +##蝙 +##蝟 +##蝠 +##蝦 +##蝨 +##蝴 +##蝶 +##蝸 +##蝼 +##螂 +##螃 +##融 +##螞 +##螢 +##螨 +##螯 +##螳 +##螺 +##蟀 +##蟄 +##蟆 +##蟋 +##蟎 +##蟑 +##蟒 +##蟠 +##蟬 +##蟲 +##蟹 +##蟻 +##蟾 +##蠅 +##蠍 +##蠔 +##蠕 +##蠛 +##蠟 +##蠡 +##蠢 +##蠣 +##蠱 +##蠶 +##蠹 +##蠻 +##血 +##衄 +##衅 +##衆 +##行 +##衍 +##術 +##衔 +##街 +##衙 +##衛 +##衝 +##衞 +##衡 +##衢 +##衣 
+##补 +##表 +##衩 +##衫 +##衬 +##衮 +##衰 +##衲 +##衷 +##衹 +##衾 +##衿 +##袁 +##袂 +##袄 +##袅 +##袈 +##袋 +##袍 +##袒 +##袖 +##袜 +##袞 +##袤 +##袪 +##被 +##袭 +##袱 +##裁 +##裂 +##装 +##裆 +##裊 +##裏 +##裔 +##裕 +##裘 +##裙 +##補 +##裝 +##裟 +##裡 +##裤 +##裨 +##裱 +##裳 +##裴 +##裸 +##裹 +##製 +##裾 +##褂 +##複 +##褐 +##褒 +##褓 +##褔 +##褚 +##褥 +##褪 +##褫 +##褲 +##褶 +##褻 +##襁 +##襄 +##襟 +##襠 +##襪 +##襬 +##襯 +##襲 +##西 +##要 +##覃 +##覆 +##覇 +##見 +##規 +##覓 +##視 +##覚 +##覦 +##覧 +##親 +##覬 +##観 +##覷 +##覺 +##覽 +##觀 +##见 +##观 +##规 +##觅 +##视 +##览 +##觉 +##觊 +##觎 +##觐 +##觑 +##角 +##觞 +##解 +##觥 +##触 +##觸 +##言 +##訂 +##計 +##訊 +##討 +##訓 +##訕 +##訖 +##託 +##記 +##訛 +##訝 +##訟 +##訣 +##訥 +##訪 +##設 +##許 +##訳 +##訴 +##訶 +##診 +##註 +##証 +##詆 +##詐 +##詔 +##評 +##詛 +##詞 +##詠 +##詡 +##詢 +##詣 +##試 +##詩 +##詫 +##詬 +##詭 +##詮 +##詰 +##話 +##該 +##詳 +##詹 +##詼 +##誅 +##誇 +##誉 +##誌 +##認 +##誓 +##誕 +##誘 +##語 +##誠 +##誡 +##誣 +##誤 +##誥 +##誦 +##誨 +##說 +##説 +##読 +##誰 +##課 +##誹 +##誼 +##調 +##諄 +##談 +##請 +##諏 +##諒 +##論 +##諗 +##諜 +##諡 +##諦 +##諧 +##諫 +##諭 +##諮 +##諱 +##諳 +##諷 +##諸 +##諺 +##諾 +##謀 +##謁 +##謂 +##謄 +##謊 +##謎 +##謐 +##謔 +##謗 +##謙 +##講 +##謝 +##謠 +##謨 +##謬 +##謹 +##謾 +##譁 +##證 +##譎 +##譏 +##識 +##譙 +##譚 +##譜 +##警 +##譬 +##譯 +##議 +##譲 +##譴 +##護 +##譽 +##讀 +##變 +##讓 +##讚 +##讞 +##计 +##订 +##认 +##讥 +##讧 +##讨 +##让 +##讪 +##讫 +##训 +##议 +##讯 +##记 +##讲 +##讳 +##讴 +##讶 +##讷 +##许 +##讹 +##论 +##讼 +##讽 +##设 +##访 +##诀 +##证 +##诃 +##评 +##诅 +##识 +##诈 +##诉 +##诊 +##诋 +##词 +##诏 +##译 +##试 +##诗 +##诘 +##诙 +##诚 +##诛 +##话 +##诞 +##诟 +##诠 +##诡 +##询 +##诣 +##诤 +##该 +##详 +##诧 +##诩 +##诫 +##诬 +##语 +##误 +##诰 +##诱 +##诲 +##说 +##诵 +##诶 +##请 +##诸 +##诺 +##读 +##诽 +##课 +##诿 +##谀 +##谁 +##调 +##谄 +##谅 +##谆 +##谈 +##谊 +##谋 +##谌 +##谍 +##谎 +##谏 +##谐 +##谑 +##谒 +##谓 +##谔 +##谕 +##谗 +##谘 +##谙 +##谚 +##谛 +##谜 +##谟 +##谢 +##谣 +##谤 +##谥 +##谦 +##谧 +##谨 +##谩 +##谪 +##谬 +##谭 +##谯 +##谱 +##谲 +##谴 +##谶 +##谷 +##豁 +##豆 +##豇 +##豈 +##豉 +##豊 +##豌 +##豎 +##豐 +##豔 +##豚 +##象 +##豢 +##豪 +##豫 +##豬 +##豹 +##豺 +##貂 +##貅 +##貌 +##貓 +##貔 +##貘 +##貝 +##貞 +##負 +##財 +##貢 +##貧 +##貨 +##販 +##貪 +##貫 +##責 +##貯 +##貰 +##貳 +##貴 +##貶 +##買 +##貸 +##費 +##貼 +##貽 +##貿 +##賀 +##賁 +##賂 +##賃 +##賄 +##資 +##賈 +##賊 +##賑 +##賓 +##賜 +##賞 +##賠 +##賡 +##賢 +##賣 +##賤 +##賦 +##質 +##賬 +##賭 +##賴 +##賺 +##購 +##賽 +##贅 +##贈 +##贊 +##贍 +##贏 +##贓 +##贖 +##贛 +##贝 +##贞 +##负 +##贡 +##财 +##责 +##贤 +##败 +##账 +##货 +##质 +##贩 +##贪 +##贫 +##贬 +##购 +##贮 +##贯 +##贰 +##贱 +##贲 +##贴 +##贵 +##贷 +##贸 +##费 +##贺 +##贻 +##贼 +##贾 +##贿 +##赁 +##赂 +##赃 +##资 +##赅 +##赈 +##赊 +##赋 +##赌 +##赎 +##赏 +##赐 +##赓 +##赔 +##赖 +##赘 +##赚 +##赛 +##赝 +##赞 +##赠 +##赡 +##赢 +##赣 +##赤 +##赦 +##赧 +##赫 +##赭 +##走 +##赳 +##赴 +##赵 +##赶 +##起 +##趁 +##超 +##越 +##趋 +##趕 +##趙 +##趟 +##趣 +##趨 +##足 +##趴 +##趵 +##趸 +##趺 +##趾 +##跃 +##跄 +##跆 +##跋 +##跌 +##跎 +##跑 +##跖 +##跚 +##跛 +##距 +##跟 +##跡 +##跤 +##跨 +##跩 +##跪 +##路 +##跳 +##践 +##跷 +##跹 +##跺 +##跻 +##踉 +##踊 +##踌 +##踏 +##踐 +##踝 +##踞 +##踟 +##踢 +##踩 +##踪 +##踮 +##踱 +##踴 +##踵 +##踹 +##蹂 +##蹄 +##蹇 +##蹈 +##蹉 +##蹊 +##蹋 +##蹑 +##蹒 +##蹙 +##蹟 +##蹣 +##蹤 +##蹦 +##蹩 +##蹬 +##蹭 +##蹲 +##蹴 +##蹶 +##蹺 +##蹼 +##蹿 +##躁 +##躇 +##躉 +##躊 +##躋 +##躍 +##躏 +##躪 +##身 +##躬 +##躯 +##躲 +##躺 +##軀 +##車 +##軋 +##軌 +##軍 +##軒 +##軟 +##転 +##軸 +##軼 +##軽 +##軾 +##較 +##載 +##輒 +##輓 +##輔 +##輕 +##輛 +##輝 +##輟 +##輩 +##輪 +##輯 +##輸 +##輻 +##輾 +##輿 +##轄 +##轅 +##轆 +##轉 +##轍 +##轎 +##轟 +##车 +##轧 +##轨 +##轩 +##转 +##轭 +##轮 +##软 +##轰 +##轲 +##轴 +##轶 +##轻 +##轼 +##载 +##轿 +##较 +##辄 +##辅 +##辆 +##辇 +##辈 +##辉 +##辊 +##辍 +##辐 +##辑 +##输 +##辕 +##辖 +##辗 +##辘 +##辙 +##辛 +##辜 +##辞 +##辟 +##辣 +##辦 +##辨 +##辩 +##辫 +##辭 +##辮 +##辯 +##辰 +##辱 +##農 +##边 +##辺 +##辻 +##込 +##辽 +##达 +##迁 +##迂 +##迄 +##迅 +##过 +##迈 +##迎 +##运 +##近 +##返 +##还 +##这 +##进 +##远 +##违 +##连 +##迟 +##迢 +##迤 +##迥 +##迦 +##迩 +##迪 +##迫 +##迭 +##述 +##迴 +##迷 +##迸 +##迹 +##迺 +##追 +##退 +##送 
+##适 +##逃 +##逅 +##逆 +##选 +##逊 +##逍 +##透 +##逐 +##递 +##途 +##逕 +##逗 +##這 +##通 +##逛 +##逝 +##逞 +##速 +##造 +##逢 +##連 +##逮 +##週 +##進 +##逵 +##逶 +##逸 +##逻 +##逼 +##逾 +##遁 +##遂 +##遅 +##遇 +##遊 +##運 +##遍 +##過 +##遏 +##遐 +##遑 +##遒 +##道 +##達 +##違 +##遗 +##遙 +##遛 +##遜 +##遞 +##遠 +##遢 +##遣 +##遥 +##遨 +##適 +##遭 +##遮 +##遲 +##遴 +##遵 +##遶 +##遷 +##選 +##遺 +##遼 +##遽 +##避 +##邀 +##邁 +##邂 +##邃 +##還 +##邇 +##邈 +##邊 +##邋 +##邏 +##邑 +##邓 +##邕 +##邛 +##邝 +##邢 +##那 +##邦 +##邨 +##邪 +##邬 +##邮 +##邯 +##邰 +##邱 +##邳 +##邵 +##邸 +##邹 +##邺 +##邻 +##郁 +##郅 +##郊 +##郎 +##郑 +##郜 +##郝 +##郡 +##郢 +##郤 +##郦 +##郧 +##部 +##郫 +##郭 +##郴 +##郵 +##郷 +##郸 +##都 +##鄂 +##鄉 +##鄒 +##鄔 +##鄙 +##鄞 +##鄢 +##鄧 +##鄭 +##鄰 +##鄱 +##鄲 +##鄺 +##酉 +##酊 +##酋 +##酌 +##配 +##酐 +##酒 +##酗 +##酚 +##酝 +##酢 +##酣 +##酥 +##酩 +##酪 +##酬 +##酮 +##酯 +##酰 +##酱 +##酵 +##酶 +##酷 +##酸 +##酿 +##醃 +##醇 +##醉 +##醋 +##醍 +##醐 +##醒 +##醚 +##醛 +##醜 +##醞 +##醣 +##醪 +##醫 +##醬 +##醮 +##醯 +##醴 +##醺 +##釀 +##釁 +##采 +##釉 +##释 +##釋 +##里 +##重 +##野 +##量 +##釐 +##金 +##釗 +##釘 +##釜 +##針 +##釣 +##釦 +##釧 +##釵 +##鈀 +##鈉 +##鈍 +##鈎 +##鈔 +##鈕 +##鈞 +##鈣 +##鈦 +##鈪 +##鈴 +##鈺 +##鈾 +##鉀 +##鉄 +##鉅 +##鉉 +##鉑 +##鉗 +##鉚 +##鉛 +##鉤 +##鉴 +##鉻 +##銀 +##銃 +##銅 +##銑 +##銓 +##銖 +##銘 +##銜 +##銬 +##銭 +##銮 +##銳 +##銷 +##銹 +##鋁 +##鋅 +##鋒 +##鋤 +##鋪 +##鋰 +##鋸 +##鋼 +##錄 +##錐 +##錘 +##錚 +##錠 +##錢 +##錦 +##錨 +##錫 +##錮 +##錯 +##録 +##錳 +##錶 +##鍊 +##鍋 +##鍍 +##鍛 +##鍥 +##鍰 +##鍵 +##鍺 +##鍾 +##鎂 +##鎊 +##鎌 +##鎏 +##鎔 +##鎖 +##鎗 +##鎚 +##鎧 +##鎬 +##鎮 +##鎳 +##鏈 +##鏖 +##鏗 +##鏘 +##鏞 +##鏟 +##鏡 +##鏢 +##鏤 +##鏽 +##鐘 +##鐮 +##鐲 +##鐳 +##鐵 +##鐸 +##鐺 +##鑄 +##鑊 +##鑑 +##鑒 +##鑣 +##鑫 +##鑰 +##鑲 +##鑼 +##鑽 +##鑾 +##鑿 +##针 +##钉 +##钊 +##钎 +##钏 +##钒 +##钓 +##钗 +##钙 +##钛 +##钜 +##钝 +##钞 +##钟 +##钠 +##钡 +##钢 +##钣 +##钤 +##钥 +##钦 +##钧 +##钨 +##钩 +##钮 +##钯 +##钰 +##钱 +##钳 +##钴 +##钵 +##钺 +##钻 +##钼 +##钾 +##钿 +##铀 +##铁 +##铂 +##铃 +##铄 +##铅 +##铆 +##铉 +##铎 +##铐 +##铛 +##铜 +##铝 +##铠 +##铡 +##铢 +##铣 +##铤 +##铨 +##铩 +##铬 +##铭 +##铮 +##铰 +##铲 +##铵 +##银 +##铸 +##铺 +##链 +##铿 +##销 +##锁 +##锂 +##锄 +##锅 +##锆 +##锈 +##锉 +##锋 +##锌 +##锏 +##锐 +##锑 +##错 +##锚 +##锟 +##锡 +##锢 +##锣 +##锤 +##锥 +##锦 +##锭 +##键 +##锯 +##锰 +##锲 +##锵 +##锹 +##锺 +##锻 +##镀 +##镁 +##镂 +##镇 +##镉 +##镌 +##镍 +##镐 +##镑 +##镕 +##镖 +##镗 +##镛 +##镜 +##镣 +##镭 +##镯 +##镰 +##镳 +##镶 +##長 +##长 +##門 +##閃 +##閉 +##開 +##閎 +##閏 +##閑 +##閒 +##間 +##閔 +##閘 +##閡 +##関 +##閣 +##閥 +##閨 +##閩 +##閱 +##閲 +##閹 +##閻 +##閾 +##闆 +##闇 +##闊 +##闌 +##闍 +##闔 +##闕 +##闖 +##闘 +##關 +##闡 +##闢 +##门 +##闪 +##闫 +##闭 +##问 +##闯 +##闰 +##闲 +##间 +##闵 +##闷 +##闸 +##闹 +##闺 +##闻 +##闽 +##闾 +##阀 +##阁 +##阂 +##阅 +##阆 +##阇 +##阈 +##阉 +##阎 +##阐 +##阑 +##阔 +##阕 +##阖 +##阙 +##阚 +##阜 +##队 +##阡 +##阪 +##阮 +##阱 +##防 +##阳 +##阴 +##阵 +##阶 +##阻 +##阿 +##陀 +##陂 +##附 +##际 +##陆 +##陇 +##陈 +##陋 +##陌 +##降 +##限 +##陕 +##陛 +##陝 +##陞 +##陟 +##陡 +##院 +##陣 +##除 +##陨 +##险 +##陪 +##陰 +##陲 +##陳 +##陵 +##陶 +##陷 +##陸 +##険 +##陽 +##隅 +##隆 +##隈 +##隊 +##隋 +##隍 +##階 +##随 +##隐 +##隔 +##隕 +##隘 +##隙 +##際 +##障 +##隠 +##隣 +##隧 +##隨 +##險 +##隱 +##隴 +##隶 +##隸 +##隻 +##隼 +##隽 +##难 +##雀 +##雁 +##雄 +##雅 +##集 +##雇 +##雉 +##雋 +##雌 +##雍 +##雎 +##雏 +##雑 +##雒 +##雕 +##雖 +##雙 +##雛 +##雜 +##雞 +##離 +##難 +##雨 +##雪 +##雯 +##雰 +##雲 +##雳 +##零 +##雷 +##雹 +##電 +##雾 +##需 +##霁 +##霄 +##霆 +##震 +##霈 +##霉 +##霊 +##霍 +##霎 +##霏 +##霑 +##霓 +##霖 +##霜 +##霞 +##霧 +##霭 +##霰 +##露 +##霸 +##霹 +##霽 +##霾 +##靂 +##靄 +##靈 +##青 +##靓 +##靖 +##静 +##靚 +##靛 +##靜 +##非 +##靠 +##靡 +##面 +##靥 +##靦 +##革 +##靳 +##靴 +##靶 +##靼 +##鞅 +##鞋 +##鞍 +##鞏 +##鞑 +##鞘 +##鞠 +##鞣 +##鞦 +##鞭 +##韆 +##韋 +##韌 +##韓 +##韜 +##韦 +##韧 +##韩 +##韬 +##韭 +##音 +##韵 +##韶 +##韻 +##響 +##頁 +##頂 +##頃 +##項 +##順 +##須 +##頌 +##預 +##頑 +##頒 +##頓 +##頗 +##領 +##頜 +##頡 +##頤 +##頫 +##頭 +##頰 +##頷 +##頸 +##頹 +##頻 +##頼 +##顆 +##題 +##額 +##顎 +##顏 +##顔 +##願 +##顛 +##類 +##顧 +##顫 +##顯 +##顱 +##顴 +##页 +##顶 +##顷 
+##项 +##顺 +##须 +##顼 +##顽 +##顾 +##顿 +##颁 +##颂 +##预 +##颅 +##领 +##颇 +##颈 +##颉 +##颊 +##颌 +##颍 +##颐 +##频 +##颓 +##颔 +##颖 +##颗 +##题 +##颚 +##颛 +##颜 +##额 +##颞 +##颠 +##颡 +##颢 +##颤 +##颦 +##颧 +##風 +##颯 +##颱 +##颳 +##颶 +##颼 +##飄 +##飆 +##风 +##飒 +##飓 +##飕 +##飘 +##飙 +##飚 +##飛 +##飞 +##食 +##飢 +##飨 +##飩 +##飪 +##飯 +##飲 +##飼 +##飽 +##飾 +##餃 +##餅 +##餉 +##養 +##餌 +##餐 +##餒 +##餓 +##餘 +##餚 +##餛 +##餞 +##餡 +##館 +##餮 +##餵 +##餾 +##饅 +##饈 +##饋 +##饌 +##饍 +##饑 +##饒 +##饕 +##饗 +##饞 +##饥 +##饨 +##饪 +##饬 +##饭 +##饮 +##饯 +##饰 +##饱 +##饲 +##饴 +##饵 +##饶 +##饷 +##饺 +##饼 +##饽 +##饿 +##馀 +##馁 +##馄 +##馅 +##馆 +##馈 +##馋 +##馍 +##馏 +##馒 +##馔 +##首 +##馗 +##香 +##馥 +##馨 +##馬 +##馭 +##馮 +##馳 +##馴 +##駁 +##駄 +##駅 +##駆 +##駐 +##駒 +##駕 +##駛 +##駝 +##駭 +##駱 +##駿 +##騁 +##騎 +##騏 +##験 +##騙 +##騨 +##騰 +##騷 +##驀 +##驅 +##驊 +##驍 +##驒 +##驕 +##驗 +##驚 +##驛 +##驟 +##驢 +##驥 +##马 +##驭 +##驮 +##驯 +##驰 +##驱 +##驳 +##驴 +##驶 +##驷 +##驸 +##驹 +##驻 +##驼 +##驾 +##驿 +##骁 +##骂 +##骄 +##骅 +##骆 +##骇 +##骈 +##骊 +##骋 +##验 +##骏 +##骐 +##骑 +##骗 +##骚 +##骛 +##骜 +##骞 +##骠 +##骡 +##骤 +##骥 +##骧 +##骨 +##骯 +##骰 +##骶 +##骷 +##骸 +##骼 +##髂 +##髅 +##髋 +##髏 +##髒 +##髓 +##體 +##髖 +##高 +##髦 +##髪 +##髮 +##髯 +##髻 +##鬃 +##鬆 +##鬍 +##鬓 +##鬚 +##鬟 +##鬢 +##鬣 +##鬥 +##鬧 +##鬱 +##鬼 +##魁 +##魂 +##魄 +##魅 +##魇 +##魍 +##魏 +##魔 +##魘 +##魚 +##魯 +##魷 +##鮑 +##鮨 +##鮪 +##鮭 +##鮮 +##鯉 +##鯊 +##鯖 +##鯛 +##鯨 +##鯰 +##鯽 +##鰍 +##鰓 +##鰭 +##鰲 +##鰻 +##鰾 +##鱈 +##鱉 +##鱔 +##鱗 +##鱷 +##鱸 +##鱼 +##鱿 +##鲁 +##鲈 +##鲍 +##鲑 +##鲛 +##鲜 +##鲟 +##鲢 +##鲤 +##鲨 +##鲫 +##鲱 +##鲲 +##鲶 +##鲷 +##鲸 +##鳃 +##鳄 +##鳅 +##鳌 +##鳍 +##鳕 +##鳖 +##鳗 +##鳝 +##鳞 +##鳥 +##鳩 +##鳳 +##鳴 +##鳶 +##鴉 +##鴕 +##鴛 +##鴦 +##鴨 +##鴻 +##鴿 +##鵑 +##鵜 +##鵝 +##鵡 +##鵬 +##鵰 +##鵲 +##鶘 +##鶩 +##鶯 +##鶴 +##鷗 +##鷲 +##鷹 +##鷺 +##鸚 +##鸞 +##鸟 +##鸠 +##鸡 +##鸢 +##鸣 +##鸥 +##鸦 +##鸨 +##鸪 +##鸭 +##鸯 +##鸳 +##鸵 +##鸽 +##鸾 +##鸿 +##鹂 +##鹃 +##鹄 +##鹅 +##鹈 +##鹉 +##鹊 +##鹌 +##鹏 +##鹑 +##鹕 +##鹘 +##鹜 +##鹞 +##鹤 +##鹦 +##鹧 +##鹫 +##鹭 +##鹰 +##鹳 +##鹵 +##鹹 +##鹼 +##鹽 +##鹿 +##麂 +##麋 +##麒 +##麓 +##麗 +##麝 +##麟 +##麥 +##麦 +##麩 +##麴 +##麵 +##麸 +##麺 +##麻 +##麼 +##麽 +##麾 +##黃 +##黄 +##黍 +##黎 +##黏 +##黑 +##黒 +##黔 +##默 +##黛 +##黜 +##黝 +##點 +##黠 +##黨 +##黯 +##黴 +##鼋 +##鼎 +##鼐 +##鼓 +##鼠 +##鼬 +##鼹 +##鼻 +##鼾 +##齁 +##齊 +##齋 +##齐 +##齒 +##齡 +##齢 +##齣 +##齦 +##齿 +##龄 +##龅 +##龈 +##龊 +##龋 +##龌 +##龍 +##龐 +##龔 +##龕 +##龙 +##龚 +##龛 +##龜 +##龟 +##︰ +##︱ +##︶ +##︿ +##﹁ +##﹂ +##﹍ +##﹏ +##﹐ +##﹑ +##﹒ +##﹔ +##﹕ +##﹖ +##﹗ +##﹙ +##﹚ +##﹝ +##﹞ +##﹡ +##﹣ +##! +##" +### +##$ +##% +##& +##' +##( +##) +##* +##, +##- +##. +##/ +##: +##; +##< +##? +##@ +##[ +##\ +##] +##^ +##_ +##` +##f +##h +##j +##u +##w +##z +##{ +##} +##。 +##「 +##」 +##、 +##・ +##ッ +##ー +##イ +##ク +##シ +##ス +##ト +##ノ +##フ +##ラ +##ル +##ン +##゙ +##゚ +## ̄ +##¥ +##👍 +##🔥 +##😂 +##😎 diff --git a/baselines/models/roberta/run_classifier.py b/baselines/models/roberta/run_classifier.py new file mode 100644 index 0000000..d5a5ed7 --- /dev/null +++ b/baselines/models/roberta/run_classifier.py @@ -0,0 +1,1486 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-09 22:45:40 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import csv
+import os
+import modeling
+import optimization_finetuning as optimization
+import tokenization
+import tensorflow as tf
+# from loss import bi_tempered_logistic_loss
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+# Required parameters
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input data dir. Should contain the .tsv files (or other data files) "
+    "for the task.")
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model. "
+    "This specifies the model architecture.")
+
+flags.DEFINE_string("task_name", None, "The name of the task to train.")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written.")
+
+# Other parameters
+
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model).")
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text. Should be True for uncased "
+    "models and False for cased models.")
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization. "
+    "Sequences longer than this will be truncated, and sequences shorter "
+    "than this will be padded.")
+
+flags.DEFINE_bool("do_train", False, "Whether to run training.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool(
+    "do_predict", False,
+    "Whether to run the model in inference mode on the test set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("num_train_epochs", 3.0,
+                   "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. "
+    "E.g., 0.1 = 10% of training.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+tf.flags.DEFINE_string(
+    "tpu_name", None,
+    "The Cloud TPU to use for training. This should be either the name "
+    "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 "
+    "url.")
+
+tf.flags.DEFINE_string(
+    "tpu_zone", None,
+    "[Optional] GCE zone where the Cloud TPU is located. If not "
+    "specified, we will attempt to automatically detect the zone from "
+    "metadata.")
+
+tf.flags.DEFINE_string(
+    "gcp_project", None,
+    "[Optional] Project name for the Cloud TPU-enabled project. If not "
+    "specified, we will attempt to automatically detect the GCE project from "
+    "metadata.")
+
+tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
+
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
Total number of TPU cores to use.")
+
+
+class InputExample(object):
+  """A single training/test example for simple sequence classification."""
+
+  def __init__(self, guid, text_a, text_b=None, label=None):
+    """Constructs an InputExample.
+    Args:
+      guid: Unique id for the example.
+      text_a: string. The untokenized text of the first sequence. For single
+        sequence tasks, only this sequence must be specified.
+      text_b: (Optional) string. The untokenized text of the second sequence.
+        Only must be specified for sequence pair tasks.
+      label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+    self.guid = guid
+    self.text_a = text_a
+    self.text_b = text_b
+    self.label = label
+
+
+class PaddingInputExample(object):
+  """Fake example so the num input examples is a multiple of the batch size.
+  When running eval/predict on the TPU, we need to pad the number of examples
+  to be a multiple of the batch size, because the TPU requires a fixed batch
+  size. The alternative is to drop the last batch, which is bad because it means
+  the entire output data won't be generated.
+  We use this class instead of `None` because treating `None` as padding
+  batches could cause silent errors.
+  """
+
+
+class InputFeatures(object):
+  """A single set of features of data."""
+
+  def __init__(self,
+               input_ids,
+               input_mask,
+               segment_ids,
+               label_id,
+               is_real_example=True):
+    self.input_ids = input_ids
+    self.input_mask = input_mask
+    self.segment_ids = segment_ids
+    self.label_id = label_id
+    self.is_real_example = is_real_example
+
+
+class DataProcessor(object):
+  """Base class for data converters for sequence classification data sets."""
+
+  def get_train_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the train set."""
+    raise NotImplementedError()
+
+  def get_dev_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the dev set."""
+    raise NotImplementedError()
+
+  def get_test_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for prediction."""
+    raise NotImplementedError()
+
+  def get_labels(self):
+    """Gets the list of labels for this data set."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _read_tsv(cls, input_file, quotechar=None):
+    """Reads a tab separated value file."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+      lines = []
+      for line in reader:
+        lines.append(line)
+      return lines
+
+  @classmethod
+  def _read_txt(cls, input_file):
+    """Reads a text file whose fields are separated by "_!_"."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = f.readlines()
+      lines = []
+      for line in reader:
+        lines.append(line.strip().split("_!_"))
+      return lines
+
+
+class InewsProcessor(DataProcessor):
+  """Processor for the iNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = ["0", "1", "2"]
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev
sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) - 1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num + 1) * extra_len, len(tokens_b)) + tokens_b_sub = tokens_b[num * extra_len: max_len] + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + else: + feature = convert_single_example_for_inews( + ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + return feature_list + + +def file_based_convert_examples_to_features_for_inews( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + num_example = 0 + for (ex_index, example) in enumerate(examples): + if ex_index % 1000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature_list = convert_example_list_for_inews(ex_index, example, label_list, + max_seq_length, tokenizer) + num_example += len(feature_list) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + for feature in feature_list: + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + tf.logging.info("feature num: %s", num_example) + writer.close() + + +class 
TnewsProcessor(DataProcessor):
+  """Processor for the TNEWS (Toutiao news) data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])
+      text_b = None
+      # Labels are available for the test split as well, so they are read the
+      # same way for train, dev and test.
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class THUCNewsProcessor(DataProcessor):
+  """Processor for the THUCNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(14):
+      labels.append(str(i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      # Skip the header row and malformed lines; line[3] is read below, so a
+      # valid line needs at least four fields.
+      if i == 0 or len(line) < 4:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])
+      text_b = None
+      label = tokenization.convert_to_unicode(line[0])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class iFLYTEKDataProcessor(DataProcessor):
+  """Processor for the iFLYTEK data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(119):
+      labels.append(str(i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[1])
+      text_b = None
+      label = tokenization.convert_to_unicode(line[0])
+ examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class LCQMCProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class BQProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class SentencePairClassificationProcessor(DataProcessor): + """Processor for the internal data set. 
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train_0827.tsv")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_0827.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_0827.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[0]) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "train.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[2]) + if label == tokenization.convert_to_unicode("contradictory"): + label = tokenization.convert_to_unicode("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "test.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if 
isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
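+  # Worked example (hypothetical sizes): with max_seq_length = 8 and the pair
+  # tokens_a = ["好", "看"], tokens_b = ["不", "错"], tokens becomes
+  # [CLS] 好 看 [SEP] 不 错 [SEP] (7 entries), so one zero is appended below
+  # and input_mask ends up as [1, 1, 1, 1, 1, 1, 1, 0].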
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. 
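+    # When is_training, the shuffle buffer below holds only 100 serialized
+    # records at a time, so shuffling is approximate; a larger buffer_size
+    # mixes better at the cost of host memory.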
+ d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. + output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, + axis=-1) # todo 08-29 try temp-loss + ###############bi_tempered_logistic_loss############################################################################ + # print("##cross entropy loss is used...."); tf.logging.info("##cross entropy loss is used....") + # t1=0.9 #t1=0.90 + # t2=1.05 #t2=1.05 + # per_example_loss=bi_tempered_logistic_loss(log_probs,one_hot_labels,t1,t2,label_smoothing=0.1,num_iters=5) # TODO label_smoothing=0.0 + # tf.logging.info("per_example_loss:"+str(per_example_loss.shape)) + ##############bi_tempered_logistic_loss############################################################################# + + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = 
features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
+ d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "sentence_pair": SentencePairClassificationProcessor, + "lcqmc": LCQMCProcessor, + "tnews": TnewsProcessor, + "inews": InewsProcessor, + "xnli": XnliProcessor, + "thucnews": THUCNewsProcessor, + "bq": BQProcessor, + "iflydata": iFLYTEKDataProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + # Cloud TPU: Invalid TPU configuration, ensure ClusterResolver is passed to tpu. 
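+  # tpu_cluster_resolver stays None unless both --use_tpu and --tpu_name are
+  # set; the print below makes this easy to verify in the logs.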
+ print("###tpu_cluster_resolver:", tpu_cluster_resolver) + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) # TODO + print("###length of total train_examples:", len(train_examples)) + num_train_steps = int(len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + train_file_exists = os.path.exists(train_file) + print("###train_file_exists:", train_file_exists, " ;train_file:", train_file) + if not train_file_exists: # if tf_record file not exist, convert from raw text file. # TODO + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + # dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). 
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints in the output directory; you can then pick the
+    # checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_roberta.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    # test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    # Mirror the dev-set handling above: inews needs its chunked feature
+    # conversion here as well.
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation (test set) *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints on the test set as well; you can then report
+    # the checkpoint that had the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_roberta.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn,
+                                    steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
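+      # The padded examples are skipped again when test_results.tsv is
+      # written: the output loop below stops after
+      # num_actual_predict_examples rows.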
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + else: + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, prediction) in enumerate(result): + probabilities = prediction["probabilities"] + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta/run_classifier_bq.sh b/baselines/models/roberta/run_classifier_bq.sh new file mode 100644 index 0000000..d1e8e35 --- /dev/null +++ b/baselines/models/roberta/run_classifier_bq.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:54:57 + +TASK_NAME="bq" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/bq.zip + unzip bq.zip + rm bq.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! 
-f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_iflydata.sh b/baselines/models/roberta/run_classifier_iflydata.sh new file mode 100644 index 0000000..951f696 --- /dev/null +++ b/baselines/models/roberta/run_classifier_iflydata.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:53:42 + +TASK_NAME="iflydata" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_inews.sh b/baselines/models/roberta/run_classifier_inews.sh new file mode 100644 index 0000000..6b5d044 --- /dev/null +++ b/baselines/models/roberta/run_classifier_inews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:53:47 + +TASK_NAME="inews" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
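+# inews examples can be long; run_classifier.py converts them with the
+# chunked feature builder (file_based_convert_examples_to_features_for_inews),
+# so one example may yield several tf_record features.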
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_lcqmc.sh b/baselines/models/roberta/run_classifier_lcqmc.sh new file mode 100644 index 0000000..c91ecba --- /dev/null +++ b/baselines/models/roberta/run_classifier_lcqmc.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:53:51 + +TASK_NAME="lcqmc" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
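+# roeberta_zh_L-24_H-1024_A-16 is a 24-layer large model; if a single GPU
+# runs out of memory at train_batch_size=32, reduce --train_batch_size
+# and/or --max_seq_length.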
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_thucnews.sh b/baselines/models/roberta/run_classifier_thucnews.sh new file mode 100644 index 0000000..62ef4f1 --- /dev/null +++ b/baselines/models/roberta/run_classifier_thucnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:53:56 + +TASK_NAME="thucnews" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_tnews.sh b/baselines/models/roberta/run_classifier_tnews.sh new file mode 100644 index 0000000..528ec8d --- /dev/null +++ b/baselines/models/roberta/run_classifier_tnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:54:00 + +TASK_NAME="tnews" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
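+# Checkpoints and eval results (eval_results.txt in BERT-style runners) land in
+# $CURRENT_DIR/${TASK_NAME}_output/, so a rerun resumes from the latest
+# checkpoint in that directory rather than starting fresh.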
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_classifier_xnli.sh b/baselines/models/roberta/run_classifier_xnli.sh new file mode 100644 index 0000000..7f55ea7 --- /dev/null +++ b/baselines/models/roberta/run_classifier_xnli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:54:06 + +TASK_NAME="xnli" +MODEL_NAME="roeberta_zh_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_LARGE_DIR ]; then + mkdir -p $ROBERTA_LARGE_DIR + echo "makedir $ROBERTA_LARGE_DIR" +fi +cd $ROBERTA_LARGE_DIR +if [ ! -f "bert_config_large.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "checkpoint" ] || [ ! -f "roberta_zh_large_model.ckpt.index" ] || [ ! -f "roberta_zh_large_model.ckpt.meta" ] || [ ! -f "roberta_zh_large_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/roeberta_zh_L-24_H-1024_A-16.zip + unzip roeberta_zh_L-24_H-1024_A-16.zip + rm roeberta_zh_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$ROBERTA_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_ner.py b/baselines/models/roberta/run_ner.py new file mode 100644 index 0000000..b6df9e7 --- /dev/null +++ b/baselines/models/roberta/run_ner.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import modeling
+import optimization
+import tokenization
+import tensorflow as tf
+import tf_metrics
+import pickle
+import codecs
+import sys
+
+if sys.version_info[0] == 2:
+  # Python 2 only: make UTF-8 the default codec so raw CoNLL files decode cleanly.
+  reload(sys)
+  sys.setdefaultencoding('utf8')
+
+flags = tf.flags
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    "data_dir", None,
+    "The input data dir.",
+)
+
+flags.DEFINE_string(
+    "bert_config_file", None,
+    "The config json file corresponding to the pre-trained BERT model."
+)
+
+flags.DEFINE_string(
+    "task_name", None, "The name of the task to train."
+)
+
+flags.DEFINE_string(
+    "token_name", "full", "Tokenization granularity (currently unused by the processors below)."
+)
+
+flags.DEFINE_string(
+    "output_dir", None,
+    "The output directory where the model checkpoints will be written."
+)
+
+## Other parameters
+flags.DEFINE_string(
+    "init_checkpoint", None,
+    "Initial checkpoint (usually from a pre-trained BERT model)."
+)
+
+flags.DEFINE_bool(
+    "do_lower_case", True,
+    "Whether to lower case the input text."
+)
+
+flags.DEFINE_integer(
+    "max_seq_length", 128,
+    "The maximum total input sequence length after WordPiece tokenization."
+)
+
+flags.DEFINE_bool(
+    "do_train", False,
+    "Whether to run training."
+)
+flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.")
+
+flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.")
+
+flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.")
+
+flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.")
+
+flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.")
+
+flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.")
+
+flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.")
+
+flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.")
+
+flags.DEFINE_float(
+    "warmup_proportion", 0.1,
+    "Proportion of training to perform linear learning rate warmup for. "
+    "E.g., 0.1 = 10% of training.")
+
+flags.DEFINE_integer("save_checkpoints_steps", 1000,
+                     "How often to save the model checkpoint.")
+
+flags.DEFINE_integer("iterations_per_loop", 1000,
+                     "How many steps to make in each estimator call.")
+
+flags.DEFINE_string("vocab_file", None,
+                    "The vocabulary file that the BERT model was trained on.")
+tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.")
+tf.flags.DEFINE_string(
+    "tpu_name", None,
+    "The Cloud TPU to use for training (name or grpc:// URL); referenced in "
+    "main() when `use_tpu` is set.")
+tf.flags.DEFINE_string(
+    "tpu_zone", None,
+    "[Optional] GCE zone where the Cloud TPU is located.")
+tf.flags.DEFINE_string(
+    "gcp_project", None,
+    "[Optional] Project name for the Cloud TPU-enabled project.")
+flags.DEFINE_integer(
+    "num_tpu_cores", 8,
+    "Only used if `use_tpu` is True. Total number of TPU cores to use.")
+
+
+class InputExample(object):
+  """A single training/test example for simple sequence classification."""
+
+  def __init__(self, guid, text, label=None):
+    """Constructs an InputExample.
+
+    Args:
+      guid: Unique id for the example.
+      text: string. The untokenized text of the sequence.
For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 
'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + tokens = contends.split() + if len(tokens) == 2: + words.append(tokens[0]) + label = tokens[-1] + if label[0] == 'B': + label = "I" + label[1:] + labels.append(label) + else: + if len(contends) == 0 and len(words) > 0: + label = [] + word = [] + for l, w in zip(labels, words): + if len(l) > 0 and len(w) > 0: + label.append(l) + # self.labels.add(l) + word.append(w) + lines.append([' '.join(label), ' '.join(word)]) + words = [] + labels = [] + continue + if contends.startswith("-DOCSTART-"): + continue + + return lines + +class MsraNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "train1.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "test") + + def get_labels(self): + return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + chars = [] + labels = [] + len_count = [] + for line in f: + contends = line.strip() + tokens = contends.split() + for token in tokens: + word, label = token.split('/') + + if label == "nr": + chars = chars + list(word) + labels = labels + ['B-PERSON'] + ['I-PERSON']*(len(word)-1) + elif label == "ns": + chars = chars + list(word) + labels = labels + ['B-LOCATION'] + ['I-LOCATION']*(len(word)-1) + elif label == "nt": + chars = chars + list(word) + labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION']*(len(word)-1) + else: + assert label == "o" + chars = chars + list(word) + labels = labels + ["O"] * len(word) + lines.append([' '.join(labels), ' '.join(chars)]) + len_count.append(len(chars)) + chars = [] + labels = [] + return lines + + +def write_tokens(tokens, mode): + if mode == "test": + path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt") + wf = open(path, 'a') + for token in tokens: + if token != "**NULL**": + wf.write(token + '\n') + wf.close() + + +def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode): + label_map = {} + for (i, label) in enumerate(label_list, 1): + label_map[label] = i + + if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')): + with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w: + pickle.dump(label_map, w) + textlist = 
example.text.split(' ') + labellist = example.label.split(' ') + tokens = [] + labels = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + tokens.extend(token) + label_1 = labellist[i] + for m in range(len(token)): + if m == 0: + labels.append(label_1) + else: + labels.append("X") + + # tokens = tokenizer.tokenize(example.text) + if len(tokens) >= max_seq_length - 1: + tokens = tokens[0:(max_seq_length - 2)] + labels = labels[0:(max_seq_length - 2)] + ntokens = [] + segment_ids = [] + label_ids = [] + ntokens.append("[CLS]") + segment_ids.append(0) + # append("O") or append("[CLS]") not sure! + label_ids.append(label_map["[CLS]"]) + label_mask.append(0) # not to predict and train + for i, token in enumerate(tokens): + ntokens.append(token) + segment_ids.append(0) + label_ids.append(label_map[labels[i]]) + if labels[i] == 'X': + label_mask.append(0) + else: + label_mask.append(1) + ntokens.append("[SEP]") + segment_ids.append(0) + label_mask.append(0) + # append("O") or append("[SEP]") not sure! + label_ids.append(label_map["[SEP]"]) + input_ids = tokenizer.convert_tokens_to_ids(ntokens) + input_mask = [1] * len(input_ids) + # label_mask = [1] * len(input_ids) + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + # we don't concerned about it! + label_ids.append(0) + ntokens.append("**NULL**") + label_mask.append(0) + # print(len(input_ids)) + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(label_ids) == max_seq_length + assert len(label_mask) == max_seq_length + + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids])) + tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) + # tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask])) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_ids=label_ids, + label_mask = label_mask + ) + write_tokens(ntokens, mode) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file, output_dir, mode=None +): + writer = tf.python_io.TFRecordWriter(output_file) + for (ex_index, example) in enumerate(examples): + if ex_index % 5000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + feature = convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature(feature.label_ids) + features["label_mask"] = create_int_feature(feature.label_mask) + tf_example = 
tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+  # Close the writer so buffered records are flushed to disk.
+  writer.close()
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
+  name_to_features = {
+      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
+      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "label_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "label_mask": tf.FixedLenFeature([seq_length], tf.int64),
+  }
+
+  def _decode_record(record, name_to_features):
+    example = tf.parse_single_example(record, name_to_features)
+    for name in list(example.keys()):
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.to_int32(t)
+      example[name] = t
+    return example
+
+  def input_fn(params):
+    batch_size = params["batch_size"]
+    d = tf.data.TFRecordDataset(input_file)
+    if is_training:
+      d = d.repeat()
+      d = d.shuffle(buffer_size=100)
+    d = d.apply(tf.contrib.data.map_and_batch(
+        lambda record: _decode_record(record, name_to_features),
+        batch_size=batch_size,
+        drop_remainder=drop_remainder
+    ))
+    return d
+
+  return input_fn
+
+
+def create_model(bert_config, is_training, input_ids, input_mask, label_mask,
+                 segment_ids, labels, num_labels, use_one_hot_embeddings):
+  model = modeling.BertModel(
+      config=bert_config,
+      is_training=is_training,
+      input_ids=input_ids,
+      input_mask=input_mask,
+      token_type_ids=segment_ids,
+      use_one_hot_embeddings=use_one_hot_embeddings
+  )
+
+  output_layer = model.get_sequence_output()
+
+  hidden_size = output_layer.shape[-1].value
+
+  output_weight = tf.get_variable(
+      "output_weights", [num_labels, hidden_size],
+      initializer=tf.truncated_normal_initializer(stddev=0.02)
+  )
+  output_bias = tf.get_variable(
+      "output_bias", [num_labels], initializer=tf.zeros_initializer()
+  )
+  with tf.variable_scope("loss"):
+    if is_training:
+      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)
+    output_layer = tf.reshape(output_layer, [-1, hidden_size])
+    logits = tf.matmul(output_layer, output_weight, transpose_b=True)
+    logits = tf.nn.bias_add(logits, output_bias)
+    logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels])
+    # mask = tf.cast(input_mask,tf.float32)
+    # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask)
+    # return (loss, logits, predict)
+    ##########################################################################
+    log_probs = tf.nn.log_softmax(logits, axis=-1)
+    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
+
+    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
+    mask = tf.cast(label_mask, tf.float32)
+    mask_example_loss = per_example_loss * mask
+    loss = tf.reduce_sum(mask_example_loss)
+    probabilities = tf.nn.softmax(logits, axis=-1)
+    predict = tf.argmax(probabilities, axis=-1)
+    return (loss, mask_example_loss, logits, predict)
+    ##########################################################################
+
+
+def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate,
+                     num_train_steps, num_warmup_steps, use_tpu,
+                     use_one_hot_embeddings):
+  def model_fn(features, labels, mode, params):
+    tf.logging.info("*** Features ***")
+    for name in sorted(features.keys()):
+      tf.logging.info("  name = %s, shape = %s" % (name, features[name].shape))
+    input_ids = features["input_ids"]
+    input_mask = features["input_mask"]
+    segment_ids = features["segment_ids"]
+    label_ids = features["label_ids"]
+    label_mask = features["label_mask"]
+    is_training = (mode ==
tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = tf.trainable_variables() + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, + init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + if use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.logging.info("**** Trainable Variables ****") + + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + hook_dict = {} + hook_dict['loss'] = total_loss + hook_dict['global_steps'] = tf.train.get_or_create_global_step() + logging_hook = tf.train.LoggingTensorHook( + hook_dict, every_n_iter=200) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn, + training_hooks=[logging_hook]) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + # def metric_fn(label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + # labels = [] + # for i, x in enumerate() + predict_labels = [] + # for i in range(1, num_labels - 4): + # predict_labels.append(i) + # precision = tf_metrics.precision(label_ids, predictions, num_labels, predict_labels, average="macro") + # recall = tf_metrics.recall(label_ids, predictions, num_labels, predict_labels, average="macro") + # f = tf_metrics.f1(label_ids, predictions, num_labels, predict_labels, average="macro") + + precision = tf_metrics.precision(label_ids, predictions, num_labels, average="macro") + recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro") + f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro") + + # + return { + "eval_precision": precision, + "eval_recall": recall, + "eval_f": f, + # "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + # eval_metrics = (metric_fn, [label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predicts, scaffold_fn=scaffold_fn + ) + return output_spec + + return model_fn + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + processors = { + "ner": NerProcessor, + "weiboner": WeiboNERProcessor, + "msraner": MsraNERProcessor + } + # if not FLAGS.do_train and not FLAGS.do_eval: + # raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + 
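+  # Label-indexing convention used throughout this runner: convert_single_example
+  # enumerates label_list starting from 1 and reserves id 0 for padding, which is
+  # why model_fn_builder below receives num_labels = len(label_list) + 1.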
if not os.path.exists(FLAGS.output_dir): + os.mkdir(FLAGS.output_dir) + task_name = FLAGS.task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + print(num_train_steps) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list) + 1, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + eval_steps = None + if FLAGS.use_tpu: + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as 
writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + + if FLAGS.do_predict: + + pred_tags = [] + true_tags = [] + + token_path = os.path.join(FLAGS.output_dir, "token_test.txt") + label_file = os.path.join(FLAGS.output_dir, "label2id.pkl") + label_masks = [] + with open(label_file, "rb") as rf: + label2id = pickle.load(rf) + id2label = {value: key for key, value in label2id.items()} + if os.path.exists(token_path): + os.remove(token_path) + predict_examples = processor.get_test_examples(FLAGS.data_dir) + ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt") + with open(ground_truth_file, 'w') as writer: + for ex_index, example in enumerate(predict_examples): + feature = convert_single_example(ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test") + line = [] + for i, id in enumerate(feature.label_ids): + if feature.label_mask[i] == 1: + line.append(id2label[id]) + true_tags.append(id2label[id]) + # output_line = " ".join(id2label[id] for id in feature.label_ids if id != 0) + "\n" + output_line = " ".join(line) + "\n" + writer.write(output_line) + label_masks.append(feature.label_mask) + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, FLAGS.output_dir, mode="test") + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + if FLAGS.use_tpu: + # Warning: According to tpu_estimator.py Prediction on TPU is an + # experimental feature and hence not supported here + raise ValueError("Prediction in TPU not supported") + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") + + with open(output_predict_file, 'w') as writer: + for i, prediction in enumerate(result): + line = [] + for j, x in enumerate(prediction): + if label_masks[i][j] == 0: + continue + else: + line.append(id2label[x]) + # writer.write(id2label[x] + "\n") + pred_tags.append(id2label[x]) + output_line = " ".join(line) + "\n" + # # output_line = " ".join(id2label[id] for id in prediction if id != 0) + "\n" + writer.write(output_line) + # evaluate(true_tags, pred_tags, verbose=True) + # evaluate(true_tags, pred_tags) + + tmp = codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8') + with codecs.open(ground_truth_file, 'r', 'utf8') as ft, codecs.open(output_predict_file, 'r', 'utf8') as fg: + for lt, lg in zip(ft, fg): + for tl, tg in zip(lt.strip().split(), lg.strip().split()): + print('\t'.join([" ", tl, tg]), file=tmp) + tmp.close() + cmd = "python %s -d '\t' < %s > %s" % \ + (os.path.join(os.getcwd(), "conlleval.py"), \ + os.path.join(FLAGS.output_dir, "tmp"), \ + os.path.join(FLAGS.data_dir, "test_results_roberta_large.txt")) + os.system(cmd) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + 
flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta/run_ner_msra.sh b/baselines/models/roberta/run_ner_msra.sh new file mode 100644 index 0000000..7bbba33 --- /dev/null +++ b/baselines/models/roberta/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_LARGE_DIR=$CURRENT_DIR/prev_trained_model/roberta_zh_L-24_H-1024_A-16 +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_LARGE_DIR/vocab.txt \ + --bert_config_file=$BERT_LARGE_DIR/bert_config_large.json \ + --init_checkpoint=$BERT_LARGE_DIR/roberta_zh_large_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=8 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta/run_pretraining.py b/baselines/models/roberta/run_pretraining.py new file mode 100644 index 0000000..a2e8c6b --- /dev/null +++ b/baselines/models/roberta/run_pretraining.py @@ -0,0 +1,498 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, # TODO TODO TODO 可以计算单不算成绩 + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + # batch_size=masked_lm_log_probs.shape[0] + # next_sentence_example_loss=tf.zeros((batch_size)) #tf.constant(0.0,dtype=tf.float32) + # next_sentence_log_probs=tf.zeros((batch_size,2)) + total_loss = masked_lm_loss # TODO remove next sentence loss 2019-08-08, + next_sentence_loss + + tvars = 
tf.trainable_variables() + + initialized_variable_names = {} + print("init_checkpoint:",init_checkpoint) + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs,[-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax(masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + # next_sentence_example_loss=0.0 TODO + # next_sentence_log_probs=0.0 # TODO + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
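+    # Head structure: the hidden states at the masked positions (already
+    # gathered above) go through a dense transform + layer norm, are scored
+    # against the tied input embedding matrix passed in as `output_weights`,
+    # and get a per-token output bias before the log-softmax.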
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot(label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
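+  # Only the integer features are cast below; float features such as
+  # masked_lm_weights pass through the dtype check unchanged.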
+  for name in list(example.keys()):
+    t = example[name]
+    if t.dtype == tf.int64:
+      t = tf.to_int32(t)
+    example[name] = t
+
+  return example
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if not FLAGS.do_train and not FLAGS.do_eval:  # must run at least one of training or evaluation
+    raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)  # load the model configuration from the JSON file
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  # The input may be several comma-separated files, each optionally a glob
+  # pattern such as "input_x*".
+  input_files = []
+  for input_pattern in FLAGS.input_file.split(","):
+    input_files.extend(tf.gfile.Glob(input_pattern))
+
+  tf.logging.info("*** Input Files ***")
+  for input_file in input_files:
+    tf.logging.info("  %s" % input_file)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        tpu=FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  print("###tpu_cluster_resolver:", tpu_cluster_resolver, ";FLAGS.use_tpu:", FLAGS.use_tpu, ";FLAGS.tpu_name:", FLAGS.tpu_name, ";FLAGS.tpu_zone:", FLAGS.tpu_zone)
+  # e.g. observed: ###tpu_cluster_resolver: <resolver> ;FLAGS.use_tpu: True ;FLAGS.tpu_name: grpc://10.240.1.83:8470
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      keep_checkpoint_max=20,  # previously 10
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=FLAGS.num_train_steps,
+      num_warmup_steps=FLAGS.num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
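+  # TPUEstimator injects the per-call batch size into input_fn via
+  # params["batch_size"] (from train_batch_size or eval_batch_size depending on
+  # mode), which is how the input_fn_builder closure above receives it.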
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta/tf_metrics.py b/baselines/models/roberta/tf_metrics.py new file mode 100644 index 0000000..7ccacd4 --- /dev/null +++ b/baselines/models/roberta/tf_metrics.py @@ -0,0 +1,215 @@ +""" +Multiclass +from: +https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py + +""" + +__author__ = "Guillaume Genthial" + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix + + +def precision(labels, predictions, num_classes, pos_indices=None, + weights=None, average='micro'): + """Multi-class precision metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + pr, _, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + op, _, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (pr, op) + + +def recall(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + """Multi-class recall metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, re, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + _, op, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (re, op) + + +def f1(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + return fbeta(labels, predictions, num_classes, pos_indices, weights, + average) + + +def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro', beta=1): + """Multi-class fbeta metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/roberta/tokenization.py b/baselines/models/roberta/tokenization.py new file mode 100644 index 0000000..f7020e8 --- /dev/null +++ b/baselines/models/roberta/tokenization.py @@ -0,0 +1,401 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python 2 and Python 3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models.
This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. 
This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens.
+    """
+
+    text = convert_to_unicode(text)
+
+    output_tokens = []
+    for token in whitespace_tokenize(text):
+      chars = list(token)
+      if len(chars) > self.max_input_chars_per_word:
+        output_tokens.append(self.unk_token)
+        continue
+
+      is_bad = False
+      start = 0
+      sub_tokens = []
+      while start < len(chars):
+        end = len(chars)
+        cur_substr = None
+        while start < end:
+          substr = "".join(chars[start:end])
+          if start > 0:
+            substr = "##" + substr
+          if substr in self.vocab:
+            cur_substr = substr
+            break
+          end -= 1
+        if cur_substr is None:
+          is_bad = True
+          break
+        sub_tokens.append(cur_substr)
+        start = end
+
+      if is_bad:
+        output_tokens.append(self.unk_token)
+      else:
+        output_tokens.extend(sub_tokens)
+    return output_tokens
+
+
+def _is_whitespace(char):
+  """Checks whether `char` is a whitespace character."""
+  # \t, \n, and \r are technically control characters but we treat them
+  # as whitespace since they are generally considered as such.
+  if char == " " or char == "\t" or char == "\n" or char == "\r":
+    return True
+  cat = unicodedata.category(char)
+  if cat == "Zs":
+    return True
+  return False
+
+
+def _is_control(char):
+  """Checks whether `char` is a control character."""
+  # These are technically control characters but we count them as whitespace
+  # characters.
+  if char == "\t" or char == "\n" or char == "\r":
+    return False
+  cat = unicodedata.category(char)
+  if cat in ("Cc", "Cf"):
+    return True
+  return False
+
+
+def _is_punctuation(char):
+  """Checks whether `char` is a punctuation character."""
+  cp = ord(char)
+  # We treat all non-letter/number ASCII as punctuation.
+  # Characters such as "^", "$", and "`" are not in the Unicode
+  # Punctuation class but we treat them as punctuation anyways, for
+  # consistency.
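+  # For example, "$" (Unicode category Sc) and "`" (category Sk) fall in the
+  # ASCII ranges below even though their categories are not P*.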
+  if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+      (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+    return True
+  cat = unicodedata.category(char)
+  if cat.startswith("P"):
+    return True
+  return False
diff --git a/baselines/models/roberta/tpu/run_classifier_inews.sh b/baselines/models/roberta/tpu/run_classifier_inews.sh
new file mode 100755
index 0000000..2edab12
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_inews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="inews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta/tpu/run_classifier_jdcomment.sh b/baselines/models/roberta/tpu/run_classifier_jdcomment.sh
new file mode 100755
index 0000000..4b88a95
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_jdcomment.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="jdcomment"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.230.1.2:8470
diff --git a/baselines/models/roberta/tpu/run_classifier_lcqmc.sh b/baselines/models/roberta/tpu/run_classifier_lcqmc.sh
new file mode 100755
index 0000000..6e90e0b
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_lcqmc.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="lcqmc"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-large/roeberta_zh_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-large/roeberta_zh_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config_large.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/roberta_zh_large_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.16.0.2:8470
diff --git a/baselines/models/roberta/tpu/run_classifier_thucnews.sh b/baselines/models/roberta/tpu/run_classifier_thucnews.sh
new file mode 100755
index 0000000..9f690c0
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_thucnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="thucnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta/tpu/run_classifier_tnews.sh b/baselines/models/roberta/tpu/run_classifier_tnews.sh
new file mode 100755
index 0000000..2038486
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_tnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="tnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-large/roeberta_zh_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-large/roeberta_zh_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config_large.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/roberta_zh_large_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470
diff --git a/baselines/models/roberta/tpu/run_classifier_xnli.sh b/baselines/models/roberta/tpu/run_classifier_xnli.sh
new file mode 100755
index 0000000..9dd4c2a
--- /dev/null
+++ b/baselines/models/roberta/tpu/run_classifier_xnli.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="xnli"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta_wwm_ext/CONTRIBUTING.md b/baselines/models/roberta_wwm_ext/CONTRIBUTING.md
new file mode 100644
index 0000000..124b4b3
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/baselines/models/roberta_wwm_ext/LICENSE b/baselines/models/roberta_wwm_ext/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/baselines/models/roberta_wwm_ext/__init__.py b/baselines/models/roberta_wwm_ext/__init__.py new file mode 100644 index 0000000..effb57b --- /dev/null +++ b/baselines/models/roberta_wwm_ext/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/baselines/models/roberta_wwm_ext/conlleval.py b/baselines/models/roberta_wwm_ext/conlleval.py new file mode 100644 index 0000000..8a8a75d --- /dev/null +++ b/baselines/models/roberta_wwm_ext/conlleval.py @@ -0,0 +1,300 @@ +# Python version of the evaluation script from CoNLL'00- +# Originates from: https://github.com/spyysalo/conlleval.py + + +# Intentional differences: +# - accept any space as delimiter by default +# - optional file argument (default STDIN) +# - option to set boundary (-b argument) +# - LaTeX output (-l argument) not supported +# - raw tags (-r argument) not supported + +# add function :evaluate(predicted_label, ori_label): which will not read from file + +import sys +import re +import codecs +from collections import defaultdict, namedtuple + +ANY_SPACE = '' + + +class FormatError(Exception): + pass + +Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore') + + +class EvalCounts(object): + def __init__(self): + self.correct_chunk = 0 # number of correctly identified chunks + self.correct_tags = 0 # number of correct chunk tags + self.found_correct = 0 # number of chunks in corpus + self.found_guessed = 0 # number of identified chunks + self.token_counter = 0 # token counter (ignores sentence breaks) + + # counts by type + self.t_correct_chunk = defaultdict(int) + self.t_found_correct = defaultdict(int) + self.t_found_guessed = defaultdict(int) + + +def parse_args(argv): + import argparse + parser = argparse.ArgumentParser( + description='evaluate tagging results using CoNLL criteria', + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + arg = parser.add_argument + arg('-b', '--boundary', metavar='STR', default='-X-', + help='sentence boundary') + arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE, + help='character delimiting items in input') + arg('-o', '--otag', metavar='CHAR', default='O', + help='alternative outside tag') + arg('file', nargs='?', default=None) + return parser.parse_args(argv) + + +def parse_tag(t): + m = re.match(r'^([^-]*)-(.*)$', t) + return m.groups() if m else (t, '') + + +def evaluate(iterable, options=None): + if options is None: + options = parse_args([]) # use defaults + + counts = EvalCounts() + num_features = None # number of features per line + in_correct = False # currently processed chunks is correct until now + last_correct = 'O' # previous chunk tag in corpus + last_correct_type = '' # type of previously identified chunk tag + last_guessed = 'O' # previously identified chunk tag + last_guessed_type = '' # type of previous chunk tag in corpus + + for line in iterable: + line = line.rstrip('\r\n') + + if options.delimiter == ANY_SPACE: + features = line.split() + else: + features = line.split(options.delimiter) + + if num_features is None: + num_features = len(features) + elif num_features != len(features) and len(features) != 0: + raise FormatError('unexpected number of features: %d (%d)' % + (len(features), num_features)) + + if len(features) == 0 or 
features[0] == options.boundary: + features = [options.boundary, 'O', 'O'] + if len(features) < 3: + raise FormatError('unexpected number of features in line %s' % line) + + guessed, guessed_type = parse_tag(features.pop()) + correct, correct_type = parse_tag(features.pop()) + first_item = features.pop(0) + + if first_item == options.boundary: + guessed = 'O' + + end_correct = end_of_chunk(last_correct, correct, + last_correct_type, correct_type) + end_guessed = end_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + start_correct = start_of_chunk(last_correct, correct, + last_correct_type, correct_type) + start_guessed = start_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + + if in_correct: + if (end_correct and end_guessed and + last_guessed_type == last_correct_type): + in_correct = False + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + elif (end_correct != end_guessed or guessed_type != correct_type): + in_correct = False + + if start_correct and start_guessed and guessed_type == correct_type: + in_correct = True + + if start_correct: + counts.found_correct += 1 + counts.t_found_correct[correct_type] += 1 + if start_guessed: + counts.found_guessed += 1 + counts.t_found_guessed[guessed_type] += 1 + if first_item != options.boundary: + if correct == guessed and guessed_type == correct_type: + counts.correct_tags += 1 + counts.token_counter += 1 + + last_guessed = guessed + last_correct = correct + last_guessed_type = guessed_type + last_correct_type = correct_type + + if in_correct: + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % (100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, 
c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/roberta_wwm_ext/create_pretraining_data.py b/baselines/models/roberta_wwm_ext/create_pretraining_data.py new file mode 100644 index 0000000..5340d96 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/create_pretraining_data.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == 
max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
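+  # A hypothetical corpus file in this format (illustration only, not part
+  # of the original repo):
+  #
+  #   This is the first sentence of document one.
+  #   Here is its second sentence.
+  #
+  #   Document two starts after the blank line.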
+ for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
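+          # "Putting them back" works by rewinding the cursor `i` below, so
+          # the unused segments are re-read as the start of the next chunk
+          # on a later iteration of the enclosing while loop.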
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see the ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
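+    # Hypothetical illustration: with do_whole_word_mask enabled, tokens
+    # ["[CLS]", "un", "##aff", "##able", "[SEP]"] produce a single candidate
+    # index set [1, 2, 3], so the pieces of "unaffable" are either masked
+    # together or skipped together.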
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/extract_features.py b/baselines/models/roberta_wwm_ext/extract_features.py new file mode 100644 index 0000000..60e3830 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/extract_features.py @@ -0,0 +1,419 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract pre-computed feature vectors from BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import codecs +import collections +import json +import re + +import modeling +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "") + +flags.DEFINE_string("output_file", None, "") + +flags.DEFINE_string("layers", "-1,-2,-3,-4", "") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("master", None, + "If using a TPU, the address of the master.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_one_hot_embeddings", False, + "If True, tf.one_hot will be used for embedding lookups, otherwise " + "tf.nn.embedding_lookup will be used. On TPUs, this should be True " + "since it is much faster.") + + +class InputExample(object): + + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids + + +def input_fn_builder(features, seq_length): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] + + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
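For reference, the TFRecord-based approach this comment recommends might look like the sketch below. The feature names mirror this file's `input_fn_builder`, but the function itself is an illustration, not part of the patch:

```python
def tfrecord_input_fn_builder(input_file, seq_length):
  """Hypothetical TFRecord-based input_fn, sketched for illustration only."""
  name_to_features = {
      "unique_ids": tf.FixedLenFeature([], tf.int64),
      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
      "input_type_ids": tf.FixedLenFeature([seq_length], tf.int64),
  }

  def input_fn(params):
    d = tf.data.TFRecordDataset(input_file)
    # tf.Example only stores int64; BERT's own scripts cast back to int32
    # after parsing, which is omitted here for brevity.
    d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
    return d.batch(batch_size=params["batch_size"], drop_remainder=False)

  return input_fn
```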
+ d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) + + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d + + return input_fn + + +def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] + + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, + initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + all_layers = model.get_all_encoder_layers() + + predictions = { + "unique_id": unique_ids, + } + + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +def convert_examples_to_features(examples, seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. 
The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + input_type_ids = [] + tokens.append("[CLS]") + input_type_ids.append(0) + for token in tokens_a: + tokens.append(token) + input_type_ids.append(0) + tokens.append("[SEP]") + input_type_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + input_type_ids.append(1) + tokens.append("[SEP]") + input_type_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < seq_length: + input_ids.append(0) + input_mask.append(0) + input_type_ids.append(0) + + assert len(input_ids) == seq_length + assert len(input_mask) == seq_length + assert len(input_type_ids) == seq_length + + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (example.unique_id)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids])) + + features.append( + InputFeatures( + unique_id=example.unique_id, + tokens=tokens, + input_ids=input_ids, + input_mask=input_mask, + input_type_ids=input_type_ids)) + return features + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. 
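Before moving on to the truncation loop below, a worked toy example of the `[CLS]`/`[SEP]` layout described above (the strings are illustrative, not from any dataset):

```python
# Toy illustration (not part of the patch) of the feature layout above,
# before zero-padding up to seq_length.
tokens_a = ["is", "this", "jack"]
tokens_b = ["no", "it", "is"]
tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
input_type_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
input_mask = [1] * len(tokens)  # padding positions would get 0
assert len(tokens) == len(input_type_ids) == len(input_mask) == 9
```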
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def read_examples(input_file): + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + examples = read_examples(FLAGS.input_file) + + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
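The ` ||| ` delimiter parsed by `read_examples` above lets a single input line carry either one sentence or a sentence pair; a quick standalone illustration (the sample strings are made up):

```python
import re

for line in ["how are you ? ||| i am fine .", "a single sentence"]:
    m = re.match(r"^(.*) \|\|\| (.*)$", line)
    if m is None:
        print("single:", line)
    else:
        print("pair:", m.group(1), "|", m.group(2))
```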
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + predict_batch_size=FLAGS.batch_size) + + input_fn = input_fn_builder( + features=features, seq_length=FLAGS.max_seq_length) + + with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, + "w")) as writer: + for result in estimator.predict(input_fn, yield_single_examples=True): + unique_id = int(result["unique_id"]) + feature = unique_id_to_feature[unique_id] + output_json = collections.OrderedDict() + output_json["linex_index"] = unique_id + all_features = [] + for (i, token) in enumerate(feature.tokens): + all_layers = [] + for (j, layer_index) in enumerate(layer_indexes): + layer_output = result["layer_output_%d" % j] + layers = collections.OrderedDict() + layers["index"] = layer_index + layers["values"] = [ + round(float(x), 6) for x in layer_output[i:(i + 1)].flat + ] + all_layers.append(layers) + features = collections.OrderedDict() + features["token"] = token + features["layers"] = all_layers + all_features.append(features) + output_json["features"] = all_features + writer.write(json.dumps(output_json) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("init_checkpoint") + flags.mark_flag_as_required("output_file") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/modeling.py b/baselines/models/roberta_wwm_ext/modeling.py new file mode 100644 index 0000000..fed5259 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/modeling.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. 
+ hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. 
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. 
+ """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
+ initializer_range: float. Range of the weight initialization. + max_position_embeddings: int. Maximum sequence length that might ever be + used with this model. This can be longer than the sequence length of + input_tensor, but cannot be shorter. + dropout_prob: float. Dropout probability applied to the final output tensor. + + Returns: + float tensor with same shape as `input_tensor`. + + Raises: + ValueError: One of the tensor shapes or input values is invalid. + """ + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + width = input_shape[2] + + output = input_tensor + + if use_token_type: + if token_type_ids is None: + raise ValueError("`token_type_ids` must be specified if" + "`use_token_type` is True.") + token_type_table = tf.get_variable( + name=token_type_embedding_name, + shape=[token_type_vocab_size, width], + initializer=create_initializer(initializer_range)) + # This vocab will be small so we always do one-hot here, since it is always + # faster for a small vocabulary. + flat_token_type_ids = tf.reshape(token_type_ids, [-1]) + one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size) + token_type_embeddings = tf.matmul(one_hot_ids, token_type_table) + token_type_embeddings = tf.reshape(token_type_embeddings, + [batch_size, seq_length, width]) + output += token_type_embeddings + + if use_position_embeddings: + assert_op = tf.assert_less_equal(seq_length, max_position_embeddings) + with tf.control_dependencies([assert_op]): + full_position_embeddings = tf.get_variable( + name=position_embedding_name, + shape=[max_position_embeddings, width], + initializer=create_initializer(initializer_range)) + # Since the position embedding table is a learned variable, we create it + # using a (long) sequence length `max_position_embeddings`. The actual + # sequence length might be shorter than this, for faster training of + # tasks that do not have long sequences. + # + # So `full_position_embeddings` is effectively an embedding table + # for position [0, 1, 2, ..., max_position_embeddings-1], and the current + # sequence has positions [0, 1, 2, ... seq_length-1], so we can just + # perform a slice. + position_embeddings = tf.slice(full_position_embeddings, [0, 0], + [seq_length, -1]) + num_dims = len(output.shape.as_list()) + + # Only the last two dimensions are relevant (`seq_length` and `width`), so + # we broadcast among the first dimensions, which is typically just + # the batch size. + position_broadcast_shape = [] + for _ in range(num_dims - 2): + position_broadcast_shape.append(1) + position_broadcast_shape.extend([seq_length, width]) + position_embeddings = tf.reshape(position_embeddings, + position_broadcast_shape) + output += position_embeddings + + output = layer_norm_and_dropout(output, dropout_prob) + return output + + +def create_attention_mask_from_input_mask(from_tensor, to_mask): + """Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. 
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
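The -10000.0 additive-mask trick described in the comment above is easy to verify numerically; a standalone sketch with toy values (not part of the patch):

```python
import numpy as np

scores = np.array([2.0, 1.0, 3.0])
mask = np.array([1.0, 1.0, 0.0])  # last position is padding
adder = (1.0 - mask) * -10000.0
shifted = scores + adder
probs = np.exp(shifted) / np.exp(shifted).sum()
print(probs.round(3))  # -> [0.731 0.269 0.   ]: the masked slot gets ~0 weight
```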
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+ + Args: + tensor: A tf.Tensor object to find the shape of. + expected_rank: (optional) int. The expected rank of `tensor`. If this is + specified and the `tensor` has a different rank, and exception will be + thrown. + name: Optional name of the tensor for the error message. + + Returns: + A list of dimensions of the shape of tensor. All static dimensions will + be returned as python integers, and dynamic dimensions will be returned + as tf.Tensor scalars. + """ + if name is None: + name = tensor.name + + if expected_rank is not None: + assert_rank(tensor, expected_rank, name) + + shape = tensor.shape.as_list() + + non_static_indexes = [] + for (index, dim) in enumerate(shape): + if dim is None: + non_static_indexes.append(index) + + if not non_static_indexes: + return shape + + dyn_shape = tf.shape(tensor) + for index in non_static_indexes: + shape[index] = dyn_shape[index] + return shape + + +def reshape_to_matrix(input_tensor): + """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix).""" + ndims = input_tensor.shape.ndims + if ndims < 2: + raise ValueError("Input tensor must have at least rank 2. Shape = %s" % + (input_tensor.shape)) + if ndims == 2: + return input_tensor + + width = input_tensor.shape[-1] + output_tensor = tf.reshape(input_tensor, [-1, width]) + return output_tensor + + +def reshape_from_matrix(output_tensor, orig_shape_list): + """Reshapes a rank 2 tensor back to its original rank >= 2 tensor.""" + if len(orig_shape_list) == 2: + return output_tensor + + output_shape = get_shape_list(output_tensor) + + orig_dims = orig_shape_list[0:-1] + width = output_shape[-1] + + return tf.reshape(output_tensor, orig_dims + [width]) + + +def assert_rank(tensor, expected_rank, name=None): + """Raises an exception if the tensor rank is not of the expected rank. + + Args: + tensor: A tf.Tensor to check the rank of. + expected_rank: Python integer or list of integers, expected rank. + name: Optional name of the tensor for the error message. + + Raises: + ValueError: If the expected shape doesn't match the actual shape. + """ + if name is None: + name = tensor.name + + expected_rank_dict = {} + if isinstance(expected_rank, six.integer_types): + expected_rank_dict[expected_rank] = True + else: + for x in expected_rank: + expected_rank_dict[x] = True + + actual_rank = tensor.shape.ndims + if actual_rank not in expected_rank_dict: + scope_name = tf.get_variable_scope().name + raise ValueError( + "For the tensor `%s` in scope `%s`, the actual rank " + "`%d` (shape = %s) is not equal to the expected rank `%s`" % + (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank))) diff --git a/baselines/models/roberta_wwm_ext/modeling_test.py b/baselines/models/roberta_wwm_ext/modeling_test.py new file mode 100644 index 0000000..817ad2d --- /dev/null +++ b/baselines/models/roberta_wwm_ext/modeling_test.py @@ -0,0 +1,277 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
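Since `modeling.py` closes with its shape utilities just above, a brief usage sketch of `get_shape_list` may help (the placeholder shape is an illustrative assumption):

```python
import tensorflow as tf
import modeling

# Batch dimension is unknown at graph-construction time; the rest is static.
x = tf.placeholder(tf.float32, shape=[None, 128, 768])
shape = modeling.get_shape_list(x, expected_rank=3)
# shape[0] is a scalar tf.Tensor (dynamic batch); shape[1:] == [128, 768] as
# plain Python ints, so both can be fed straight into tf.reshape.
```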
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +import modeling +import six +import tensorflow as tf + + +class BertModelTest(tf.test.TestCase): + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) + + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def run_tester(self, tester): + with 
self.test_session() as sess: + ops = tester.create_model() + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + output_result = sess.run(ops) + tester.check_output(output_result) + + self.assert_all_tensors_reachable(sess, [init_op, ops]) + + @classmethod + def ids_tensor(cls, shape, vocab_size, rng=None, name=None): + """Creates a random int32 tensor of the shape within the vocab size.""" + if rng is None: + rng = random.Random() + + total_dims = 1 + for dim in shape: + total_dims *= dim + + values = [] + for _ in range(total_dims): + values.append(rng.randint(0, vocab_size - 1)) + + return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name) + + def assert_all_tensors_reachable(self, sess, outputs): + """Checks that all the tensors in the graph are reachable from outputs.""" + graph = sess.graph + + ignore_strings = [ + "^.*/assert_less_equal/.*$", + "^.*/dilation_rate$", + "^.*/Tensordot/concat$", + "^.*/Tensordot/concat/axis$", + "^testing/.*$", + ] + + ignore_regexes = [re.compile(x) for x in ignore_strings] + + unreachable = self.get_unreachable_ops(graph, outputs) + filtered_unreachable = [] + for x in unreachable: + do_ignore = False + for r in ignore_regexes: + m = r.match(x.name) + if m is not None: + do_ignore = True + if do_ignore: + continue + filtered_unreachable.append(x) + unreachable = filtered_unreachable + + self.assertEqual( + len(unreachable), 0, "The following ops are unreachable: %s" % + (" ".join([x.name for x in unreachable]))) + + @classmethod + def get_unreachable_ops(cls, graph, outputs): + """Finds all of the tensors in graph that are unreachable from outputs.""" + outputs = cls.flatten_recursive(outputs) + output_to_op = collections.defaultdict(list) + op_to_all = collections.defaultdict(list) + assign_out_to_in = collections.defaultdict(list) + + for op in graph.get_operations(): + for x in op.inputs: + op_to_all[op.name].append(x.name) + for y in op.outputs: + output_to_op[y.name].append(op.name) + op_to_all[op.name].append(y.name) + if str(op.type) == "Assign": + for y in op.outputs: + for x in op.inputs: + assign_out_to_in[y.name].append(x.name) + + assign_groups = collections.defaultdict(list) + for out_name in assign_out_to_in.keys(): + name_group = assign_out_to_in[out_name] + for n1 in name_group: + assign_groups[n1].append(out_name) + for n2 in name_group: + if n1 != n2: + assign_groups[n1].append(n2) + + seen_tensors = {} + stack = [x.name for x in outputs] + while stack: + name = stack.pop() + if name in seen_tensors: + continue + seen_tensors[name] = True + + if name in output_to_op: + for op_name in output_to_op[name]: + if op_name in op_to_all: + for input_name in op_to_all[op_name]: + if input_name not in stack: + stack.append(input_name) + + expanded_names = [] + if name in assign_groups: + for assign_name in assign_groups[name]: + expanded_names.append(assign_name) + + for expanded_name in expanded_names: + if expanded_name not in stack: + stack.append(expanded_name) + + unreachable_ops = [] + for op in graph.get_operations(): + is_unreachable = False + all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs] + for name in all_names: + if name not in seen_tensors: + is_unreachable = True + if is_unreachable: + unreachable_ops.append(op) + return unreachable_ops + + @classmethod + def flatten_recursive(cls, item): + """Flattens (potentially nested) a tuple/dictionary/list to a list.""" + output = [] + if isinstance(item, list): + output.extend(item) + elif 
isinstance(item, tuple): + output.extend(list(item)) + elif isinstance(item, dict): + for (_, v) in six.iteritems(item): + output.append(v) + else: + return [item] + + flat_output = [] + for x in output: + flat_output.extend(cls.flatten_recursive(x)) + return flat_output + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/roberta_wwm_ext/multilingual.md b/baselines/models/roberta_wwm_ext/multilingual.md new file mode 100644 index 0000000..3b38379 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/multilingual.md @@ -0,0 +1,303 @@ +## Models + +There are two multilingual models currently available. We do not plan to release +more single-language models, but we may release `BERT-Large` versions of these +two in the future: + +* **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**: 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**: 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters +* **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M + parameters + +**The `Multilingual Cased (New)` model also fixes normalization issues in many +languages, so it is recommended in languages with non-Latin alphabets (and is +often better for most languages with Latin alphabets). When using this model, +make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other +scripts.** + +See the [list of languages](#list-of-languages) that the Multilingual model +supports. The Multilingual model does include Chinese (and English), but if your +fine-tuning data is Chinese-only, then the Chinese model will likely produce +better results. + +## Results + +To evaluate these systems, we use the +[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a +version of [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the +dev and test sets have been translated (by humans) into 15 languages. Note that +the training set was *machine* translated (we used the translations provided by +XNLI, not Google NMT). For clarity, we only report on 6 languages below: + + + +| System | English | Chinese | Spanish | German | Arabic | Urdu | +| --------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- | +| XNLI Baseline - Translate Train | 73.7 | 67.0 | 68.8 | 66.5 | 65.8 | 56.6 | +| XNLI Baseline - Translate Test | 73.7 | 68.3 | 70.7 | 68.7 | 66.8 | 59.3 | +| BERT - Translate Train Cased | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6 | +| BERT - Translate Train Uncased | 81.4 | 74.2 | 77.3 | 75.2 | 70.5 | 61.7 | +| BERT - Translate Test Uncased | 81.4 | 70.1 | 74.9 | 74.4 | 70.4 | **62.1** | +| BERT - Zero Shot Uncased | 81.4 | 63.8 | 74.3 | 70.5 | 62.1 | 58.3 | + + + +The first two rows are baselines from the XNLI paper and the last four rows are +our results with BERT. + +**Translate Train** means that the MultiNLI training set was machine translated +from English into the foreign language. So training and evaluation were both +done in the foreign language.
Unfortunately, training was done on +machine-translated data, so it is impossible to quantify how much of the lower +accuracy (compared to English) is due to the quality of the machine translation +vs. the quality of the pre-trained model. + +**Translate Test** means that the XNLI test set was machine translated from the +foreign language into English. So training and evaluation were both done on +English. However, test evaluation was done on machine-translated English, so the +accuracy depends on the quality of the machine translation system. + +**Zero Shot** means that the Multilingual BERT system was fine-tuned on English +MultiNLI, and then evaluated on the foreign language XNLI test. In this case, +machine translation was not involved at all in either the pre-training or +fine-tuning. + +Note that the English result is worse than the 84.2 MultiNLI baseline because +this training used Multilingual BERT rather than English-only BERT. This implies +that for high-resource languages, the Multilingual model is somewhat worse than +a single-language model. However, it is not feasible for us to train and +maintain dozens of single-language models. Therefore, if your goal is to maximize +performance with a language other than English or Chinese, you might find it +beneficial to run pre-training for additional steps starting from our +Multilingual model on data from your language of interest. + +Here is a comparison of training Chinese models with the Multilingual +`BERT-Base` and Chinese-only `BERT-Base`: + +System | Chinese +----------------------- | ------- +XNLI Baseline | 67.0 +BERT Multilingual Model | 74.2 +BERT Chinese-only Model | 77.2 + +Similar to English, the single-language model does 3% better than the +Multilingual model. + +## Fine-tuning Example + +The multilingual model does **not** require any special consideration or API +changes. We did update the implementation of `BasicTokenizer` in +`tokenization.py` to support Chinese character tokenization, so please update if +you forked it. However, we did not change the tokenization API. + +To test the new models, we did modify `run_classifier.py` to add support for the +[XNLI dataset](https://github.com/facebookresearch/XNLI). This is a 15-language +version of MultiNLI where the dev/test sets have been human-translated, and the +training set has been machine-translated. + +To run the fine-tuning code, please download the +[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the +[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip) +and then unpack both .zip files into some directory `$XNLI_DIR`. + +Note that fine-tuning runs on XNLI with the language hard-coded into `run_classifier.py` +(Chinese by default), so please modify `XnliProcessor` if you want to run on +another language. + +This is a large dataset, so training will take a few hours on a GPU +(or about 30 minutes on a Cloud TPU). To run an experiment quickly for +debugging, just set `num_train_epochs` to a small value like `0.1`.
+ +```shell +export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12 +export XNLI_DIR=/path/to/xnli + +python run_classifier.py \ + --task_name=XNLI \ + --do_train=true \ + --do_eval=true \ + --data_dir=$XNLI_DIR \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=5e-5 \ + --num_train_epochs=2.0 \ + --output_dir=/tmp/xnli_output/ +``` + +With the Chinese-only model, the results should look something like this: + +``` + ***** Eval results ***** +eval_accuracy = 0.774116 +eval_loss = 0.83554 +global_step = 24543 +loss = 0.74603 +``` + +## Details + +### Data Source and Sampling + +The languages chosen were the +[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias). +The entire Wikipedia dump for each language (excluding user and talk pages) was +taken as the training data for each language. + +However, the size of the Wikipedia for a given language varies greatly, and +therefore low-resource languages may be "under-represented" in terms of the +neural network model (under the assumption that languages are "competing" for +limited model capacity to some extent). At the same time, we also don't want +to overfit the model by performing thousands of epochs over a tiny Wikipedia +for a particular language. + +To balance these two factors, we performed exponentially smoothed weighting of +the data during pre-training data creation (and WordPiece vocab creation). In +other words, let's say that the probability of a language is *P(L)*, e.g., +*P(English) = 0.21* means that after concatenating all of the Wikipedias +together, 21% of our data is English. We exponentiate each probability by some +factor *S* and then re-normalize, and sample from that distribution. In our case +we use *S=0.7*. So, high-resource languages like English will be under-sampled, +and low-resource languages like Icelandic will be over-sampled. E.g., in the +original distribution English would be sampled 1000x more than Icelandic, but +after smoothing it's only sampled 100x more. + +### Tokenization + +For tokenization, we use a 110k shared WordPiece vocabulary. The word counts are +weighted the same way as the data, so low-resource languages are upweighted by +some factor. We intentionally do *not* use any marker to denote the input +language (so that zero-shot training can work). + +Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace +characters, we add spaces around every character in the +[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\)) +before applying WordPiece. This means that Chinese is effectively +character-tokenized. Note that the CJK Unicode block only includes +Chinese-origin characters and does *not* include Hangul Korean or +Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like +all other languages. + +For all other languages, we apply the +[same recipe as English](https://github.com/google-research/bert#tokenization): +(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace +tokenization. We understand that accent markers have substantial meaning in some +languages, but felt that the benefits of reducing the effective vocabulary make +up for this.
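+
+As a rough sketch (an illustration, not the exact library code), the
+lower-casing and accent-removal step can be reproduced with Unicode NFD
+decomposition, in the spirit of the `_run_strip_accents` helper in
+`tokenization.py`:
+
+```python
+import unicodedata
+
+def lower_and_strip_accents(text):
+  # NFD splits each accented character into a base character plus combining
+  # marks; dropping category "Mn" (nonspacing marks) removes the accents.
+  text = unicodedata.normalize("NFD", text.lower())
+  return "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
+
+print(lower_and_strip_accents(u"Naïve Café"))  # -> naive cafe
+```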
Generally the strong contextual models of BERT should make up for +any ambiguity introduced by stripping accent markers. + +### List of Languages + +The multilingual model supports the following languages. These languages were +chosen because they are the top 100 languages with the largest Wikipedias: + +* Afrikaans +* Albanian +* Arabic +* Aragonese +* Armenian +* Asturian +* Azerbaijani +* Bashkir +* Basque +* Bavarian +* Belarusian +* Bengali +* Bishnupriya Manipuri +* Bosnian +* Breton +* Bulgarian +* Burmese +* Catalan +* Cebuano +* Chechen +* Chinese (Simplified) +* Chinese (Traditional) +* Chuvash +* Croatian +* Czech +* Danish +* Dutch +* English +* Estonian +* Finnish +* French +* Galician +* Georgian +* German +* Greek +* Gujarati +* Haitian +* Hebrew +* Hindi +* Hungarian +* Icelandic +* Ido +* Indonesian +* Irish +* Italian +* Japanese +* Javanese +* Kannada +* Kazakh +* Kirghiz +* Korean +* Latin +* Latvian +* Lithuanian +* Lombard +* Low Saxon +* Luxembourgish +* Macedonian +* Malagasy +* Malay +* Malayalam +* Marathi +* Minangkabau +* Nepali +* Newar +* Norwegian (Bokmal) +* Norwegian (Nynorsk) +* Occitan +* Persian (Farsi) +* Piedmontese +* Polish +* Portuguese +* Punjabi +* Romanian +* Russian +* Scots +* Serbian +* Serbo-Croatian +* Sicilian +* Slovak +* Slovenian +* South Azerbaijani +* Spanish +* Sundanese +* Swahili +* Swedish +* Tagalog +* Tajik +* Tamil +* Tatar +* Telugu +* Turkish +* Ukrainian +* Urdu +* Uzbek +* Vietnamese +* Volapük +* Waray-Waray +* Welsh +* West Frisian +* Western Punjabi +* Yoruba + +The **Multilingual Cased (New)** release contains additionally **Thai** and +**Mongolian**, which were not included in the original release. diff --git a/baselines/models/roberta_wwm_ext/optimization.py b/baselines/models/roberta_wwm_ext/optimization.py new file mode 100644 index 0000000..d33dabd --- /dev/null +++ b/baselines/models/roberta_wwm_ext/optimization.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
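+  # For example (illustrative numbers, not values used in this repo), with
+  # init_lr=5e-5, num_warmup_steps=1000 and num_train_steps=10000:
+  #   step  100 -> lr = (100/1000) * 5e-5       = 5e-6  (warmup ramp)
+  #   step 2000 -> lr = (1 - 2000/10000) * 5e-5 = 4e-5  (linear decay)
+  #   step 9000 -> lr = (1 - 9000/10000) * 5e-5 = 5e-6  (near end of decay)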
+ if num_warmup_steps: + global_steps_int = tf.cast(global_step, tf.int32) + warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32) + + global_steps_float = tf.cast(global_steps_int, tf.float32) + warmup_steps_float = tf.cast(warmup_steps_int, tf.float32) + + warmup_percent_done = global_steps_float / warmup_steps_float + warmup_learning_rate = init_lr * warmup_percent_done + + is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32) + learning_rate = ( + (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate) + + # It is recommended that you use this optimizer for fine tuning, since this + # is how the model was trained (note that the Adam m/v variables are NOT + # loaded from init_checkpoint.) + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + + if use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + + # This is how the model was pre-trained. + (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0) + + train_op = optimizer.apply_gradients( + zip(grads, tvars), global_step=global_step) + + # Normally the global step update is done inside of `apply_gradients`. + # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use + # a different optimizer, you should probably take this line out. + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + return train_op + + +class AdamWeightDecayOptimizer(tf.train.Optimizer): + """A basic Adam optimizer that includes "correct" L2 weight decay.""" + + def __init__(self, + learning_rate, + weight_decay_rate=0.0, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=None, + name="AdamWeightDecayOptimizer"): + """Constructs an AdamWeightDecayOptimizer.""" + super(AdamWeightDecayOptimizer, self).__init__(False, name) + + self.learning_rate = learning_rate + self.weight_decay_rate = weight_decay_rate + self.beta_1 = beta_1 + self.beta_2 = beta_2 + self.epsilon = epsilon + self.exclude_from_weight_decay = exclude_from_weight_decay + + def apply_gradients(self, grads_and_vars, global_step=None, name=None): + """See base class.""" + assignments = [] + for (grad, param) in grads_and_vars: + if grad is None or param is None: + continue + + param_name = self._get_variable_name(param.name) + + m = tf.get_variable( + name=param_name + "/adam_m", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + v = tf.get_variable( + name=param_name + "/adam_v", + shape=param.shape.as_list(), + dtype=tf.float32, + trainable=False, + initializer=tf.zeros_initializer()) + + # Standard Adam update. + next_m = ( + tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad)) + next_v = ( + tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2, + tf.square(grad))) + + update = next_m / (tf.sqrt(next_v) + self.epsilon) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD.
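+      #
+      # Concretely, the effective update applied below is
+      #   param <- param - lr * (m / (sqrt(v) + eps) + weight_decay_rate * param)
+      # i.e. the decay term bypasses the Adam moment estimates entirely
+      # (this is the "decoupled weight decay" of AdamW).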
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/roberta_wwm_ext/optimization_test.py b/baselines/models/roberta_wwm_ext/optimization_test.py new file mode 100644 index 0000000..4f2dcf1 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/optimization_test.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import optimization +import tensorflow as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/roberta_wwm_ext/requirements.txt b/baselines/models/roberta_wwm_ext/requirements.txt new file mode 100644 index 0000000..357b5ea --- /dev/null +++ b/baselines/models/roberta_wwm_ext/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 1.11.0 # CPU Version of TensorFlow. +# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. diff --git a/baselines/models/roberta_wwm_ext/run_classifier.py b/baselines/models/roberta_wwm_ext/run_classifier.py new file mode 100644 index 0000000..ca6d89d --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier.py @@ -0,0 +1,1540 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-09 23:01:09 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +# Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +# Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. 
If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = f.readlines() + lines = [] + for line in reader: + lines.append(line.strip().split("_!_")) + return lines + +class InewsProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, 
data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = ["0", "1", "2"] + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + #if set_type == "test": + # label = "0" + #else: + # label = tokenization.convert_to_unicode(line[0]) + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
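+  # E.g. with max_seq_length=8 and tokens [CLS] A B [SEP] (ids shown
+  # symbolically), the padded features are:
+  #   input_ids   = [CLS, A, B, SEP, 0, 0, 0, 0]
+  #   input_mask  = [  1, 1, 1,   1, 0, 0, 0, 0]
+  #   segment_ids = [  0, 0, 0,   0, 0, 0, 0, 0]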
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) -1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num+1)*extra_len, len(tokens_b)) + tokens_b_sub = tokens_b[num*extra_len: max_len] + feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + else: + feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example) + feature_list.append(feature) + return feature_list + + +def file_based_convert_examples_to_features_for_inews( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + num_example = 0 + for (ex_index, example) in enumerate(examples): + if ex_index % 1000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature_list = convert_example_list_for_inews(ex_index, example, label_list, + max_seq_length, tokenizer) + num_example += len(feature_list) + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + for feature in feature_list: + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + tf.logging.info("feature num: %s", num_example) + writer.close() + +class TnewsProcessor(DataProcessor): + 
"""Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(17): + if i == 5 or i == 11: + continue + labels.append(str(100 + i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + #if set_type == "test": + # label = "0" + #else: + # label = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class THUCNewsProcessor(DataProcessor): + """Processor for the THUCNews data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(14): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 3: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class iFLYTEKDataProcessor(DataProcessor): + """Processor for the iFLYTEKData data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + 
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "train.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[2]) + if label == tokenization.convert_to_unicode("contradictory"): + label = tokenization.convert_to_unicode("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_test_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "test.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + +class LCQMCProcessor(DataProcessor): + """Processor for the internal data set. 
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class BQProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, 
tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
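+    # (`_truncate_seq_pair` pops one token at a time from whichever sequence
+    # is currently longer, so the length budget is split adaptively rather
+    # than evenly.)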
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = 
create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
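+  #
+  # Shape-wise: get_pooled_output() is [batch_size, hidden_size] (a transform
+  # of the [CLS] vector), while get_sequence_output() is
+  # [batch_size, seq_length, hidden_size].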
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, 
label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + "tnews": TnewsProcessor, + "inews": InewsProcessor, + "lcqmc": LCQMCProcessor, + "bq": BQProcessor, + "thucnews":THUCNewsProcessor, + "iflydata": iFLYTEKDataProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + ## dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
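+    # To make the padding arithmetic concrete, a small sketch; the helper
+    # name and the numbers are illustrative, not part of the original run.
+    # With 1,003 real examples and eval_batch_size=8, five
+    # PaddingInputExamples are appended so that 1,008 % 8 == 0, and eval
+    # then runs for 1008 // 8 = 126 steps.
+    def padded_eval_steps(num_examples, batch_size):
+      """Returns (num_padding, eval_steps) for a fixed TPU batch size."""
+      num_padding = (-num_examples) % batch_size
+      return num_padding, (num_examples + num_padding) // batch_size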
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #####################################################################
+    # Evaluate all checkpoints in output_dir; you can then use the
+    # checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-len(".index")]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir,
+                                    "dev_results_roberta_wwm_ext.txt")
+    tf.logging.info("output_eval_file: " + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in steps_and_files:
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps,
+                                    checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #####################################################################
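+
+    # A minimal sketch of then selecting the strongest checkpoint from such
+    # a sweep; `results_by_ckpt` is a hypothetical dict mapping checkpoint
+    # paths to the metric dicts returned by estimator.evaluate above:
+    def pick_best_checkpoint(results_by_ckpt, metric="eval_accuracy"):
+      """Returns the checkpoint path with the highest `metric` value."""
+      return max(results_by_ckpt.items(), key=lambda kv: kv[1][metric])[0]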
+
+    ## test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation on the test set *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #####################################################################
+    # Evaluate all checkpoints on the test set as well; you can then
+    # report the checkpoint with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-len(".index")]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir,
+                                    "test_results_roberta_wwm_ext.txt")
+    tf.logging.info("output_eval_file: " + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in steps_and_files:
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps,
+                                    checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #####################################################################
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
+ while len(predict_examples) % FLAGS.predict_batch_size != 0: + predict_examples.append(PaddingInputExample()) + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + else: + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(predict_examples), num_actual_predict_examples, + len(predict_examples) - num_actual_predict_examples) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + num_written_lines = 0 + tf.logging.info("***** Predict results *****") + for (i, prediction) in enumerate(result): + probabilities = prediction["probabilities"] + if i >= num_actual_predict_examples: + break + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + num_written_lines += 1 + assert num_written_lines == num_actual_predict_examples + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/run_classifier_bq.sh b/baselines/models/roberta_wwm_ext/run_classifier_bq.sh new file mode 100644 index 0000000..c8e630a --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_bq.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:56:53 + +TASK_NAME="bq" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/bq.zip + unzip bq.zip + rm bq.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! 
-f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_iflydata.sh b/baselines/models/roberta_wwm_ext/run_classifier_iflydata.sh new file mode 100644 index 0000000..a343a03 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_iflydata.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:55:49 + +TASK_NAME="iflydata" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_inews.sh b/baselines/models/roberta_wwm_ext/run_classifier_inews.sh new file mode 100644 index 0000000..02c114f --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_inews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:55:54 + +TASK_NAME="inews" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_lcqmc.sh b/baselines/models/roberta_wwm_ext/run_classifier_lcqmc.sh new file mode 100644 index 0000000..2f9fe9c --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_lcqmc.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:55:59 + +TASK_NAME="lcqmc" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_thucnews.sh b/baselines/models/roberta_wwm_ext/run_classifier_thucnews.sh new file mode 100644 index 0000000..2ba7282 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_thucnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:56:04 + +TASK_NAME="thucnews" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_tnews.sh b/baselines/models/roberta_wwm_ext/run_classifier_tnews.sh new file mode 100644 index 0000000..0179103 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_tnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:56:08 + +TASK_NAME="tnews" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_classifier_with_tfhub.py b/baselines/models/roberta_wwm_ext/run_classifier_with_tfhub.py new file mode 100644 index 0000000..9d2f80f --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_with_tfhub.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner with TF-Hub.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import optimization +import run_classifier +import tokenization +import tensorflow as tf +import tensorflow_hub as hub + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "bert_hub_module_handle", None, + "Handle for the BERT TF-Hub module.") + + +def create_model(is_training, input_ids, input_mask, segment_ids, labels, + num_labels, bert_hub_module_handle): + """Creates a classification model.""" + tags = set() + if is_training: + tags.add("train") + bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) + bert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + bert_outputs = bert_module( + inputs=bert_inputs, + signature="tokens", + as_dict=True) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use + # bert_outputs["sequence_output"] instead. + output_layer = bert_outputs["pooled_output"] + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(num_labels, learning_rate, num_train_steps, + num_warmup_steps, use_tpu, bert_hub_module_handle): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, + bert_hub_module_handle) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + 
loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics) + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions={"probabilities": probabilities}) + else: + raise ValueError( + "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def create_tokenizer_from_hub_module(bert_hub_module_handle): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + bert_module = hub.Module(bert_hub_module_handle) + tokenization_info = bert_module(signature="tokenization_info", as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + return tokenization.FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": run_classifier.ColaProcessor, + "mnli": run_classifier.MnliProcessor, + "mrpc": run_classifier.MrpcProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + num_labels=len(label_list), + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + bert_hub_module_handle=FLAGS.bert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_features = run_classifier.convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = run_classifier.input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_features = run_classifier.convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. + if FLAGS.use_tpu: + # Eval will be slightly WRONG on the TPU because it will truncate + # the last batch. + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = run_classifier.input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + if FLAGS.use_tpu: + # Discard batch remainder if running on TPU + n = len(predict_examples) + predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)] + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + run_classifier.file_based_convert_examples_to_features( + predict_examples, label_list, FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = run_classifier.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=FLAGS.use_tpu) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + tf.logging.info("***** Predict results *****") + for prediction in result: + probabilities = prediction["probabilities"] + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + 
flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("bert_hub_module_handle") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/run_classifier_xnli.sh b/baselines/models/roberta_wwm_ext/run_classifier_xnli.sh new file mode 100644 index 0000000..78cd4d7 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_classifier_xnli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:56:17 + +TASK_NAME="xnli" +MODEL_NAME="chinese_roberta_wwm_ext_L-12_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_DIR ]; then + mkdir -p $ROBERTA_WWM_DIR + echo "makedir $ROBERTA_WWM_DIR" +fi +cd $ROBERTA_WWM_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + unzip chinese_roberta_wwm_ext_L-12_H-768_A-12.zip + rm chinese_roberta_wwm_ext_L-12_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_ner.py b/baselines/models/roberta_wwm_ext/run_ner.py new file mode 100644 index 0000000..fb74517 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_ner.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import modeling +import optimization +import tokenization +import tensorflow as tf +from sklearn.metrics import f1_score, precision_score, recall_score +from tensorflow.python.ops import math_ops +import tf_metrics +import pickle +import codecs +import sys + +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "data_dir", None, + "The input datadir.", +) + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model." +) + +flags.DEFINE_string( + "task_name", None, "The name of the task to train." +) + +flags.DEFINE_string( + "token_name", "full", "The name of the task to train." +) + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written." +) + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model)." +) + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text." +) + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization." +) + +flags.DEFINE_bool( + "do_train", False, + "Whether to run training." +) +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % 
(set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + tokens = contends.split() + if len(tokens) == 2: + words.append(tokens[0]) + label = tokens[-1] + if label[0] == 'B': + label = "I" + label[1:] + labels.append(label) + else: + if len(contends) == 0 and len(words) > 0: + label = [] + word = [] + for l, w in zip(labels, words): + if len(l) > 0 and len(w) > 0: + label.append(l) + # self.labels.add(l) + word.append(w) + lines.append([' '.join(label), ' '.join(word)]) + words = [] + labels = [] + continue + if contends.startswith("-DOCSTART-"): + continue + + return lines + +class MsraNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "train1.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "testright1.txt")), "test") + + def get_labels(self): + return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION', 'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + + def _read_raw(self, input_file): + with codecs.open(input_file, 'r', encoding='utf-8') as f: + lines = [] + chars = [] + labels = [] + len_count = [] + for line in f: + contends = line.strip() + tokens = contends.split() + for token in tokens: + word, label = token.split('/') + + if label == "nr": + chars = chars + list(word) + labels = labels + ['B-PERSON'] + ['I-PERSON']*(len(word)-1) + elif label == "ns": + chars = chars + list(word) + labels = labels + ['B-LOCATION'] + ['I-LOCATION']*(len(word)-1) + elif label == "nt": + chars = chars + list(word) + labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION']*(len(word)-1) + else: + assert label == "o" + chars = chars + list(word) + labels = labels + ["O"] * len(word) + lines.append([' '.join(labels), ' '.join(chars)]) + len_count.append(len(chars)) + chars = [] + labels = [] + return lines + + +def write_tokens(tokens, mode): + if mode == "test": + path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt") + wf = open(path, 'a') + for token in tokens: + if token != "**NULL**": + wf.write(token + '\n') + wf.close() + + +def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer, output_dir, mode): + label_map = {} + for (i, label) in enumerate(label_list, 1): + label_map[label] = i + + if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')): + with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w: + pickle.dump(label_map, w) + textlist = example.text.split(' ') + labellist = example.label.split(' ') + tokens = [] + labels = [] + label_mask = [] + for i, word in enumerate(textlist): + token = tokenizer.tokenize(word) + 
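+
+
+# For intuition: convert_single_example below labels only the first WordPiece
+# of each word and marks the remaining pieces "X", which label_mask then
+# excludes from the loss. A minimal sketch of that alignment (the helper name
+# is illustrative; any BERT-style tokenizer with a tokenize() method works):
+def _align_labels(words, labels, tokenizer):
+  """Returns (sub_tokens, sub_labels) with "X" on non-initial pieces."""
+  sub_tokens, sub_labels = [], []
+  for word, label in zip(words, labels):
+    pieces = tokenizer.tokenize(word)
+    sub_tokens.extend(pieces)
+    sub_labels.extend([label] + ["X"] * (len(pieces) - 1))
+  return sub_tokens, sub_labels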
+
+
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer, output_dir, mode):
+  label_map = {}
+  # Label ids start at 1; 0 is reserved for padding positions.
+  for (i, label) in enumerate(label_list, 1):
+    label_map[label] = i
+
+  if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
+    with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
+      pickle.dump(label_map, w)
+  textlist = example.text.split(' ')
+  labellist = example.label.split(' ')
+  tokens = []
+  labels = []
+  label_mask = []
+  for i, word in enumerate(textlist):
+    token = tokenizer.tokenize(word)
+    tokens.extend(token)
+    label_1 = labellist[i]
+    for m in range(len(token)):
+      if m == 0:
+        labels.append(label_1)
+      else:
+        labels.append("X")
+
+  # Account for [CLS] and [SEP].
+  if len(tokens) >= max_seq_length - 1:
+    tokens = tokens[0:(max_seq_length - 2)]
+    labels = labels[0:(max_seq_length - 2)]
+  ntokens = []
+  segment_ids = []
+  label_ids = []
+  ntokens.append("[CLS]")
+  segment_ids.append(0)
+  # Whether to label this position "O" or "[CLS]" is an open choice; "[CLS]"
+  # is used here, and label_mask excludes it from training and prediction.
+  label_ids.append(label_map["[CLS]"])
+  label_mask.append(0)
+  for i, token in enumerate(tokens):
+    ntokens.append(token)
+    segment_ids.append(0)
+    label_ids.append(label_map[labels[i]])
+    if labels[i] == 'X':
+      label_mask.append(0)
+    else:
+      label_mask.append(1)
+  ntokens.append("[SEP]")
+  segment_ids.append(0)
+  label_mask.append(0)
+  # Likewise "[SEP]" gets a real label id but is masked out of the loss.
+  label_ids.append(label_map["[SEP]"])
+  input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+  input_mask = [1] * len(input_ids)
+  # Zero-pad up to the sequence length; padding positions carry label id 0
+  # and are ignored via label_mask.
+  while len(input_ids) < max_seq_length:
+    input_ids.append(0)
+    input_mask.append(0)
+    segment_ids.append(0)
+    label_ids.append(0)
+    ntokens.append("**NULL**")
+    label_mask.append(0)
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+  assert len(label_ids) == max_seq_length
+  assert len(label_mask) == max_seq_length
+
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("tokens: %s" % " ".join(
+        [tokenization.printable_text(x) for x in tokens]))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+    tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_ids=label_ids,
+      label_mask=label_mask)
+  write_tokens(ntokens, mode)
+  return feature
+
+
+def file_based_convert_examples_to_features(
+    examples, label_list, max_seq_length, tokenizer, output_file, output_dir,
+    mode=None):
+  writer = tf.python_io.TFRecordWriter(output_file)
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 5000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenizer, output_dir,
+                                     mode)
+
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_int_feature(feature.input_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+    features["label_ids"] = create_int_feature(feature.label_ids)
+    features["label_mask"] = create_int_feature(feature.label_mask)
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training,
+                                drop_remainder):
+  name_to_features = {
"input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_mask": tf.FixedLenFeature([seq_length], tf.int64), + } + + def _decode_record(record, name_to_features): + example = tf.parse_single_example(record, name_to_features) + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + return example + + def input_fn(params): + batch_size = params["batch_size"] + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + d = d.apply(tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder + )) + return d + + return input_fn + + +def create_model(bert_config, is_training, input_ids, input_mask, label_mask, + segment_ids, labels, num_labels, use_one_hot_embeddings): + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings + ) + + output_layer = model.get_sequence_output() + + hidden_size = output_layer.shape[-1].value + + output_weight = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02) + ) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer() + ) + with tf.variable_scope("loss"): + if is_training: + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + output_layer = tf.reshape(output_layer, [-1, hidden_size]) + logits = tf.matmul(output_layer, output_weight, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) + # mask = tf.cast(input_mask,tf.float32) + # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) + # return (loss, logits, predict) + ########################################################################## + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + mask = tf.cast(label_mask, tf.float32) + mask_example_loss = per_example_loss * mask + loss = tf.reduce_sum(mask_example_loss) + probabilities = tf.nn.softmax(logits, axis=-1) + predict = tf.argmax(probabilities, axis=-1) + return (loss, mask_example_loss, logits, predict) + ########################################################################## + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + def model_fn(features, labels, mode, params): + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + label_mask = features["label_mask"] + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = 
tf.trainable_variables()
+        initialized_variable_names = {}
+        scaffold_fn = None
+        if init_checkpoint:
+            (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars,
+                                                                                                       init_checkpoint)
+            if use_tpu:
+                def tpu_scaffold():
+                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+                    return tf.train.Scaffold()
+
+                scaffold_fn = tpu_scaffold
+            else:
+                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+        tf.logging.info("**** Trainable Variables ****")
+
+        for var in tvars:
+            init_string = ""
+            if var.name in initialized_variable_names:
+                init_string = ", *INIT_FROM_CKPT*"
+            tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
+                            init_string)
+        output_spec = None
+        if mode == tf.estimator.ModeKeys.TRAIN:
+            train_op = optimization.create_optimizer(
+                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)
+            hook_dict = {}
+            hook_dict['loss'] = total_loss
+            hook_dict['global_steps'] = tf.train.get_or_create_global_step()
+            logging_hook = tf.train.LoggingTensorHook(
+                hook_dict, every_n_iter=200)
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                train_op=train_op,
+                scaffold_fn=scaffold_fn,
+                training_hooks=[logging_hook])
+        elif mode == tf.estimator.ModeKeys.EVAL:
+
+            def metric_fn(per_example_loss, label_ids, logits):
+                predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+                # Macro-averaged metrics over every position, padding included,
+                # so treat them as a rough signal; the final scores come from
+                # conlleval.py in the do_predict branch.
+                precision = tf_metrics.precision(label_ids, predictions, num_labels, average="macro")
+                recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro")
+                f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro")
+
+                return {
+                    "eval_precision": precision,
+                    "eval_recall": recall,
+                    "eval_f": f,
+                }
+
+            eval_metrics = (metric_fn, [per_example_loss, label_ids, logits])
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode,
+                loss=total_loss,
+                eval_metrics=eval_metrics,
+                scaffold_fn=scaffold_fn)
+        else:
+            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+                mode=mode, predictions=predicts, scaffold_fn=scaffold_fn
+            )
+        return output_spec
+
+    return model_fn
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+    processors = {
+        "ner": NerProcessor,
+        "weiboner": WeiboNERProcessor,
+        "msraner": MsraNERProcessor
+    }
+
+    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+        raise ValueError(
+            "Cannot use sequence length %d because the BERT model "
+            "was only trained up to sequence length %d" %
+            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+    if not os.path.exists(FLAGS.output_dir):
+        os.mkdir(FLAGS.output_dir)
+    task_name = FLAGS.task_name.lower()
+    if task_name not in processors:
+        raise ValueError("Task not found: %s" % (task_name))
+    processor =
processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + print(num_train_steps) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list) + 1, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + eval_steps = None + if FLAGS.use_tpu: + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + + if 
FLAGS.do_predict: + + pred_tags = [] + true_tags = [] + + token_path = os.path.join(FLAGS.output_dir, "token_test.txt") + label_file = os.path.join(FLAGS.output_dir, "label2id.pkl") + label_masks = [] + with open(label_file, "rb") as rf: + label2id = pickle.load(rf) + id2label = {value: key for key, value in label2id.items()} + if os.path.exists(token_path): + os.remove(token_path) + predict_examples = processor.get_test_examples(FLAGS.data_dir) + ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt") + with open(ground_truth_file, 'w') as writer: + for ex_index, example in enumerate(predict_examples): + feature = convert_single_example(ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test") + line = [] + for i, id in enumerate(feature.label_ids): + if feature.label_mask[i] == 1: + line.append(id2label[id]) + true_tags.append(id2label[id]) + # output_line = " ".join(id2label[id] for id in feature.label_ids if id != 0) + "\n" + output_line = " ".join(line) + "\n" + writer.write(output_line) + label_masks.append(feature.label_mask) + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, FLAGS.output_dir, mode="test") + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + if FLAGS.use_tpu: + # Warning: According to tpu_estimator.py Prediction on TPU is an + # experimental feature and hence not supported here + raise ValueError("Prediction in TPU not supported") + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") + + with open(output_predict_file, 'w') as writer: + for i, prediction in enumerate(result): + line = [] + for j, x in enumerate(prediction): + if label_masks[i][j] == 0: + continue + else: + line.append(id2label[x]) + # writer.write(id2label[x] + "\n") + pred_tags.append(id2label[x]) + output_line = " ".join(line) + "\n" + # # output_line = " ".join(id2label[id] for id in prediction if id != 0) + "\n" + writer.write(output_line) + # evaluate(true_tags, pred_tags, verbose=True) + # evaluate(true_tags, pred_tags) + + tmp = codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8') + with codecs.open(ground_truth_file, 'r', 'utf8') as ft, codecs.open(output_predict_file, 'r', 'utf8') as fg: + for lt, lg in zip(ft, fg): + for tl, tg in zip(lt.strip().split(), lg.strip().split()): + print('\t'.join([" ", tl, tg]), file=tmp) + tmp.close() + cmd = "python %s -d '\t' < %s > %s" % \ + (os.path.join(os.getcwd(), "conlleval.py"), \ + os.path.join(FLAGS.output_dir, "tmp"), \ + os.path.join(FLAGS.data_dir, "test_results_roberta_wwm_ext.txt")) + os.system(cmd) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/run_ner_msra.sh b/baselines/models/roberta_wwm_ext/run_ner_msra.sh new file mode 100644 index 0000000..0467843 
--- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/chinese_roberta_wwm_ext_L-12_H-768_A-12 +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=16 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_ext/run_pretraining.py b/baselines/models/roberta_wwm_ext/run_pretraining.py new file mode 100644 index 0000000..b118f62 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_pretraining.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
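+  # At this point `input_tensor` holds only the masked positions, flattened
+  # by gather_indexes to [batch_size * max_predictions_per_seq, hidden_size];
+  # with the defaults above (train_batch_size=32, max_predictions_per_seq=20)
+  # that is 640 rows per batch.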
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
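+  # e.g. "input_ids" is stored as an int64 list of length max_seq_length in
+  # the TFRecord and comes back here as an int32 tensor; float features such
+  # as "masked_lm_weights" pass through unchanged.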
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_ext/run_squad.py b/baselines/models/roberta_wwm_ext/run_squad.py new file mode 100644 index 0000000..edd4c3e --- /dev/null +++ b/baselines/models/roberta_wwm_ext/run_squad.py @@ -0,0 +1,1283 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import math +import os +import random +import modeling +import optimization +import tokenization +import six +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. 
This is needed " + "because the start and end predictions are not conditioned on one another.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + +flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + +flags.DEFINE_float( + "null_score_diff_threshold", 0.0, + "If null_score - best_non_null is greater than the threshold predict null.") + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", end_position: %d" % (self.end_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " 
" or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if example_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (unique_id)) + tf.logging.info("example_index: %s" % (example_index)) + tf.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("token_to_orig_map: %s" % " ".join( + ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) + tf.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + tf.logging.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + tf.logging.info("start_position: %d" % (start_position)) + tf.logging.info("end_position: %d" % (end_position)) + tf.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + + # Run callback + output_fn(feature) + + unique_id += 1 + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electornics? + # Context: The Japanese electronics industry is the lagest in the world. 
+ # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for 
TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def input_fn_builder(input_file, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports 
tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file): + """Write final predictions to the json file and log-odds of null if needed.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min mull score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if FLAGS.version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
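+          # With the default n_best_size=20 this considers up to 20 * 20 = 400
+          # (start_index, end_index) pairs per feature before filtering.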
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + if FLAGS.version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't inlude the empty option in the n-best, inlcude it + if FLAGS.version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
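+      # nbest can only be empty when version_2_with_negative is off and every
+      # candidate span was filtered out above (e.g. all spans exceeded
+      # max_answer_length).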
+    if not nbest:
+      nbest.append(
+          _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+    assert len(nbest) >= 1
+
+    total_scores = []
+    best_non_null_entry = None
+    for entry in nbest:
+      total_scores.append(entry.start_logit + entry.end_logit)
+      if not best_non_null_entry:
+        if entry.text:
+          best_non_null_entry = entry
+
+    probs = _compute_softmax(total_scores)
+
+    nbest_json = []
+    for (i, entry) in enumerate(nbest):
+      output = collections.OrderedDict()
+      output["text"] = entry.text
+      output["probability"] = probs[i]
+      output["start_logit"] = entry.start_logit
+      output["end_logit"] = entry.end_logit
+      nbest_json.append(output)
+
+    assert len(nbest_json) >= 1
+
+    if not FLAGS.version_2_with_negative:
+      all_predictions[example.qas_id] = nbest_json[0]["text"]
+    else:
+      # predict "" iff the null score - the score of best non-null > threshold
+      score_diff = score_null - best_non_null_entry.start_logit - (
+          best_non_null_entry.end_logit)
+      scores_diff_json[example.qas_id] = score_diff
+      if score_diff > FLAGS.null_score_diff_threshold:
+        all_predictions[example.qas_id] = ""
+      else:
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+    all_nbest_json[example.qas_id] = nbest_json
+
+  with tf.gfile.GFile(output_prediction_file, "w") as writer:
+    writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+  with tf.gfile.GFile(output_nbest_file, "w") as writer:
+    writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+  if FLAGS.version_2_with_negative:
+    with tf.gfile.GFile(output_null_log_odds_file, "w") as writer:
+      writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+
+def get_final_text(pred_text, orig_text, do_lower_case):
+  """Project the tokenized prediction back to the original text."""
+
+  # When we created the data, we kept track of the alignment between original
+  # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+  # now `orig_text` contains the span of our original text corresponding to the
+  # span that we predicted.
+  #
+  # However, `orig_text` may contain extra characters that we don't want in
+  # our prediction.
+  #
+  # For example, let's say:
+  #   pred_text = steve smith
+  #   orig_text = Steve Smith's
+  #
+  # We don't want to return `orig_text` because it contains the extra "'s".
+  #
+  # We don't want to return `pred_text` because it's already been normalized
+  # (the SQuAD eval script also does punctuation stripping/lower casing but
+  # our tokenizer does additional normalization like stripping accent
+  # characters).
+  #
+  # What we really want to return is "Steve Smith".
+  #
+  # Therefore, we have to apply a semi-complicated alignment heuristic between
+  # `pred_text` and `orig_text` to get a character-to-character alignment. This
+  # can fail in certain cases in which case we just return `orig_text`.
+
+  def _strip_spaces(text):
+    ns_chars = []
+    ns_to_s_map = collections.OrderedDict()
+    for (i, c) in enumerate(text):
+      if c == " ":
+        continue
+      ns_to_s_map[len(ns_chars)] = i
+      ns_chars.append(c)
+    ns_text = "".join(ns_chars)
+    return (ns_text, ns_to_s_map)
+
+  # We first tokenize `orig_text`, strip whitespace from the result
+  # and `pred_text`, and check if they are the same length. If they are
+  # NOT the same length, the heuristic has failed. If they are the same
+  # length, we assume the characters are one-to-one aligned.
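+  # To illustrate with a made-up example: if orig_text = "John  Smith" and
+  # tok_text = "john smith", the space-stripped strings "JohnSmith" and
+  # "johnsmith" have equal length, so the i-th non-space character of one is
+  # assumed to correspond to the i-th non-space character of the other.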
+  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+
+  tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+  start_position = tok_text.find(pred_text)
+  if start_position == -1:
+    if FLAGS.verbose_logging:
+      tf.logging.info(
+          "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+    return orig_text
+  end_position = start_position + len(pred_text) - 1
+
+  (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+  (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+  if len(orig_ns_text) != len(tok_ns_text):
+    if FLAGS.verbose_logging:
+      tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'",
+                      orig_ns_text, tok_ns_text)
+    return orig_text
+
+  # We then project the characters in `pred_text` back to `orig_text` using
+  # the character-to-character alignment.
+  tok_s_to_ns_map = {}
+  for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
+    tok_s_to_ns_map[tok_index] = i
+
+  orig_start_position = None
+  if start_position in tok_s_to_ns_map:
+    ns_start_position = tok_s_to_ns_map[start_position]
+    if ns_start_position in orig_ns_to_s_map:
+      orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+  if orig_start_position is None:
+    if FLAGS.verbose_logging:
+      tf.logging.info("Couldn't map start position")
+    return orig_text
+
+  orig_end_position = None
+  if end_position in tok_s_to_ns_map:
+    ns_end_position = tok_s_to_ns_map[end_position]
+    if ns_end_position in orig_ns_to_s_map:
+      orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+  if orig_end_position is None:
+    if FLAGS.verbose_logging:
+      tf.logging.info("Couldn't map end position")
+    return orig_text
+
+  output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+  return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+  """Get the n-best logits from a list."""
+  index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+  best_indexes = []
+  for i in range(len(index_and_score)):
+    if i >= n_best_size:
+      break
+    best_indexes.append(index_and_score[i][0])
+  return best_indexes
+
+
+def _compute_softmax(scores):
+  """Compute softmax probability over raw logits."""
+  if not scores:
+    return []
+
+  max_score = None
+  for score in scores:
+    if max_score is None or score > max_score:
+      max_score = score
+
+  exp_scores = []
+  total_sum = 0.0
+  for score in scores:
+    x = math.exp(score - max_score)
+    exp_scores.append(x)
+    total_sum += x
+
+  probs = []
+  for score in exp_scores:
+    probs.append(score / total_sum)
+  return probs
+
+
+class FeatureWriter(object):
+  """Writes InputFeature to TF example file."""
+
+  def __init__(self, filename, is_training):
+    self.filename = filename
+    self.is_training = is_training
+    self.num_features = 0
+    self._writer = tf.python_io.TFRecordWriter(filename)
+
+  def process_feature(self, feature):
+    """Write an InputFeature to the TFRecordWriter as a tf.train.Example."""
+    self.num_features += 1
+
+    def create_int_feature(values):
+      feature = tf.train.Feature(
+          int64_list=tf.train.Int64List(value=list(values)))
+      return feature
+
+    features = collections.OrderedDict()
+    features["unique_ids"] = create_int_feature([feature.unique_id])
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_int_feature(feature.input_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+
+    if self.is_training:
+      features["start_positions"] = create_int_feature([feature.start_position])
+      features["end_positions"] = create_int_feature([feature.end_position])
+      impossible = 0
+      if feature.is_impossible:
+        impossible = 1
+      features["is_impossible"] = create_int_feature([impossible])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    self._writer.write(tf_example.SerializeToString())
+
+  def close(self):
+    self._writer.close()
+
+
+def validate_flags_or_throw(bert_config):
+  """Validate the input FLAGS or throw an exception."""
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
+  if not FLAGS.do_train and not FLAGS.do_predict:
+    raise ValueError("At least one of `do_train` or `do_predict` must be True.")
+
+  if FLAGS.do_train:
+    if not FLAGS.train_file:
+      raise ValueError(
+          "If `do_train` is True, then `train_file` must be specified.")
+  if FLAGS.do_predict:
+    if not FLAGS.predict_file:
+      raise ValueError(
+          "If `do_predict` is True, then `predict_file` must be specified.")
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+  if FLAGS.max_seq_length <= FLAGS.max_query_length + 3:
+    raise ValueError(
+        "The max_seq_length (%d) must be greater than max_query_length "
+        "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length))
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  validate_flags_or_throw(bert_config)
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  if FLAGS.do_train:
+    train_examples = read_squad_examples(
+        input_file=FLAGS.train_file, is_training=True)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+    # Pre-shuffle the input to avoid having to make a very large shuffle
+    # buffer in the `input_fn`.
+    rng = random.Random(12345)
+    rng.shuffle(train_examples)
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  estimator = tf.contrib.tpu.TPUEstimator(
+      use_tpu=FLAGS.use_tpu,
+      model_fn=model_fn,
+      config=run_config,
+      train_batch_size=FLAGS.train_batch_size,
+      predict_batch_size=FLAGS.predict_batch_size)
+
+  if FLAGS.do_train:
+    # We write to a temporary file to avoid storing very large constant tensors
+    # in memory.
+    train_writer = FeatureWriter(
+        filename=os.path.join(FLAGS.output_dir, "train.tf_record"),
+        is_training=True)
+    convert_examples_to_features(
+        examples=train_examples,
+        tokenizer=tokenizer,
+        max_seq_length=FLAGS.max_seq_length,
+        doc_stride=FLAGS.doc_stride,
+        max_query_length=FLAGS.max_query_length,
+        is_training=True,
+        output_fn=train_writer.process_feature)
+    train_writer.close()
+
+    tf.logging.info("***** Running training *****")
+    tf.logging.info("  Num orig examples = %d", len(train_examples))
+    tf.logging.info("  Num split examples = %d", train_writer.num_features)
+    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
+    tf.logging.info("  Num steps = %d", num_train_steps)
+    del train_examples
+
+    train_input_fn = input_fn_builder(
+        input_file=train_writer.filename,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
+
+  if FLAGS.do_predict:
+    eval_examples = read_squad_examples(
+        input_file=FLAGS.predict_file, is_training=False)
+
+    eval_writer = FeatureWriter(
+        filename=os.path.join(FLAGS.output_dir, "eval.tf_record"),
+        is_training=False)
+    eval_features = []
+
+    def append_feature(feature):
+      eval_features.append(feature)
+      eval_writer.process_feature(feature)
+
+    convert_examples_to_features(
+        examples=eval_examples,
+        tokenizer=tokenizer,
+        max_seq_length=FLAGS.max_seq_length,
+        doc_stride=FLAGS.doc_stride,
+        max_query_length=FLAGS.max_query_length,
+        is_training=False,
+        output_fn=append_feature)
+    eval_writer.close()
+
+    tf.logging.info("***** Running predictions *****")
+    tf.logging.info("  Num orig examples = %d", len(eval_examples))
+    tf.logging.info("  Num split examples = %d", len(eval_features))
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_input_fn = input_fn_builder(
+        input_file=eval_writer.filename,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=False)
+
+    # If running eval on the TPU, you will need to specify the number of
+    # steps.
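+    # Off TPU, `drop_remainder=False` above means `estimator.predict` simply
+    # iterates until the eval TFRecord file is exhausted;
+    # `yield_single_examples=True` below then yields one result per feature.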
+    all_results = []
+    for result in estimator.predict(
+        predict_input_fn, yield_single_examples=True):
+      if len(all_results) % 1000 == 0:
+        tf.logging.info("Processing example: %d" % (len(all_results)))
+      unique_id = int(result["unique_ids"])
+      start_logits = [float(x) for x in result["start_logits"].flat]
+      end_logits = [float(x) for x in result["end_logits"].flat]
+      all_results.append(
+          RawResult(
+              unique_id=unique_id,
+              start_logits=start_logits,
+              end_logits=end_logits))
+
+    output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json")
+    output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json")
+    output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json")
+
+    write_predictions(eval_examples, eval_features, all_results,
+                      FLAGS.n_best_size, FLAGS.max_answer_length,
+                      FLAGS.do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file)
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/roberta_wwm_ext/tf_metrics.py b/baselines/models/roberta_wwm_ext/tf_metrics.py
new file mode 100644
index 0000000..7ccacd4
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tf_metrics.py
@@ -0,0 +1,215 @@
+"""
+Multiclass
+from:
+https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py
+
+"""
+
+__author__ = "Guillaume Genthial"
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix
+
+
+def precision(labels, predictions, num_classes, pos_indices=None,
+              weights=None, average='micro'):
+    """Multi-class precision metric for TensorFlow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    pr, _, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    op, _, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (pr, op)
+
+
+def recall(labels, predictions, num_classes, pos_indices=None, weights=None,
+           average='micro'):
+    """Multi-class recall metric for TensorFlow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+    Returns
+    -------
+    tuple of (scalar float Tensor, update_op)
+    """
+    cm, op = _streaming_confusion_matrix(
+        labels, predictions, num_classes, weights)
+    _, re, _ = metrics_from_confusion_matrix(
+        cm, pos_indices, average=average)
+    _, op, _ = metrics_from_confusion_matrix(
+        op, pos_indices, average=average)
+    return (re, op)
+
+
+def f1(labels, predictions, num_classes, pos_indices=None, weights=None,
+       average='micro'):
+    return fbeta(labels, predictions, num_classes, pos_indices, weights,
+                 average)
+
+
+def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None,
+          average='micro', beta=1):
+    """Multi-class fbeta metric for TensorFlow
+    Parameters
+    ----------
+    labels : Tensor of tf.int32 or tf.int64
+        The true labels
+    predictions : Tensor of tf.int32 or tf.int64
+        The predictions, same shape as labels
+    num_classes : int
+        The number of classes
+    pos_indices : list of int, optional
+        The indices of the positive classes, default is all
+    weights : Tensor of tf.int32, optional
+        Mask, must be of compatible shape with labels
+    average : str, optional
+        'micro': counts the total number of true positives, false
+            positives, and false negatives for the classes in
+            `pos_indices` and infers the metric from it.
+        'macro': will compute the metric separately for each class in
+            `pos_indices` and average. Will not account for class
+            imbalance.
+        'weighted': will compute the metric separately for each class in
+            `pos_indices` and perform a weighted average by the total
+            number of true labels for each class.
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/roberta_wwm_ext/tokenization.py b/baselines/models/roberta_wwm_ext/tokenization.py new file mode 100644 index 0000000..0ee1359 --- /dev/null +++ b/baselines/models/roberta_wwm_ext/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import re
+import unicodedata
+import six
+import tensorflow as tf
+
+
+def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
+  """Checks whether the casing config is consistent with the checkpoint name."""
+
+  # The casing has to be passed in by the user and there is no explicit check
+  # as to whether it matches the checkpoint. The casing information probably
+  # should have been stored in the bert_config.json file, but it's not, so
+  # we have to heuristically detect it to validate.
+
+  if not init_checkpoint:
+    return
+
+  m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint)
+  if m is None:
+    return
+
+  model_name = m.group(1)
+
+  lower_models = [
+      "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12",
+      "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12"
+  ]
+
+  cased_models = [
+      "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16",
+      "multi_cased_L-12_H-768_A-12"
+  ]
+
+  is_bad_config = False
+  if model_name in lower_models and not do_lower_case:
+    is_bad_config = True
+    actual_flag = "False"
+    case_name = "lowercased"
+    opposite_flag = "True"
+
+  if model_name in cased_models and do_lower_case:
+    is_bad_config = True
+    actual_flag = "True"
+    case_name = "cased"
+    opposite_flag = "False"
+
+  if is_bad_config:
+    raise ValueError(
+        "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. "
+        "However, `%s` seems to be a %s model, so you "
+        "should pass in `--do_lower_case=%s` so that the fine-tuning matches "
+        "how the model was pre-trained. If this error is wrong, please "
+        "just comment out this check." % (actual_flag, init_checkpoint,
+                                          model_name, case_name, opposite_flag))
+
+
+def convert_to_unicode(text):
+  """Converts `text` to Unicode (if it's not already), assuming utf-8 input."""
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text.decode("utf-8", "ignore")
+    elif isinstance(text, unicode):
+      return text
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def printable_text(text):
+  """Returns text encoded in a way suitable for print or `tf.logging`."""
+
+  # These functions want `str` for both Python 2 and Python 3, but in one case
+  # it's a Unicode string and in the other it's a byte string.
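+  # For example, printable_text(u"abc") returns a `str` on both versions:
+  # the input is returned unchanged on Python 3, while on Python 2 a unicode
+  # input is utf-8 encoded to a byte string.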
+  if six.PY3:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, bytes):
+      return text.decode("utf-8", "ignore")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  elif six.PY2:
+    if isinstance(text, str):
+      return text
+    elif isinstance(text, unicode):
+      return text.encode("utf-8")
+    else:
+      raise ValueError("Unsupported string type: %s" % (type(text)))
+  else:
+    raise ValueError("Not running on Python 2 or Python 3?")
+
+
+def load_vocab(vocab_file):
+  """Loads a vocabulary file into a dictionary."""
+  vocab = collections.OrderedDict()
+  index = 0
+  with tf.gfile.GFile(vocab_file, "r") as reader:
+    while True:
+      token = convert_to_unicode(reader.readline())
+      if not token:
+        break
+      token = token.strip()
+      vocab[token] = index
+      index += 1
+  return vocab
+
+
+def convert_by_vocab(vocab, items):
+  """Converts a sequence of [tokens|ids] using the vocab."""
+  output = []
+  for item in items:
+    output.append(vocab[item])
+  return output
+
+
+def convert_tokens_to_ids(vocab, tokens):
+  return convert_by_vocab(vocab, tokens)
+
+
+def convert_ids_to_tokens(inv_vocab, ids):
+  return convert_by_vocab(inv_vocab, ids)
+
+
+def whitespace_tokenize(text):
+  """Runs basic whitespace cleaning and splitting on a piece of text."""
+  text = text.strip()
+  if not text:
+    return []
+  tokens = text.split()
+  return tokens
+
+
+class FullTokenizer(object):
+  """Runs end-to-end tokenization."""
+
+  def __init__(self, vocab_file, do_lower_case=True):
+    self.vocab = load_vocab(vocab_file)
+    self.inv_vocab = {v: k for k, v in self.vocab.items()}
+    self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+    self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)
+
+  def tokenize(self, text):
+    split_tokens = []
+    for token in self.basic_tokenizer.tokenize(text):
+      for sub_token in self.wordpiece_tokenizer.tokenize(token):
+        split_tokens.append(sub_token)
+
+    return split_tokens
+
+  def convert_tokens_to_ids(self, tokens):
+    return convert_by_vocab(self.vocab, tokens)
+
+  def convert_ids_to_tokens(self, ids):
+    return convert_by_vocab(self.inv_vocab, ids)
+
+
+class BasicTokenizer(object):
+  """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+  def __init__(self, do_lower_case=True):
+    """Constructs a BasicTokenizer.
+
+    Args:
+      do_lower_case: Whether to lower case the input.
+    """
+    self.do_lower_case = do_lower_case
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text."""
+    text = convert_to_unicode(text)
+    text = self._clean_text(text)
+
+    # This was added on November 1st, 2018 for the multilingual and Chinese
+    # models. This is also applied to the English models now, but it doesn't
+    # matter since the English models were not trained on any Chinese data
+    # and generally don't have any Chinese data in them (there are Chinese
+    # characters in the vocabulary because Wikipedia does have some Chinese
+    # words in the English Wikipedia.).
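+    # For example, "ah博推zz" becomes "ah 博 推 zz", so each CJK character is
+    # split out as its own token (see test_chinese in tokenization_test.py).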
+    text = self._tokenize_chinese_chars(text)
+
+    orig_tokens = whitespace_tokenize(text)
+    split_tokens = []
+    for token in orig_tokens:
+      if self.do_lower_case:
+        token = token.lower()
+        token = self._run_strip_accents(token)
+      split_tokens.extend(self._run_split_on_punc(token))
+
+    output_tokens = whitespace_tokenize(" ".join(split_tokens))
+    return output_tokens
+
+  def _run_strip_accents(self, text):
+    """Strips accents from a piece of text."""
+    text = unicodedata.normalize("NFD", text)
+    output = []
+    for char in text:
+      cat = unicodedata.category(char)
+      if cat == "Mn":
+        continue
+      output.append(char)
+    return "".join(output)
+
+  def _run_split_on_punc(self, text):
+    """Splits punctuation on a piece of text."""
+    chars = list(text)
+    i = 0
+    start_new_word = True
+    output = []
+    while i < len(chars):
+      char = chars[i]
+      if _is_punctuation(char):
+        output.append([char])
+        start_new_word = True
+      else:
+        if start_new_word:
+          output.append([])
+        start_new_word = False
+        output[-1].append(char)
+      i += 1
+
+    return ["".join(x) for x in output]
+
+  def _tokenize_chinese_chars(self, text):
+    """Adds whitespace around any CJK character."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if self._is_chinese_char(cp):
+        output.append(" ")
+        output.append(char)
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+  def _is_chinese_char(self, cp):
+    """Checks whether CP is the codepoint of a CJK character."""
+    # This defines a "chinese character" as anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+    # despite its name. The modern Korean Hangul alphabet is a different block,
+    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+    # space-separated words, so they are not treated specially and handled
+    # like all of the other languages.
+    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+        (cp >= 0x3400 and cp <= 0x4DBF) or  #
+        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+        (cp >= 0x2B820 and cp <= 0x2CEAF) or
+        (cp >= 0xF900 and cp <= 0xFAFF) or  #
+        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+      return True
+
+    return False
+
+  def _clean_text(self, text):
+    """Performs invalid character removal and whitespace cleanup on text."""
+    output = []
+    for char in text:
+      cp = ord(char)
+      if cp == 0 or cp == 0xfffd or _is_control(char):
+        continue
+      if _is_whitespace(char):
+        output.append(" ")
+      else:
+        output.append(char)
+    return "".join(output)
+
+
+class WordpieceTokenizer(object):
+  """Runs WordPiece tokenization."""
+
+  def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200):
+    self.vocab = vocab
+    self.unk_token = unk_token
+    self.max_input_chars_per_word = max_input_chars_per_word
+
+  def tokenize(self, text):
+    """Tokenizes a piece of text into its word pieces.
+
+    This uses a greedy longest-match-first algorithm to perform tokenization
+    using the given vocabulary.
+
+    For example:
+      input = "unaffable"
+      output = ["un", "##aff", "##able"]
+
+    Args:
+      text: A single token or whitespace separated tokens. This should have
+        already been passed through `BasicTokenizer`.
+
+    Returns:
+      A list of wordpiece tokens.
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models/roberta_wwm_ext/tokenization_test.py b/baselines/models/roberta_wwm_ext/tokenization_test.py new file mode 100644 index 0000000..0afaedd --- /dev/null +++ b/baselines/models/roberta_wwm_ext/tokenization_test.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile +import tokenization +import six +import tensorflow as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + vocab_writer.write("".join( + [x + "\n" for x in vocab_tokens]).encode("utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + 
self.assertTrue(tokenization._is_punctuation(u"."))
+
+    self.assertFalse(tokenization._is_punctuation(u"A"))
+    self.assertFalse(tokenization._is_punctuation(u" "))
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_inews.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_inews.sh
new file mode 100755
index 0000000..2edab12
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_inews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="inews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_jdcomment.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_jdcomment.sh
new file mode 100755
index 0000000..4b88a95
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_jdcomment.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="jdcomment"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.230.1.2:8470
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_lcqmc.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_lcqmc.sh
new file mode 100755
index 0000000..fe1aef4
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_lcqmc.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="lcqmc"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-base/chinese_roberta_wwm_ext_L-12_H-768_A-12
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-base/chinese_roberta_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.16.0.2:8470
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_thucnews.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_thucnews.sh
new file mode 100755
index 0000000..9f690c0
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_thucnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="thucnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_tnews.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_tnews.sh
new file mode 100755
index 0000000..b93ca51
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_tnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="tnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-base/chinese_roberta_wwm_ext_L-12_H-768_A-12
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-base/chinese_roberta_wwm_ext_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470
diff --git a/baselines/models/roberta_wwm_ext/tpu/run_classifier_xnli.sh b/baselines/models/roberta_wwm_ext/tpu/run_classifier_xnli.sh
new file mode 100755
index 0000000..9dd4c2a
--- /dev/null
+++ b/baselines/models/roberta_wwm_ext/tpu/run_classifier_xnli.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="xnli"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta_wwm_large_ext/CONTRIBUTING.md b/baselines/models/roberta_wwm_large_ext/CONTRIBUTING.md
new file mode 100644
index 0000000..124b4b3
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/CONTRIBUTING.md
@@ -0,0 +1,31 @@
+# How to Contribute
+
+BERT needs to maintain permanent compatibility with the pre-trained model files,
+so we do not plan to make any major changes to this library (other than what was
+promised in the README). However, we can accept small patches related to
+re-factoring and documentation. To submit contributions, there are just a few
+small guidelines you need to follow.
+
+## Contributor License Agreement
+
+Contributions to this project must be accompanied by a Contributor License
+Agreement. You (or your employer) retain the copyright to your contribution;
+this simply gives us permission to use and redistribute your contributions as
+part of the project. Head over to <https://cla.developers.google.com/> to see
+your current agreements on file or to sign a new one.
+
+You generally only need to submit a CLA once, so if you've already submitted one
+(even if it was for a different project), you probably don't need to do it
+again.
+
+## Code reviews
+
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose. Consult
+[GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
+information on using pull requests.
+
+## Community Guidelines
+
+This project follows
+[Google's Open Source Community Guidelines](https://opensource.google.com/conduct/).
diff --git a/baselines/models/roberta_wwm_large_ext/LICENSE b/baselines/models/roberta_wwm_large_ext/LICENSE
new file mode 100644
index 0000000..d645695
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity.
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/baselines/models/roberta_wwm_large_ext/__init__.py b/baselines/models/roberta_wwm_large_ext/__init__.py
new file mode 100644
index 0000000..effb57b
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/__init__.py
@@ -0,0 +1,15 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
diff --git a/baselines/models/roberta_wwm_large_ext/conlleval.py b/baselines/models/roberta_wwm_large_ext/conlleval.py
new file mode 100644
index 0000000..8a8a75d
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/conlleval.py
@@ -0,0 +1,300 @@
+# Python version of the evaluation script from CoNLL'00
+# Originates from: https://github.com/spyysalo/conlleval.py
+
+
+# Intentional differences:
+# - accept any space as delimiter by default
+# - optional file argument (default STDIN)
+# - option to set boundary (-b argument)
+# - LaTeX output (-l argument) not supported
+# - raw tags (-r argument) not supported
+
+# Added function evaluate(predicted_label, ori_label), which does not read from a file
+
+import sys
+import re
+import codecs
+from collections import defaultdict, namedtuple
+
+ANY_SPACE = '<SPACE>'
+
+
+class FormatError(Exception):
+    pass
+
+Metrics = namedtuple('Metrics', 'tp fp fn prec rec fscore')
+
+
+class EvalCounts(object):
+    def __init__(self):
+        self.correct_chunk = 0    # number of correctly identified chunks
+        self.correct_tags = 0     # number of correct chunk tags
+        self.found_correct = 0    # number of chunks in corpus
+        self.found_guessed = 0    # number of identified chunks
+        self.token_counter = 0    # token counter (ignores sentence breaks)
+
+        # counts by type
+        self.t_correct_chunk = defaultdict(int)
+        self.t_found_correct = defaultdict(int)
+        self.t_found_guessed = defaultdict(int)
+
+
+def parse_args(argv):
+    import argparse
+    parser = argparse.ArgumentParser(
+        description='evaluate tagging results using CoNLL criteria',
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    arg = parser.add_argument
+    arg('-b', '--boundary', metavar='STR', default='-X-',
+        help='sentence boundary')
+    arg('-d', '--delimiter', metavar='CHAR', default=ANY_SPACE,
+        help='character delimiting items in input')
+    arg('-o', '--otag', metavar='CHAR', default='O',
+        help='alternative outside tag')
+    arg('file', nargs='?', default=None)
+    return parser.parse_args(argv)
+
+
+def parse_tag(t):
+    m = re.match(r'^([^-]*)-(.*)$', t)
+    return m.groups() if m else (t, '')
+
+
+def evaluate(iterable, options=None):
+    if options is None:
+        options = parse_args([])    # use defaults
+
+    counts = EvalCounts()
+    num_features = None       # number of features per line
+    in_correct = False        # currently processed chunk is correct so far
+    last_correct = 'O'        # previous chunk tag in corpus
+    last_correct_type = ''    # type of previous chunk tag in corpus
+    last_guessed = 'O'        # previously identified chunk tag
+    last_guessed_type = ''    # type of previously identified chunk tag
+
+    for line in iterable:
+        line = line.rstrip('\r\n')
+
+        if options.delimiter ==
ANY_SPACE: + features = line.split() + else: + features = line.split(options.delimiter) + + if num_features is None: + num_features = len(features) + elif num_features != len(features) and len(features) != 0: + raise FormatError('unexpected number of features: %d (%d)' % + (len(features), num_features)) + + if len(features) == 0 or features[0] == options.boundary: + features = [options.boundary, 'O', 'O'] + if len(features) < 3: + raise FormatError('unexpected number of features in line %s' % line) + + guessed, guessed_type = parse_tag(features.pop()) + correct, correct_type = parse_tag(features.pop()) + first_item = features.pop(0) + + if first_item == options.boundary: + guessed = 'O' + + end_correct = end_of_chunk(last_correct, correct, + last_correct_type, correct_type) + end_guessed = end_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + start_correct = start_of_chunk(last_correct, correct, + last_correct_type, correct_type) + start_guessed = start_of_chunk(last_guessed, guessed, + last_guessed_type, guessed_type) + + if in_correct: + if (end_correct and end_guessed and + last_guessed_type == last_correct_type): + in_correct = False + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + elif (end_correct != end_guessed or guessed_type != correct_type): + in_correct = False + + if start_correct and start_guessed and guessed_type == correct_type: + in_correct = True + + if start_correct: + counts.found_correct += 1 + counts.t_found_correct[correct_type] += 1 + if start_guessed: + counts.found_guessed += 1 + counts.t_found_guessed[guessed_type] += 1 + if first_item != options.boundary: + if correct == guessed and guessed_type == correct_type: + counts.correct_tags += 1 + counts.token_counter += 1 + + last_guessed = guessed + last_correct = correct + last_guessed_type = guessed_type + last_correct_type = correct_type + + if in_correct: + counts.correct_chunk += 1 + counts.t_correct_chunk[last_correct_type] += 1 + + return counts + + + +def uniq(iterable): + seen = set() + return [i for i in iterable if not (i in seen or seen.add(i))] + + +def calculate_metrics(correct, guessed, total): + tp, fp, fn = correct, guessed-correct, total-correct + p = 0 if tp + fp == 0 else 1.*tp / (tp + fp) + r = 0 if tp + fn == 0 else 1.*tp / (tp + fn) + f = 0 if p + r == 0 else 2 * p * r / (p + r) + return Metrics(tp, fp, fn, p, r, f) + + +def metrics(counts): + c = counts + overall = calculate_metrics( + c.correct_chunk, c.found_guessed, c.found_correct + ) + by_type = {} + for t in uniq(list(c.t_found_correct) + list(c.t_found_guessed)): + by_type[t] = calculate_metrics( + c.t_correct_chunk[t], c.t_found_guessed[t], c.t_found_correct[t] + ) + return overall, by_type + + +def report(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + out.write('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + out.write('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + + if c.token_counter > 0: + out.write('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + out.write('precision: %6.2f%%; ' % (100.*overall.prec)) + out.write('recall: %6.2f%%; ' % (100.*overall.rec)) + out.write('FB1: %6.2f\n' % (100.*overall.fscore)) + + for i, m in sorted(by_type.items()): + out.write('%17s: ' % i) + out.write('precision: %6.2f%%; ' % (100.*m.prec)) + out.write('recall: %6.2f%%; ' % (100.*m.rec)) + out.write('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) 
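# Editor's note: a minimal usage sketch (not part of the original script),
# assuming the usual three-column 'token gold predicted' layout that
# evaluate() parses above.
def _example_usage():
    lines = [
        'EU B-ORG B-ORG',
        'rejects O O',
        'German B-MISC B-MISC',
        'call O O',
        '',  # a blank line marks a sentence boundary
    ]
    counts = evaluate(lines)
    report(counts)  # prints overall and per-type precision/recall/FB1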
+ + +def report_notprint(counts, out=None): + if out is None: + out = sys.stdout + + overall, by_type = metrics(counts) + + c = counts + final_report = [] + line = [] + line.append('processed %d tokens with %d phrases; ' % + (c.token_counter, c.found_correct)) + line.append('found: %d phrases; correct: %d.\n' % + (c.found_guessed, c.correct_chunk)) + final_report.append("".join(line)) + + if c.token_counter > 0: + line = [] + line.append('accuracy: %6.2f%%; ' % + (100.*c.correct_tags/c.token_counter)) + line.append('precision: %6.2f%%; ' % (100.*overall.prec)) + line.append('recall: %6.2f%%; ' % (100.*overall.rec)) + line.append('FB1: %6.2f\n' % (100.*overall.fscore)) + final_report.append("".join(line)) + + for i, m in sorted(by_type.items()): + line = [] + line.append('%17s: ' % i) + line.append('precision: %6.2f%%; ' % (100.*m.prec)) + line.append('recall: %6.2f%%; ' % (100.*m.rec)) + line.append('FB1: %6.2f %d\n' % (100.*m.fscore, c.t_found_guessed[i])) + final_report.append("".join(line)) + return final_report + + +def end_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk ended between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_end = False + + if prev_tag == 'E': chunk_end = True + if prev_tag == 'S': chunk_end = True + + if prev_tag == 'B' and tag == 'B': chunk_end = True + if prev_tag == 'B' and tag == 'S': chunk_end = True + if prev_tag == 'B' and tag == 'O': chunk_end = True + if prev_tag == 'I' and tag == 'B': chunk_end = True + if prev_tag == 'I' and tag == 'S': chunk_end = True + if prev_tag == 'I' and tag == 'O': chunk_end = True + + if prev_tag != 'O' and prev_tag != '.' and prev_type != type_: + chunk_end = True + + # these chunks are assumed to have length 1 + if prev_tag == ']': chunk_end = True + if prev_tag == '[': chunk_end = True + + return chunk_end + + +def start_of_chunk(prev_tag, tag, prev_type, type_): + # check if a chunk started between the previous and current word + # arguments: previous and current chunk tags, previous and current types + chunk_start = False + + if tag == 'B': chunk_start = True + if tag == 'S': chunk_start = True + + if prev_tag == 'E' and tag == 'E': chunk_start = True + if prev_tag == 'E' and tag == 'I': chunk_start = True + if prev_tag == 'S' and tag == 'E': chunk_start = True + if prev_tag == 'S' and tag == 'I': chunk_start = True + if prev_tag == 'O' and tag == 'E': chunk_start = True + if prev_tag == 'O' and tag == 'I': chunk_start = True + + if tag != 'O' and tag != '.' and prev_type != type_: + chunk_start = True + + # these chunks are assumed to have length 1 + if tag == '[': chunk_start = True + if tag == ']': chunk_start = True + + return chunk_start + + +def return_report(input_file): + with codecs.open(input_file, "r", "utf8") as f: + counts = evaluate(f) + return report_notprint(counts) + + +def main(argv): + args = parse_args(argv[1:]) + + if args.file is None: + counts = evaluate(sys.stdin, args) + else: + with open(args.file) as f: + counts = evaluate(f, args) + report(counts) + +if __name__ == '__main__': + sys.exit(main(sys.argv)) \ No newline at end of file diff --git a/baselines/models/roberta_wwm_large_ext/create_pretraining_data.py b/baselines/models/roberta_wwm_large_ext/create_pretraining_data.py new file mode 100644 index 0000000..5340d96 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/create_pretraining_data.py @@ -0,0 +1,469 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_bool( + "do_whole_word_mask", False, + "Whether to use whole word masking rather than per-WordPiece masking.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + for output_file in output_files: + writers.append(tf.python_io.TFRecordWriter(output_file)) + + writer_index = 0 + + total_written = 0 + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while 
len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(input_ids) + features["input_mask"] = create_int_feature(input_mask) + features["segment_ids"] = create_int_feature(segment_ids) + features["masked_lm_positions"] = create_int_feature(masked_lm_positions) + features["masked_lm_ids"] = create_int_feature(masked_lm_ids) + features["masked_lm_weights"] = create_float_feature(masked_lm_weights) + features["next_sentence_labels"] = create_int_feature([next_sentence_label]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + + writers[writer_index].write(tf_example.SerializeToString()) + writer_index = (writer_index + 1) % len(writers) + + total_written += 1 + + if inst_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + for feature_name in features.keys(): + feature = features[feature_name] + values = [] + if feature.int64_list.value: + values = feature.int64_list.value + elif feature.float_list.value: + values = feature.float_list.value + tf.logging.info( + "%s: %s" % (feature_name, " ".join([str(x) for x in values]))) + + for writer in writers: + writer.close() + + tf.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
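  # For example, a minimal input file could look like this (editor's
  # illustration, not part of the original comment):
  #
  #   The first document has this sentence.
  #   It also has this second sentence.
  #
  #   The second document starts after the blank line.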
+ for input_file in input_files: + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
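          # Editor's illustration: if `current_chunk` held 5 segments and
          # `a_end` were 2, the 3 unused segments are replayed by rewinding
          # `i` below, so they seed the next chunk instead of being dropped.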
+          num_unused_segments = len(current_chunk) - a_end
+          i -= num_unused_segments
+        # Actual next
+        else:
+          is_random_next = False
+          for j in range(a_end, len(current_chunk)):
+            tokens_b.extend(current_chunk[j])
+        truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng)
+
+        assert len(tokens_a) >= 1
+        assert len(tokens_b) >= 1
+
+        tokens = []
+        segment_ids = []
+        tokens.append("[CLS]")
+        segment_ids.append(0)
+        for token in tokens_a:
+          tokens.append(token)
+          segment_ids.append(0)
+
+        tokens.append("[SEP]")
+        segment_ids.append(0)
+
+        for token in tokens_b:
+          tokens.append(token)
+          segment_ids.append(1)
+        tokens.append("[SEP]")
+        segment_ids.append(1)
+
+        (tokens, masked_lm_positions,
+         masked_lm_labels) = create_masked_lm_predictions(
+             tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng)
+        instance = TrainingInstance(
+            tokens=tokens,
+            segment_ids=segment_ids,
+            is_random_next=is_random_next,
+            masked_lm_positions=masked_lm_positions,
+            masked_lm_labels=masked_lm_labels)
+        instances.append(instance)
+      current_chunk = []
+      current_length = 0
+    i += 1
+
+  return instances
+
+
+MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
+                                          ["index", "label"])
+
+
+def create_masked_lm_predictions(tokens, masked_lm_prob,
+                                 max_predictions_per_seq, vocab_words, rng):
+  """Creates the predictions for the masked LM objective."""
+
+  cand_indexes = []
+  for (i, token) in enumerate(tokens):
+    if token == "[CLS]" or token == "[SEP]":
+      continue
+    # Whole Word Masking means that we mask all of the wordpieces
+    # corresponding to an original word. When a word has been split into
+    # WordPieces, the first token does not have any marker and any subsequent
+    # tokens are prefixed with ##. So whenever we see a ## token, we
+    # append it to the previous set of word indexes.
+    #
+    # Note that Whole Word Masking does *not* change the training code
+    # at all -- we still predict each WordPiece independently, softmaxed
+    # over the entire vocabulary.
+    if (FLAGS.do_whole_word_mask and len(cand_indexes) >= 1 and
+        token.startswith("##")):
+      cand_indexes[-1].append(i)
+    else:
+      cand_indexes.append([i])
+
+  rng.shuffle(cand_indexes)
+
+  output_tokens = list(tokens)
+
+  num_to_predict = min(max_predictions_per_seq,
+                       max(1, int(round(len(tokens) * masked_lm_prob))))
+
+  masked_lms = []
+  covered_indexes = set()
+  for index_set in cand_indexes:
+    if len(masked_lms) >= num_to_predict:
+      break
+    # If adding a whole-word mask would exceed the maximum number of
+    # predictions, then just skip this candidate.
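    # Editor's illustration: an index_set can hold several positions because
    # of whole-word masking, e.g. a word tokenized as ["phil", "##am",
    # "##mon"] yields one candidate set of three positions that is masked or
    # skipped as a unit.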
+ if len(masked_lms) + len(index_set) > num_to_predict: + continue + is_any_index_covered = False + for index in index_set: + if index in covered_indexes: + is_any_index_covered = True + break + if is_any_index_covered: + continue + for index in index_set: + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + assert len(masked_lms) <= num_to_predict + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. + if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/extract_features.py b/baselines/models/roberta_wwm_large_ext/extract_features.py new file mode 100644 index 0000000..60e3830 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/extract_features.py @@ -0,0 +1,419 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Extract pre-computed feature vectors from BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import codecs +import collections +import json +import re + +import modeling +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, "") + +flags.DEFINE_string("output_file", None, "") + +flags.DEFINE_string("layers", "-1,-2,-3,-4", "") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("batch_size", 32, "Batch size for predictions.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_string("master", None, + "If using a TPU, the address of the master.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool( + "use_one_hot_embeddings", False, + "If True, tf.one_hot will be used for embedding lookups, otherwise " + "tf.nn.embedding_lookup will be used. On TPUs, this should be True " + "since it is much faster.") + + +class InputExample(object): + + def __init__(self, unique_id, text_a, text_b): + self.unique_id = unique_id + self.text_a = text_a + self.text_b = text_b + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): + self.unique_id = unique_id + self.tokens = tokens + self.input_ids = input_ids + self.input_mask = input_mask + self.input_type_ids = input_type_ids + + +def input_fn_builder(features, seq_length): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_unique_ids = [] + all_input_ids = [] + all_input_mask = [] + all_input_type_ids = [] + + for feature in features: + all_unique_ids.append(feature.unique_id) + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_input_type_ids.append(feature.input_type_ids) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. 
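    # Editor's sketch of the scalable TFRecord path (with `name_to_features`
    # as a hypothetical feature spec; not part of the original code):
    #   d = tf.data.TFRecordDataset(input_files)
    #   d = d.map(lambda record: tf.parse_single_example(record, name_to_features))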
+ d = tf.data.Dataset.from_tensor_slices({ + "unique_ids": + tf.constant(all_unique_ids, shape=[num_examples], dtype=tf.int32), + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "input_type_ids": + tf.constant( + all_input_type_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + }) + + d = d.batch(batch_size=batch_size, drop_remainder=False) + return d + + return input_fn + + +def model_fn_builder(bert_config, init_checkpoint, layer_indexes, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + input_type_ids = features["input_type_ids"] + + model = modeling.BertModel( + config=bert_config, + is_training=False, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=input_type_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + if mode != tf.estimator.ModeKeys.PREDICT: + raise ValueError("Only PREDICT modes are supported: %s" % (mode)) + + tvars = tf.trainable_variables() + scaffold_fn = None + (assignment_map, + initialized_variable_names) = modeling.get_assignment_map_from_checkpoint( + tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + all_layers = model.get_all_encoder_layers() + + predictions = { + "unique_id": unique_ids, + } + + for (i, layer_index) in enumerate(layer_indexes): + predictions["layer_output_%d" % i] = all_layers[layer_index] + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +def convert_examples_to_features(examples, seq_length, tokenizer): + """Loads a data file into a list of `InputBatch`s.""" + + features = [] + for (ex_index, example) in enumerate(examples): + tokens_a = tokenizer.tokenize(example.text_a) + + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > seq_length - 2: + tokens_a = tokens_a[0:(seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. 
The embedding vectors for `type=0` and
+    # `type=1` were learned during pre-training and are added to the wordpiece
+    # embedding vector (and position vector). This is not *strictly* necessary
+    # since the [SEP] token unambiguously separates the sequences, but it makes
+    # it easier for the model to learn the concept of sequences.
+    #
+    # For classification tasks, the first vector (corresponding to [CLS]) is
+    # used as the "sentence vector". Note that this only makes sense because
+    # the entire model is fine-tuned.
+    tokens = []
+    input_type_ids = []
+    tokens.append("[CLS]")
+    input_type_ids.append(0)
+    for token in tokens_a:
+      tokens.append(token)
+      input_type_ids.append(0)
+    tokens.append("[SEP]")
+    input_type_ids.append(0)
+
+    if tokens_b:
+      for token in tokens_b:
+        tokens.append(token)
+        input_type_ids.append(1)
+      tokens.append("[SEP]")
+      input_type_ids.append(1)
+
+    input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+    # The mask has 1 for real tokens and 0 for padding tokens. Only real
+    # tokens are attended to.
+    input_mask = [1] * len(input_ids)
+
+    # Zero-pad up to the sequence length.
+    while len(input_ids) < seq_length:
+      input_ids.append(0)
+      input_mask.append(0)
+      input_type_ids.append(0)
+
+    assert len(input_ids) == seq_length
+    assert len(input_mask) == seq_length
+    assert len(input_type_ids) == seq_length
+
+    if ex_index < 5:
+      tf.logging.info("*** Example ***")
+      tf.logging.info("unique_id: %s" % (example.unique_id))
+      tf.logging.info("tokens: %s" % " ".join(
+          [tokenization.printable_text(x) for x in tokens]))
+      tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+      tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+      tf.logging.info(
+          "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
+
+    features.append(
+        InputFeatures(
+            unique_id=example.unique_id,
+            tokens=tokens,
+            input_ids=input_ids,
+            input_mask=input_mask,
+            input_type_ids=input_type_ids))
+  return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal percent
+  # of tokens from each, since if one sequence is very short then each token
+  # that's truncated likely contains more information than a longer sequence.
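  # Editor's illustration: with max_length=6, len(tokens_a)=5 and
  # len(tokens_b)=3, the loop below pops from tokens_a twice (8 -> 7 -> 6
  # total tokens), ending with a 3/3 split.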
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def read_examples(input_file): + """Read a list of `InputExample`s from an input file.""" + examples = [] + unique_id = 0 + with tf.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + text_a = None + text_b = None + m = re.match(r"^(.*) \|\|\| (.*)$", line) + if m is None: + text_a = line + else: + text_a = m.group(1) + text_b = m.group(2) + examples.append( + InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) + unique_id += 1 + return examples + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + layer_indexes = [int(x) for x in FLAGS.layers.split(",")] + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=FLAGS.master, + tpu_config=tf.contrib.tpu.TPUConfig( + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + examples = read_examples(FLAGS.input_file) + + features = convert_examples_to_features( + examples=examples, seq_length=FLAGS.max_seq_length, tokenizer=tokenizer) + + unique_id_to_feature = {} + for feature in features: + unique_id_to_feature[feature.unique_id] = feature + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + layer_indexes=layer_indexes, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_one_hot_embeddings) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
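  # Editor's note: when use_tpu=False, TPUEstimator behaves like a plain
  # Estimator, so the TPU-specific RunConfig fields above are simply ignored.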
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + predict_batch_size=FLAGS.batch_size) + + input_fn = input_fn_builder( + features=features, seq_length=FLAGS.max_seq_length) + + with codecs.getwriter("utf-8")(tf.gfile.Open(FLAGS.output_file, + "w")) as writer: + for result in estimator.predict(input_fn, yield_single_examples=True): + unique_id = int(result["unique_id"]) + feature = unique_id_to_feature[unique_id] + output_json = collections.OrderedDict() + output_json["linex_index"] = unique_id + all_features = [] + for (i, token) in enumerate(feature.tokens): + all_layers = [] + for (j, layer_index) in enumerate(layer_indexes): + layer_output = result["layer_output_%d" % j] + layers = collections.OrderedDict() + layers["index"] = layer_index + layers["values"] = [ + round(float(x), 6) for x in layer_output[i:(i + 1)].flat + ] + all_layers.append(layers) + features = collections.OrderedDict() + features["token"] = token + features["layers"] = all_layers + all_features.append(features) + output_json["features"] = all_features + writer.write(json.dumps(output_json) + "\n") + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("init_checkpoint") + flags.mark_flag_as_required("output_file") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/modeling.py b/baselines/models/roberta_wwm_large_ext/modeling.py new file mode 100644 index 0000000..fed5259 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/modeling.py @@ -0,0 +1,986 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""The main BERT model and related functions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import json +import math +import re +import numpy as np +import six +import tensorflow as tf + + +class BertConfig(object): + """Configuration for `BertModel`.""" + + def __init__(self, + vocab_size, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. 
+ hidden_dropout_prob: The dropout probability for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The stdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size=None) + for (key, value) in six.iteritems(json_object): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with tf.gfile.GFile(json_file, "r") as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class BertModel(object): + """BERT model ("Bidirectional Encoder Representations from Transformers"). + + Example usage: + + ```python + # Already been converted into WordPiece token ids + input_ids = tf.constant([[31, 51, 99], [15, 5, 0]]) + input_mask = tf.constant([[1, 1, 1], [1, 1, 0]]) + token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]]) + + config = modeling.BertConfig(vocab_size=32000, hidden_size=512, + num_hidden_layers=8, num_attention_heads=6, intermediate_size=1024) + + model = modeling.BertModel(config=config, is_training=True, + input_ids=input_ids, input_mask=input_mask, token_type_ids=token_type_ids) + + label_embeddings = tf.get_variable(...) + pooled_output = model.get_pooled_output() + logits = tf.matmul(pooled_output, label_embeddings) + ... + ``` + """ + + def __init__(self, + config, + is_training, + input_ids, + input_mask=None, + token_type_ids=None, + use_one_hot_embeddings=False, + scope=None): + """Constructor for BertModel. + + Args: + config: `BertConfig` instance. + is_training: bool. true for training model, false for eval model. Controls + whether dropout will be applied. + input_ids: int32 Tensor of shape [batch_size, seq_length]. + input_mask: (optional) int32 Tensor of shape [batch_size, seq_length]. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + use_one_hot_embeddings: (optional) bool. Whether to use one-hot word + embeddings or tf.embedding_lookup() for the word embeddings. + scope: (optional) variable scope. Defaults to "bert". + + Raises: + ValueError: The config is invalid or one of the input tensor shapes + is invalid. 
+ """ + config = copy.deepcopy(config) + if not is_training: + config.hidden_dropout_prob = 0.0 + config.attention_probs_dropout_prob = 0.0 + + input_shape = get_shape_list(input_ids, expected_rank=2) + batch_size = input_shape[0] + seq_length = input_shape[1] + + if input_mask is None: + input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) + + if token_type_ids is None: + token_type_ids = tf.zeros(shape=[batch_size, seq_length], dtype=tf.int32) + + with tf.variable_scope(scope, default_name="bert"): + with tf.variable_scope("embeddings"): + # Perform embedding lookup on the word ids. + (self.embedding_output, self.embedding_table) = embedding_lookup( + input_ids=input_ids, + vocab_size=config.vocab_size, + embedding_size=config.hidden_size, + initializer_range=config.initializer_range, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=use_one_hot_embeddings) + + # Add positional embeddings and token type embeddings, then layer + # normalize and perform dropout. + self.embedding_output = embedding_postprocessor( + input_tensor=self.embedding_output, + use_token_type=True, + token_type_ids=token_type_ids, + token_type_vocab_size=config.type_vocab_size, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=config.initializer_range, + max_position_embeddings=config.max_position_embeddings, + dropout_prob=config.hidden_dropout_prob) + + with tf.variable_scope("encoder"): + # This converts a 2D mask of shape [batch_size, seq_length] to a 3D + # mask of shape [batch_size, seq_length, seq_length] which is used + # for the attention scores. + attention_mask = create_attention_mask_from_input_mask( + input_ids, input_mask) + + # Run the stacked transformer. + # `sequence_output` shape = [batch_size, seq_length, hidden_size]. + self.all_encoder_layers = transformer_model( + input_tensor=self.embedding_output, + attention_mask=attention_mask, + hidden_size=config.hidden_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + intermediate_size=config.intermediate_size, + intermediate_act_fn=get_activation(config.hidden_act), + hidden_dropout_prob=config.hidden_dropout_prob, + attention_probs_dropout_prob=config.attention_probs_dropout_prob, + initializer_range=config.initializer_range, + do_return_all_layers=True) + + self.sequence_output = self.all_encoder_layers[-1] + # The "pooler" converts the encoded sequence tensor of shape + # [batch_size, seq_length, hidden_size] to a tensor of shape + # [batch_size, hidden_size]. This is necessary for segment-level + # (or segment-pair-level) classification tasks where we need a fixed + # dimensional representation of the segment. + with tf.variable_scope("pooler"): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. We assume that this has been pre-trained + first_token_tensor = tf.squeeze(self.sequence_output[:, 0:1, :], axis=1) + self.pooled_output = tf.layers.dense( + first_token_tensor, + config.hidden_size, + activation=tf.tanh, + kernel_initializer=create_initializer(config.initializer_range)) + + def get_pooled_output(self): + return self.pooled_output + + def get_sequence_output(self): + """Gets final hidden layer of encoder. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the final hidden of the transformer encoder. 
+ """ + return self.sequence_output + + def get_all_encoder_layers(self): + return self.all_encoder_layers + + def get_embedding_output(self): + """Gets output of the embedding lookup (i.e., input to the transformer). + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size] corresponding + to the output of the embedding layer, after summing the word + embeddings with the positional embeddings and the token type embeddings, + then performing layer normalization. This is the input to the transformer. + """ + return self.embedding_output + + def get_embedding_table(self): + return self.embedding_table + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. + """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def get_activation(activation_string): + """Maps a string to a Python function, e.g., "relu" => `tf.nn.relu`. + + Args: + activation_string: String name of the activation function. + + Returns: + A Python function corresponding to the activation function. If + `activation_string` is None, empty, or "linear", this will return None. + If `activation_string` is not a string, it will return `activation_string`. + + Raises: + ValueError: The `activation_string` does not correspond to a known + activation. + """ + + # We assume that anything that"s not a string is already an activation + # function, so we just return it. + if not isinstance(activation_string, six.string_types): + return activation_string + + if not activation_string: + return None + + act = activation_string.lower() + if act == "linear": + return None + elif act == "relu": + return tf.nn.relu + elif act == "gelu": + return gelu + elif act == "tanh": + return tf.tanh + else: + raise ValueError("Unsupported activation: %s" % act) + + +def get_assignment_map_from_checkpoint(tvars, init_checkpoint): + """Compute the union of the current variables and checkpoint variables.""" + assignment_map = {} + initialized_variable_names = {} + + name_to_variable = collections.OrderedDict() + for var in tvars: + name = var.name + m = re.match("^(.*):\\d+$", name) + if m is not None: + name = m.group(1) + name_to_variable[name] = var + + init_vars = tf.train.list_variables(init_checkpoint) + + assignment_map = collections.OrderedDict() + for x in init_vars: + (name, var) = (x[0], x[1]) + if name not in name_to_variable: + continue + assignment_map[name] = name + initialized_variable_names[name] = 1 + initialized_variable_names[name + ":0"] = 1 + + return (assignment_map, initialized_variable_names) + + +def dropout(input_tensor, dropout_prob): + """Perform dropout. + + Args: + input_tensor: float Tensor. + dropout_prob: Python float. The probability of dropping out a value (NOT of + *keeping* a dimension as in `tf.nn.dropout`). + + Returns: + A version of `input_tensor` with dropout applied. 
+ """ + if dropout_prob is None or dropout_prob == 0.0: + return input_tensor + + output = tf.nn.dropout(input_tensor, 1.0 - dropout_prob) + return output + + +def layer_norm(input_tensor, name=None): + """Run layer normalization on the last dimension of the tensor.""" + return tf.contrib.layers.layer_norm( + inputs=input_tensor, begin_norm_axis=-1, begin_params_axis=-1, scope=name) + + +def layer_norm_and_dropout(input_tensor, dropout_prob, name=None): + """Runs layer normalization followed by dropout.""" + output_tensor = layer_norm(input_tensor, name) + output_tensor = dropout(output_tensor, dropout_prob) + return output_tensor + + +def create_initializer(initializer_range=0.02): + """Creates a `truncated_normal_initializer` with the given range.""" + return tf.truncated_normal_initializer(stddev=initializer_range) + + +def embedding_lookup(input_ids, + vocab_size, + embedding_size=128, + initializer_range=0.02, + word_embedding_name="word_embeddings", + use_one_hot_embeddings=False): + """Looks up words embeddings for id tensor. + + Args: + input_ids: int32 Tensor of shape [batch_size, seq_length] containing word + ids. + vocab_size: int. Size of the embedding vocabulary. + embedding_size: int. Width of the word embeddings. + initializer_range: float. Embedding initialization range. + word_embedding_name: string. Name of the embedding table. + use_one_hot_embeddings: bool. If True, use one-hot method for word + embeddings. If False, use `tf.gather()`. + + Returns: + float Tensor of shape [batch_size, seq_length, embedding_size]. + """ + # This function assumes that the input is of shape [batch_size, seq_length, + # num_inputs]. + # + # If the input is a 2D tensor of shape [batch_size, seq_length], we + # reshape to [batch_size, seq_length, 1]. + if input_ids.shape.ndims == 2: + input_ids = tf.expand_dims(input_ids, axis=[-1]) + + embedding_table = tf.get_variable( + name=word_embedding_name, + shape=[vocab_size, embedding_size], + initializer=create_initializer(initializer_range)) + + flat_input_ids = tf.reshape(input_ids, [-1]) + if use_one_hot_embeddings: + one_hot_input_ids = tf.one_hot(flat_input_ids, depth=vocab_size) + output = tf.matmul(one_hot_input_ids, embedding_table) + else: + output = tf.gather(embedding_table, flat_input_ids) + + input_shape = get_shape_list(input_ids) + + output = tf.reshape(output, + input_shape[0:-1] + [input_shape[-1] * embedding_size]) + return (output, embedding_table) + + +def embedding_postprocessor(input_tensor, + use_token_type=False, + token_type_ids=None, + token_type_vocab_size=16, + token_type_embedding_name="token_type_embeddings", + use_position_embeddings=True, + position_embedding_name="position_embeddings", + initializer_range=0.02, + max_position_embeddings=512, + dropout_prob=0.1): + """Performs various post-processing on a word embedding tensor. + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, + embedding_size]. + use_token_type: bool. Whether to add embeddings for `token_type_ids`. + token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length]. + Must be specified if `use_token_type` is True. + token_type_vocab_size: int. The vocabulary size of `token_type_ids`. + token_type_embedding_name: string. The name of the embedding table variable + for token type ids. + use_position_embeddings: bool. Whether to add position embeddings for the + position of each token in the sequence. + position_embedding_name: string. The name of the embedding table variable + for positional embeddings. 
initializer_range: float. Range of the weight initialization.
+    max_position_embeddings: int. Maximum sequence length that might ever be
+      used with this model. This can be longer than the sequence length of
+      input_tensor, but cannot be shorter.
+    dropout_prob: float. Dropout probability applied to the final output tensor.
+
+  Returns:
+    float tensor with same shape as `input_tensor`.
+
+  Raises:
+    ValueError: One of the tensor shapes or input values is invalid.
+  """
+  input_shape = get_shape_list(input_tensor, expected_rank=3)
+  batch_size = input_shape[0]
+  seq_length = input_shape[1]
+  width = input_shape[2]
+
+  output = input_tensor
+
+  if use_token_type:
+    if token_type_ids is None:
+      raise ValueError("`token_type_ids` must be specified if "
+                       "`use_token_type` is True.")
+    token_type_table = tf.get_variable(
+        name=token_type_embedding_name,
+        shape=[token_type_vocab_size, width],
+        initializer=create_initializer(initializer_range))
+    # This vocab will be small so we always do one-hot here, since it is always
+    # faster for a small vocabulary.
+    flat_token_type_ids = tf.reshape(token_type_ids, [-1])
+    one_hot_ids = tf.one_hot(flat_token_type_ids, depth=token_type_vocab_size)
+    token_type_embeddings = tf.matmul(one_hot_ids, token_type_table)
+    token_type_embeddings = tf.reshape(token_type_embeddings,
+                                       [batch_size, seq_length, width])
+    output += token_type_embeddings
+
+  if use_position_embeddings:
+    assert_op = tf.assert_less_equal(seq_length, max_position_embeddings)
+    with tf.control_dependencies([assert_op]):
+      full_position_embeddings = tf.get_variable(
+          name=position_embedding_name,
+          shape=[max_position_embeddings, width],
+          initializer=create_initializer(initializer_range))
+      # Since the position embedding table is a learned variable, we create it
+      # using a (long) sequence length `max_position_embeddings`. The actual
+      # sequence length might be shorter than this, for faster training of
+      # tasks that do not have long sequences.
+      #
+      # So `full_position_embeddings` is effectively an embedding table
+      # for position [0, 1, 2, ..., max_position_embeddings-1], and the current
+      # sequence has positions [0, 1, 2, ... seq_length-1], so we can just
+      # perform a slice.
+      position_embeddings = tf.slice(full_position_embeddings, [0, 0],
+                                     [seq_length, -1])
+      num_dims = len(output.shape.as_list())
+
+      # Only the last two dimensions are relevant (`seq_length` and `width`), so
+      # we broadcast among the first dimensions, which is typically just
+      # the batch size.
+      position_broadcast_shape = []
+      for _ in range(num_dims - 2):
+        position_broadcast_shape.append(1)
+      position_broadcast_shape.extend([seq_length, width])
+      position_embeddings = tf.reshape(position_embeddings,
+                                       position_broadcast_shape)
+      output += position_embeddings
+
+  output = layer_norm_and_dropout(output, dropout_prob)
+  return output
+
+
+def create_attention_mask_from_input_mask(from_tensor, to_mask):
+  """Create 3D attention mask from a 2D tensor mask.
+
+  Args:
+    from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...].
+    to_mask: int32 Tensor of shape [batch_size, to_seq_length].
+
+  Returns:
+    float Tensor of shape [batch_size, from_seq_length, to_seq_length].
+ """ + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + batch_size = from_shape[0] + from_seq_length = from_shape[1] + + to_shape = get_shape_list(to_mask, expected_rank=2) + to_seq_length = to_shape[1] + + to_mask = tf.cast( + tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + # + # `broadcast_ones` = [batch_size, from_seq_length, 1] + broadcast_ones = tf.ones( + shape=[batch_size, from_seq_length, 1], dtype=tf.float32) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + +def attention_layer(from_tensor, + to_tensor, + attention_mask=None, + num_attention_heads=1, + size_per_head=512, + query_act=None, + key_act=None, + value_act=None, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + do_return_2d_tensor=False, + batch_size=None, + from_seq_length=None, + to_seq_length=None): + """Performs multi-headed attention from `from_tensor` to `to_tensor`. + + This is an implementation of multi-headed attention based on "Attention + is all you Need". If `from_tensor` and `to_tensor` are the same, then + this is self-attention. Each timestep in `from_tensor` attends to the + corresponding sequence in `to_tensor`, and returns a fixed-with vector. + + This function first projects `from_tensor` into a "query" tensor and + `to_tensor` into "key" and "value" tensors. These are (effectively) a list + of tensors of length `num_attention_heads`, where each tensor is of shape + [batch_size, seq_length, size_per_head]. + + Then, the query and key tensors are dot-producted and scaled. These are + softmaxed to obtain attention probabilities. The value tensors are then + interpolated by these probabilities, then concatenated back to a single + tensor and returned. + + In practice, the multi-headed attention are done with transposes and + reshapes rather than actual separate tensors. + + Args: + from_tensor: float Tensor of shape [batch_size, from_seq_length, + from_width]. + to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width]. + attention_mask: (optional) int32 Tensor of shape [batch_size, + from_seq_length, to_seq_length]. The values should be 1 or 0. The + attention scores will effectively be set to -infinity for any positions in + the mask that are 0, and will be unchanged for positions that are 1. + num_attention_heads: int. Number of attention heads. + size_per_head: int. Size of each attention head. + query_act: (optional) Activation function for the query transform. + key_act: (optional) Activation function for the key transform. + value_act: (optional) Activation function for the value transform. + attention_probs_dropout_prob: (optional) float. Dropout probability of the + attention probabilities. + initializer_range: float. Range of the weight initializer. + do_return_2d_tensor: bool. If True, the output will be of shape [batch_size + * from_seq_length, num_attention_heads * size_per_head]. If False, the + output will be of shape [batch_size, from_seq_length, num_attention_heads + * size_per_head]. + batch_size: (Optional) int. If the input is 2D, this might be the batch size + of the 3D version of the `from_tensor` and `to_tensor`. + from_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `from_tensor`. 
+ to_seq_length: (Optional) If the input is 2D, this might be the seq length + of the 3D version of the `to_tensor`. + + Returns: + float Tensor of shape [batch_size, from_seq_length, + num_attention_heads * size_per_head]. (If `do_return_2d_tensor` is + true, this will be of shape [batch_size * from_seq_length, + num_attention_heads * size_per_head]). + + Raises: + ValueError: Any of the arguments or tensor shapes are invalid. + """ + + def transpose_for_scores(input_tensor, batch_size, num_attention_heads, + seq_length, width): + output_tensor = tf.reshape( + input_tensor, [batch_size, seq_length, num_attention_heads, width]) + + output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3]) + return output_tensor + + from_shape = get_shape_list(from_tensor, expected_rank=[2, 3]) + to_shape = get_shape_list(to_tensor, expected_rank=[2, 3]) + + if len(from_shape) != len(to_shape): + raise ValueError( + "The rank of `from_tensor` must match the rank of `to_tensor`.") + + if len(from_shape) == 3: + batch_size = from_shape[0] + from_seq_length = from_shape[1] + to_seq_length = to_shape[1] + elif len(from_shape) == 2: + if (batch_size is None or from_seq_length is None or to_seq_length is None): + raise ValueError( + "When passing in rank 2 tensors to attention_layer, the values " + "for `batch_size`, `from_seq_length`, and `to_seq_length` " + "must all be specified.") + + # Scalar dimensions referenced here: + # B = batch size (number of sequences) + # F = `from_tensor` sequence length + # T = `to_tensor` sequence length + # N = `num_attention_heads` + # H = `size_per_head` + + from_tensor_2d = reshape_to_matrix(from_tensor) + to_tensor_2d = reshape_to_matrix(to_tensor) + + # `query_layer` = [B*F, N*H] + query_layer = tf.layers.dense( + from_tensor_2d, + num_attention_heads * size_per_head, + activation=query_act, + name="query", + kernel_initializer=create_initializer(initializer_range)) + + # `key_layer` = [B*T, N*H] + key_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=key_act, + name="key", + kernel_initializer=create_initializer(initializer_range)) + + # `value_layer` = [B*T, N*H] + value_layer = tf.layers.dense( + to_tensor_2d, + num_attention_heads * size_per_head, + activation=value_act, + name="value", + kernel_initializer=create_initializer(initializer_range)) + + # `query_layer` = [B, N, F, H] + query_layer = transpose_for_scores(query_layer, batch_size, + num_attention_heads, from_seq_length, + size_per_head) + + # `key_layer` = [B, N, T, H] + key_layer = transpose_for_scores(key_layer, batch_size, num_attention_heads, + to_seq_length, size_per_head) + + # Take the dot product between "query" and "key" to get the raw + # attention scores. + # `attention_scores` = [B, N, F, T] + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + attention_scores = tf.multiply(attention_scores, + 1.0 / math.sqrt(float(size_per_head))) + + if attention_mask is not None: + # `attention_mask` = [B, 1, F, T] + attention_mask = tf.expand_dims(attention_mask, axis=[1]) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0 + + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
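+    #
+    # For example (values are just the arithmetic of the line above): a mask
+    # entry of 1 gives adder = (1.0 - 1.0) * -10000.0 = 0.0, leaving the
+    # score unchanged, while a mask entry of 0 gives adder = -10000.0, which
+    # drives that position's post-softmax probability to approximately zero.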
+ attention_scores += adder + + # Normalize the attention scores to probabilities. + # `attention_probs` = [B, N, F, T] + attention_probs = tf.nn.softmax(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = dropout(attention_probs, attention_probs_dropout_prob) + + # `value_layer` = [B, T, N, H] + value_layer = tf.reshape( + value_layer, + [batch_size, to_seq_length, num_attention_heads, size_per_head]) + + # `value_layer` = [B, N, T, H] + value_layer = tf.transpose(value_layer, [0, 2, 1, 3]) + + # `context_layer` = [B, N, F, H] + context_layer = tf.matmul(attention_probs, value_layer) + + # `context_layer` = [B, F, N, H] + context_layer = tf.transpose(context_layer, [0, 2, 1, 3]) + + if do_return_2d_tensor: + # `context_layer` = [B*F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size * from_seq_length, num_attention_heads * size_per_head]) + else: + # `context_layer` = [B, F, N*H] + context_layer = tf.reshape( + context_layer, + [batch_size, from_seq_length, num_attention_heads * size_per_head]) + + return context_layer + + +def transformer_model(input_tensor, + attention_mask=None, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + intermediate_act_fn=gelu, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + do_return_all_layers=False): + """Multi-headed, multi-layer Transformer from "Attention is All You Need". + + This is almost an exact implementation of the original Transformer encoder. + + See the original paper: + https://arxiv.org/abs/1706.03762 + + Also see: + https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/models/transformer.py + + Args: + input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size]. + attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length, + seq_length], with 1 for positions that can be attended to and 0 in + positions that should not be. + hidden_size: int. Hidden size of the Transformer. + num_hidden_layers: int. Number of layers (blocks) in the Transformer. + num_attention_heads: int. Number of attention heads in the Transformer. + intermediate_size: int. The size of the "intermediate" (a.k.a., feed + forward) layer. + intermediate_act_fn: function. The non-linear activation function to apply + to the output of the intermediate/feed-forward layer. + hidden_dropout_prob: float. Dropout probability for the hidden layers. + attention_probs_dropout_prob: float. Dropout probability of the attention + probabilities. + initializer_range: float. Range of the initializer (stddev of truncated + normal). + do_return_all_layers: Whether to also return all layers or just the final + layer. + + Returns: + float Tensor of shape [batch_size, seq_length, hidden_size], the final + hidden layer of the Transformer. + + Raises: + ValueError: A Tensor shape or parameter is invalid. 
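+
+  Example usage (a minimal sketch; the shapes below are illustrative and the
+  default hyperparameters are used):
+
+  ```python
+  # [batch_size=2, seq_length=3, hidden_size=768]
+  input_tensor = tf.zeros([2, 3, 768])
+  all_layers = transformer_model(input_tensor=input_tensor,
+                                 do_return_all_layers=True)
+  sequence_output = all_layers[-1]  # [2, 3, 768]
+  ```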
+ """ + if hidden_size % num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (hidden_size, num_attention_heads)) + + attention_head_size = int(hidden_size / num_attention_heads) + input_shape = get_shape_list(input_tensor, expected_rank=3) + batch_size = input_shape[0] + seq_length = input_shape[1] + input_width = input_shape[2] + + # The Transformer performs sum residuals on all layers so the input needs + # to be the same as the hidden size. + if input_width != hidden_size: + raise ValueError("The width of the input tensor (%d) != hidden size (%d)" % + (input_width, hidden_size)) + + # We keep the representation as a 2D tensor to avoid re-shaping it back and + # forth from a 3D tensor to a 2D tensor. Re-shapes are normally free on + # the GPU/CPU but may not be free on the TPU, so we want to minimize them to + # help the optimizer. + prev_output = reshape_to_matrix(input_tensor) + + all_layer_outputs = [] + for layer_idx in range(num_hidden_layers): + with tf.variable_scope("layer_%d" % layer_idx): + layer_input = prev_output + + with tf.variable_scope("attention"): + attention_heads = [] + with tf.variable_scope("self"): + attention_head = attention_layer( + from_tensor=layer_input, + to_tensor=layer_input, + attention_mask=attention_mask, + num_attention_heads=num_attention_heads, + size_per_head=attention_head_size, + attention_probs_dropout_prob=attention_probs_dropout_prob, + initializer_range=initializer_range, + do_return_2d_tensor=True, + batch_size=batch_size, + from_seq_length=seq_length, + to_seq_length=seq_length) + attention_heads.append(attention_head) + + attention_output = None + if len(attention_heads) == 1: + attention_output = attention_heads[0] + else: + # In the case where we have other sequences, we just concatenate + # them to the self-attention head before the projection. + attention_output = tf.concat(attention_heads, axis=-1) + + # Run a linear projection of `hidden_size` then add a residual + # with `layer_input`. + with tf.variable_scope("output"): + attention_output = tf.layers.dense( + attention_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + attention_output = dropout(attention_output, hidden_dropout_prob) + attention_output = layer_norm(attention_output + layer_input) + + # The activation is only applied to the "intermediate" hidden layer. + with tf.variable_scope("intermediate"): + intermediate_output = tf.layers.dense( + attention_output, + intermediate_size, + activation=intermediate_act_fn, + kernel_initializer=create_initializer(initializer_range)) + + # Down-project back to `hidden_size` then add the residual. + with tf.variable_scope("output"): + layer_output = tf.layers.dense( + intermediate_output, + hidden_size, + kernel_initializer=create_initializer(initializer_range)) + layer_output = dropout(layer_output, hidden_dropout_prob) + layer_output = layer_norm(layer_output + attention_output) + prev_output = layer_output + all_layer_outputs.append(layer_output) + + if do_return_all_layers: + final_outputs = [] + for layer_output in all_layer_outputs: + final_output = reshape_from_matrix(layer_output, input_shape) + final_outputs.append(final_output) + return final_outputs + else: + final_output = reshape_from_matrix(prev_output, input_shape) + return final_output + + +def get_shape_list(tensor, expected_rank=None, name=None): + """Returns a list of the shape of tensor, preferring static dimensions. 
+
+  Args:
+    tensor: A tf.Tensor object to find the shape of.
+    expected_rank: (optional) int. The expected rank of `tensor`. If this is
+      specified and the `tensor` has a different rank, an exception will be
+      thrown.
+    name: Optional name of the tensor for the error message.
+
+  Returns:
+    A list of dimensions of the shape of tensor. All static dimensions will
+    be returned as python integers, and dynamic dimensions will be returned
+    as tf.Tensor scalars.
+  """
+  if name is None:
+    name = tensor.name
+
+  if expected_rank is not None:
+    assert_rank(tensor, expected_rank, name)
+
+  shape = tensor.shape.as_list()
+
+  non_static_indexes = []
+  for (index, dim) in enumerate(shape):
+    if dim is None:
+      non_static_indexes.append(index)
+
+  if not non_static_indexes:
+    return shape
+
+  dyn_shape = tf.shape(tensor)
+  for index in non_static_indexes:
+    shape[index] = dyn_shape[index]
+  return shape
+
+
+def reshape_to_matrix(input_tensor):
+  """Reshapes a >= rank 2 tensor to a rank 2 tensor (i.e., a matrix)."""
+  ndims = input_tensor.shape.ndims
+  if ndims < 2:
+    raise ValueError("Input tensor must have at least rank 2. Shape = %s" %
+                     (input_tensor.shape))
+  if ndims == 2:
+    return input_tensor
+
+  width = input_tensor.shape[-1]
+  output_tensor = tf.reshape(input_tensor, [-1, width])
+  return output_tensor
+
+
+def reshape_from_matrix(output_tensor, orig_shape_list):
+  """Reshapes a rank 2 tensor back to its original rank >= 2 tensor."""
+  if len(orig_shape_list) == 2:
+    return output_tensor
+
+  output_shape = get_shape_list(output_tensor)
+
+  orig_dims = orig_shape_list[0:-1]
+  width = output_shape[-1]
+
+  return tf.reshape(output_tensor, orig_dims + [width])
+
+
+def assert_rank(tensor, expected_rank, name=None):
+  """Raises an exception if the tensor rank is not of the expected rank.
+
+  Args:
+    tensor: A tf.Tensor to check the rank of.
+    expected_rank: Python integer or list of integers, expected rank.
+    name: Optional name of the tensor for the error message.
+
+  Raises:
+    ValueError: If the expected shape doesn't match the actual shape.
+  """
+  if name is None:
+    name = tensor.name
+
+  expected_rank_dict = {}
+  if isinstance(expected_rank, six.integer_types):
+    expected_rank_dict[expected_rank] = True
+  else:
+    for x in expected_rank:
+      expected_rank_dict[x] = True
+
+  actual_rank = tensor.shape.ndims
+  if actual_rank not in expected_rank_dict:
+    scope_name = tf.get_variable_scope().name
+    raise ValueError(
+        "For the tensor `%s` in scope `%s`, the actual rank "
+        "`%d` (shape = %s) is not equal to the expected rank `%s`" %
+        (name, scope_name, actual_rank, str(tensor.shape), str(expected_rank)))
diff --git a/baselines/models/roberta_wwm_large_ext/modeling_test.py b/baselines/models/roberta_wwm_large_ext/modeling_test.py
new file mode 100644
index 0000000..817ad2d
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/modeling_test.py
@@ -0,0 +1,277 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import random +import re + +import modeling +import six +import tensorflow as tf + + +class BertModelTest(tf.test.TestCase): + + class BertModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + initializer_range=0.02, + scope=None): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.scope = scope + + def create_model(self): + input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length], + self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = BertModelTest.ids_tensor( + [self.batch_size, self.seq_length], self.type_vocab_size) + + config = modeling.BertConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + model = modeling.BertModel( + config=config, + is_training=self.is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=token_type_ids, + scope=self.scope) + + outputs = { + "embedding_output": model.get_embedding_output(), + "sequence_output": model.get_sequence_output(), + "pooled_output": model.get_pooled_output(), + "all_encoder_layers": model.get_all_encoder_layers(), + } + return outputs + + def check_output(self, result): + self.parent.assertAllEqual( + result["embedding_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual( + result["sequence_output"].shape, + [self.batch_size, self.seq_length, self.hidden_size]) + + self.parent.assertAllEqual(result["pooled_output"].shape, + [self.batch_size, self.hidden_size]) + + def test_default(self): + self.run_tester(BertModelTest.BertModelTester(self)) + + def test_config_to_json_string(self): + config = modeling.BertConfig(vocab_size=99, hidden_size=37) + obj = json.loads(config.to_json_string()) + self.assertEqual(obj["vocab_size"], 99) + self.assertEqual(obj["hidden_size"], 37) + + def run_tester(self, tester): + with 
self.test_session() as sess:
+      ops = tester.create_model()
+      init_op = tf.group(tf.global_variables_initializer(),
+                         tf.local_variables_initializer())
+      sess.run(init_op)
+      output_result = sess.run(ops)
+      tester.check_output(output_result)
+
+      self.assert_all_tensors_reachable(sess, [init_op, ops])
+
+  @classmethod
+  def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
+    """Creates a random int32 tensor of the given shape, with values drawn
+    uniformly from [0, vocab_size)."""
+    if rng is None:
+      rng = random.Random()
+
+    total_dims = 1
+    for dim in shape:
+      total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+      values.append(rng.randint(0, vocab_size - 1))
+
+    return tf.constant(value=values, dtype=tf.int32, shape=shape, name=name)
+
+  def assert_all_tensors_reachable(self, sess, outputs):
+    """Checks that all the tensors in the graph are reachable from outputs."""
+    graph = sess.graph
+
+    ignore_strings = [
+        "^.*/assert_less_equal/.*$",
+        "^.*/dilation_rate$",
+        "^.*/Tensordot/concat$",
+        "^.*/Tensordot/concat/axis$",
+        "^testing/.*$",
+    ]
+
+    ignore_regexes = [re.compile(x) for x in ignore_strings]
+
+    unreachable = self.get_unreachable_ops(graph, outputs)
+    filtered_unreachable = []
+    for x in unreachable:
+      do_ignore = False
+      for r in ignore_regexes:
+        m = r.match(x.name)
+        if m is not None:
+          do_ignore = True
+      if do_ignore:
+        continue
+      filtered_unreachable.append(x)
+    unreachable = filtered_unreachable
+
+    self.assertEqual(
+        len(unreachable), 0, "The following ops are unreachable: %s" %
+        (" ".join([x.name for x in unreachable])))
+
+  @classmethod
+  def get_unreachable_ops(cls, graph, outputs):
+    """Finds all of the tensors in graph that are unreachable from outputs."""
+    outputs = cls.flatten_recursive(outputs)
+    output_to_op = collections.defaultdict(list)
+    op_to_all = collections.defaultdict(list)
+    assign_out_to_in = collections.defaultdict(list)
+
+    for op in graph.get_operations():
+      for x in op.inputs:
+        op_to_all[op.name].append(x.name)
+      for y in op.outputs:
+        output_to_op[y.name].append(op.name)
+        op_to_all[op.name].append(y.name)
+      if str(op.type) == "Assign":
+        for y in op.outputs:
+          for x in op.inputs:
+            assign_out_to_in[y.name].append(x.name)
+
+    assign_groups = collections.defaultdict(list)
+    for out_name in assign_out_to_in.keys():
+      name_group = assign_out_to_in[out_name]
+      for n1 in name_group:
+        assign_groups[n1].append(out_name)
+        for n2 in name_group:
+          if n1 != n2:
+            assign_groups[n1].append(n2)
+
+    seen_tensors = {}
+    stack = [x.name for x in outputs]
+    while stack:
+      name = stack.pop()
+      if name in seen_tensors:
+        continue
+      seen_tensors[name] = True
+
+      if name in output_to_op:
+        for op_name in output_to_op[name]:
+          if op_name in op_to_all:
+            for input_name in op_to_all[op_name]:
+              if input_name not in stack:
+                stack.append(input_name)
+
+      expanded_names = []
+      if name in assign_groups:
+        for assign_name in assign_groups[name]:
+          expanded_names.append(assign_name)
+
+      for expanded_name in expanded_names:
+        if expanded_name not in stack:
+          stack.append(expanded_name)
+
+    unreachable_ops = []
+    for op in graph.get_operations():
+      is_unreachable = False
+      all_names = [x.name for x in op.inputs] + [x.name for x in op.outputs]
+      for name in all_names:
+        if name not in seen_tensors:
+          is_unreachable = True
+      if is_unreachable:
+        unreachable_ops.append(op)
+    return unreachable_ops
+
+  @classmethod
+  def flatten_recursive(cls, item):
+    """Flattens a (potentially nested) tuple/dictionary/list into a list."""
+    output = []
+    if isinstance(item, list):
+      output.extend(item)
+    elif isinstance(item, tuple):
+      output.extend(list(item))
+    elif isinstance(item, dict):
+      for (_, v) in six.iteritems(item):
+        output.append(v)
+    else:
+      return [item]
+
+    flat_output = []
+    for x in output:
+      flat_output.extend(cls.flatten_recursive(x))
+    return flat_output
+
+
+if __name__ == "__main__":
+  tf.test.main()
diff --git a/baselines/models/roberta_wwm_large_ext/multilingual.md b/baselines/models/roberta_wwm_large_ext/multilingual.md
new file mode 100644
index 0000000..3b38379
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/multilingual.md
@@ -0,0 +1,303 @@
+## Models
+
+There are two multilingual models currently available. We do not plan to
+release more single-language models, but we may release `BERT-Large` versions
+of these two in the future:
+
+*   **[`BERT-Base, Multilingual Cased (New, recommended)`](https://storage.googleapis.com/bert_models/2018_11_23/multi_cased_L-12_H-768_A-12.zip)**:
+    104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Multilingual Uncased (Orig, not recommended)`](https://storage.googleapis.com/bert_models/2018_11_03/multilingual_L-12_H-768_A-12.zip)**:
+    102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+*   **[`BERT-Base, Chinese`](https://storage.googleapis.com/bert_models/2018_11_03/chinese_L-12_H-768_A-12.zip)**:
+    Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M
+    parameters
+
+**The `Multilingual Cased (New)` model also fixes normalization issues in many
+languages, so it is recommended in languages with non-Latin alphabets (and is
+often better for most languages with Latin alphabets). When using this model,
+make sure to pass `--do_lower_case=false` to `run_pretraining.py` and other
+scripts.**
+
+See the [list of languages](#list-of-languages) that the Multilingual model
+supports. The Multilingual model does include Chinese (and English), but if
+your fine-tuning data is Chinese-only, then the Chinese model will likely
+produce better results.
+
+## Results
+
+To evaluate these systems, we use the
+[XNLI dataset](https://github.com/facebookresearch/XNLI), which is a version of
+[MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) where the dev and
+test sets have been translated (by humans) into 15 languages. Note that the
+training set was *machine* translated (we used the translations provided by
+XNLI, not Google NMT). For clarity, we only report on 6 languages below:
+
+| System                          | English  | Chinese  | Spanish  | German   | Arabic   | Urdu     |
+| ------------------------------- | -------- | -------- | -------- | -------- | -------- | -------- |
+| XNLI Baseline - Translate Train | 73.7     | 67.0     | 68.8     | 66.5     | 65.8     | 56.6     |
+| XNLI Baseline - Translate Test  | 73.7     | 68.3     | 70.7     | 68.7     | 66.8     | 59.3     |
+| BERT - Translate Train Cased    | **81.9** | **76.6** | **77.8** | **75.9** | **70.7** | 61.6     |
+| BERT - Translate Train Uncased  | 81.4     | 74.2     | 77.3     | 75.2     | 70.5     | 61.7     |
+| BERT - Translate Test Uncased   | 81.4     | 70.1     | 74.9     | 74.4     | 70.4     | **62.1** |
+| BERT - Zero Shot Uncased        | 81.4     | 63.8     | 74.3     | 70.5     | 62.1     | 58.3     |
+
+The first two rows are baselines from the XNLI paper and the last four rows are
+our results with BERT.
+
+**Translate Train** means that the MultiNLI training set was machine translated
+from English into the foreign language. So training and evaluation were both
+done in the foreign language.
+Unfortunately, training was done on machine-translated data, so it is
+impossible to quantify how much of the lower accuracy (compared to English) is
+due to the quality of the machine translation vs. the quality of the
+pre-trained model.
+
+**Translate Test** means that the XNLI test set was machine translated from the
+foreign language into English. So training and evaluation were both done on
+English. However, test evaluation was done on machine-translated English, so
+the accuracy depends on the quality of the machine translation system.
+
+**Zero Shot** means that the Multilingual BERT system was fine-tuned on English
+MultiNLI, and then evaluated on the foreign language XNLI test. In this case,
+machine translation was not involved at all in either the pre-training or
+fine-tuning.
+
+Note that the English result is worse than the 84.2 MultiNLI baseline because
+this training used Multilingual BERT rather than English-only BERT. This
+implies that for high-resource languages, the Multilingual model is somewhat
+worse than a single-language model. However, it is not feasible for us to train
+and maintain dozens of single-language models. Therefore, if your goal is to
+maximize performance with a language other than English or Chinese, you might
+find it beneficial to run pre-training for additional steps starting from our
+Multilingual model on data from your language of interest.
+
+Here is a comparison of training Chinese models with the Multilingual
+`BERT-Base` and Chinese-only `BERT-Base`:
+
+System                  | Chinese
+----------------------- | -------
+XNLI Baseline           | 67.0
+BERT Multilingual Model | 74.2
+BERT Chinese-only Model | 77.2
+
+Similar to English, the single-language model does 3% better than the
+Multilingual model.
+
+## Fine-tuning Example
+
+The multilingual model does **not** require any special consideration or API
+changes. We did update the implementation of `BasicTokenizer` in
+`tokenization.py` to support Chinese character tokenization, so please update
+if you forked it. However, we did not change the tokenization API.
+
+To test the new models, we did modify `run_classifier.py` to add support for
+the [XNLI dataset](https://github.com/facebookresearch/XNLI). This is a
+15-language version of MultiNLI where the dev/test sets have been
+human-translated, and the training set has been machine-translated.
+
+To run the fine-tuning code, please download the
+[XNLI dev/test set](https://s3.amazonaws.com/xnli/XNLI-1.0.zip) and the
+[XNLI machine-translated training set](https://s3.amazonaws.com/xnli/XNLI-MT-1.0.zip)
+and then unpack both .zip files into some directory `$XNLI_DIR`.
+
+Note that the language for fine-tuning is hard-coded into `run_classifier.py`
+(Chinese by default), so please modify `XnliProcessor` if you want to run on
+another language.
+
+This is a large dataset, so training will take a few hours on a GPU (or about
+30 minutes on a Cloud TPU). To run an experiment quickly for debugging, just
+set `num_train_epochs` to a small value like `0.1`.
+
+```shell
+export BERT_BASE_DIR=/path/to/bert/chinese_L-12_H-768_A-12 # or multilingual_L-12_H-768_A-12
+export XNLI_DIR=/path/to/xnli
+
+python run_classifier.py \
+  --task_name=XNLI \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$XNLI_DIR \
+  --vocab_file=$BERT_BASE_DIR/vocab.txt \
+  --bert_config_file=$BERT_BASE_DIR/bert_config.json \
+  --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=32 \
+  --learning_rate=5e-5 \
+  --num_train_epochs=2.0 \
+  --output_dir=/tmp/xnli_output/
+```
+
+With the Chinese-only model, the results should look something like this:
+
+```
+ ***** Eval results *****
+eval_accuracy = 0.774116
+eval_loss = 0.83554
+global_step = 24543
+loss = 0.74603
+```
+
+## Details
+
+### Data Source and Sampling
+
+The languages chosen were the
+[top 100 languages with the largest Wikipedias](https://meta.wikimedia.org/wiki/List_of_Wikipedias).
+The entire Wikipedia dump for each language (excluding user and talk pages) was
+taken as the training data for each language.
+
+However, the size of the Wikipedia for a given language varies greatly, and
+therefore low-resource languages may be "under-represented" in terms of the
+neural network model (under the assumption that languages are "competing" for
+limited model capacity to some extent). At the same time, we also don't want
+to overfit the model by performing thousands of epochs over a tiny Wikipedia
+for a particular language.
+
+To balance these two factors, we performed exponentially smoothed weighting of
+the data during pre-training data creation (and WordPiece vocab creation). In
+other words, let's say that the probability of a language is *P(L)*, e.g.,
+*P(English) = 0.21* means that after concatenating all of the Wikipedias
+together, 21% of our data is English. We exponentiate each probability by some
+factor *S* and then re-normalize, and sample from that distribution. In our
+case we use *S=0.7*. So, high-resource languages like English will be
+under-sampled, and low-resource languages like Icelandic will be over-sampled.
+E.g., in the original distribution English would be sampled 1000x more than
+Icelandic, but after smoothing it's only sampled 100x more.
+
+### Tokenization
+
+For tokenization, we use a 110k shared WordPiece vocabulary. The word counts
+are weighted the same way as the data, so low-resource languages are upweighted
+by some factor. We intentionally do *not* use any marker to denote the input
+language (so that zero-shot training can work).
+
+Because Chinese (and Japanese Kanji and Korean Hanja) does not have whitespace
+characters, we add spaces around every character in the
+[CJK Unicode range](https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_\(Unicode_block\))
+before applying WordPiece. This means that Chinese is effectively
+character-tokenized. Note that the CJK Unicode block only includes
+Chinese-origin characters and does *not* include Hangul Korean or
+Katakana/Hiragana Japanese, which are tokenized with whitespace+WordPiece like
+all other languages.
+
+For all other languages, we apply the
+[same recipe as English](https://github.com/google-research/bert#tokenization):
+(a) lower casing+accent removal, (b) punctuation splitting, (c) whitespace
+tokenization. We understand that accent markers have substantial meaning in
+some languages, but felt that the benefits of reducing the effective vocabulary
+make up for this.
Generally the strong contextual models of BERT should make up for +any ambiguity introduced by stripping accent markers. + +### List of Languages + +The multilingual model supports the following languages. These languages were +chosen because they are the top 100 languages with the largest Wikipedias: + +* Afrikaans +* Albanian +* Arabic +* Aragonese +* Armenian +* Asturian +* Azerbaijani +* Bashkir +* Basque +* Bavarian +* Belarusian +* Bengali +* Bishnupriya Manipuri +* Bosnian +* Breton +* Bulgarian +* Burmese +* Catalan +* Cebuano +* Chechen +* Chinese (Simplified) +* Chinese (Traditional) +* Chuvash +* Croatian +* Czech +* Danish +* Dutch +* English +* Estonian +* Finnish +* French +* Galician +* Georgian +* German +* Greek +* Gujarati +* Haitian +* Hebrew +* Hindi +* Hungarian +* Icelandic +* Ido +* Indonesian +* Irish +* Italian +* Japanese +* Javanese +* Kannada +* Kazakh +* Kirghiz +* Korean +* Latin +* Latvian +* Lithuanian +* Lombard +* Low Saxon +* Luxembourgish +* Macedonian +* Malagasy +* Malay +* Malayalam +* Marathi +* Minangkabau +* Nepali +* Newar +* Norwegian (Bokmal) +* Norwegian (Nynorsk) +* Occitan +* Persian (Farsi) +* Piedmontese +* Polish +* Portuguese +* Punjabi +* Romanian +* Russian +* Scots +* Serbian +* Serbo-Croatian +* Sicilian +* Slovak +* Slovenian +* South Azerbaijani +* Spanish +* Sundanese +* Swahili +* Swedish +* Tagalog +* Tajik +* Tamil +* Tatar +* Telugu +* Turkish +* Ukrainian +* Urdu +* Uzbek +* Vietnamese +* Volapük +* Waray-Waray +* Welsh +* West Frisian +* Western Punjabi +* Yoruba + +The **Multilingual Cased (New)** release contains additionally **Thai** and +**Mongolian**, which were not included in the original release. diff --git a/baselines/models/roberta_wwm_large_ext/optimization.py b/baselines/models/roberta_wwm_large_ext/optimization.py new file mode 100644 index 0000000..d33dabd --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/optimization.py @@ -0,0 +1,174 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Functions and classes related to optimization (weight updates).""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re +import tensorflow as tf + + +def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu): + """Creates an optimizer training op.""" + global_step = tf.train.get_or_create_global_step() + + learning_rate = tf.constant(value=init_lr, shape=[], dtype=tf.float32) + + # Implements linear decay of the learning rate. + learning_rate = tf.train.polynomial_decay( + learning_rate, + global_step, + num_train_steps, + end_learning_rate=0.0, + power=1.0, + cycle=False) + + # Implements linear warmup. I.e., if global_step < num_warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. 
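+  # For example (illustrative numbers): with init_lr=5e-5 and
+  # num_warmup_steps=1000, the learning rate at global_step=100 is
+  # (100/1000) * 5e-5 = 5e-6; after step 1000, the polynomial-decayed
+  # rate above is used instead.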
+  if num_warmup_steps:
+    global_steps_int = tf.cast(global_step, tf.int32)
+    warmup_steps_int = tf.constant(num_warmup_steps, dtype=tf.int32)
+
+    global_steps_float = tf.cast(global_steps_int, tf.float32)
+    warmup_steps_float = tf.cast(warmup_steps_int, tf.float32)
+
+    warmup_percent_done = global_steps_float / warmup_steps_float
+    warmup_learning_rate = init_lr * warmup_percent_done
+
+    is_warmup = tf.cast(global_steps_int < warmup_steps_int, tf.float32)
+    learning_rate = (
+        (1.0 - is_warmup) * learning_rate + is_warmup * warmup_learning_rate)
+
+  # It is recommended that you use this optimizer for fine-tuning, since this
+  # is how the model was trained (note that the Adam m/v variables are NOT
+  # loaded from init_checkpoint.)
+  optimizer = AdamWeightDecayOptimizer(
+      learning_rate=learning_rate,
+      weight_decay_rate=0.01,
+      beta_1=0.9,
+      beta_2=0.999,
+      epsilon=1e-6,
+      exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
+
+  if use_tpu:
+    optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)
+
+  tvars = tf.trainable_variables()
+  grads = tf.gradients(loss, tvars)
+
+  # This is how the model was pre-trained.
+  (grads, _) = tf.clip_by_global_norm(grads, clip_norm=1.0)
+
+  train_op = optimizer.apply_gradients(
+      zip(grads, tvars), global_step=global_step)
+
+  # Normally the global step update is done inside of `apply_gradients`.
+  # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use
+  # a different optimizer, you should probably take this line out.
+  new_global_step = global_step + 1
+  train_op = tf.group(train_op, [global_step.assign(new_global_step)])
+  return train_op
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
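+      #
+      # For example (illustrative numbers): with weight_decay_rate=0.01, a
+      # parameter with value 0.5 contributes 0.01 * 0.5 = 0.005 to `update`
+      # below, which is then scaled by the learning rate -- completely
+      # decoupled from the m/v moment estimates above.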
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name diff --git a/baselines/models/roberta_wwm_large_ext/optimization_test.py b/baselines/models/roberta_wwm_large_ext/optimization_test.py new file mode 100644 index 0000000..4f2dcf1 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/optimization_test.py @@ -0,0 +1,48 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import optimization +import tensorflow as tf + + +class OptimizationTest(tf.test.TestCase): + + def test_adam(self): + with self.test_session() as sess: + w = tf.get_variable( + "w", + shape=[3], + initializer=tf.constant_initializer([0.1, -0.2, -0.1])) + x = tf.constant([0.4, 0.2, -0.5]) + loss = tf.reduce_mean(tf.square(x - w)) + tvars = tf.trainable_variables() + grads = tf.gradients(loss, tvars) + global_step = tf.train.get_or_create_global_step() + optimizer = optimization.AdamWeightDecayOptimizer(learning_rate=0.2) + train_op = optimizer.apply_gradients(zip(grads, tvars), global_step) + init_op = tf.group(tf.global_variables_initializer(), + tf.local_variables_initializer()) + sess.run(init_op) + for _ in range(100): + sess.run(train_op) + w_np = sess.run(w) + self.assertAllClose(w_np.flat, [0.4, 0.2, -0.5], rtol=1e-2, atol=1e-2) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/roberta_wwm_large_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb b/baselines/models/roberta_wwm_large_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb new file mode 100644 index 0000000..466857f --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/predicting_movie_reviews_with_bert_on_tf_hub.ipynb @@ -0,0 +1,1231 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Predicting Movie Reviews with BERT on TF Hub.ipynb", + "version": "0.3.2", + "provenance": [], + "collapsed_sections": [] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "metadata": { + "id": "j0a4mTk9o1Qg", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + 
"# Copyright 2019 Google Inc.\n", + "\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "\n", + "# http://www.apache.org/licenses/LICENSE-2.0\n", + "\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "dCpvgG0vwXAZ", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Predicting Movie Review Sentiment with BERT on TF Hub" + ] + }, + { + "metadata": { + "id": "xiYrZKaHwV81", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "If you’ve been following Natural Language Processing over the past year, you’ve probably heard of BERT: Bidirectional Encoder Representations from Transformers. It’s a neural network architecture designed by Google researchers that’s totally transformed what’s state-of-the-art for NLP tasks, like text classification, translation, summarization, and question answering.\n", + "\n", + "Now that BERT's been added to [TF Hub](https://www.tensorflow.org/hub) as a loadable module, it's easy(ish) to add into existing Tensorflow text pipelines. In an existing pipeline, BERT can replace text embedding layers like ELMO and GloVE. Alternatively, [finetuning](http://wiki.fast.ai/index.php/Fine_tuning) BERT can provide both an accuracy boost and faster training time in many cases.\n", + "\n", + "Here, we'll train a model to predict whether an IMDB movie review is positive or negative using BERT in Tensorflow with tf hub. Some code was adapted from [this colab notebook](https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb). Let's get started!" + ] + }, + { + "metadata": { + "id": "hsZvic2YxnTz", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from sklearn.model_selection import train_test_split\n", + "import pandas as pd\n", + "import tensorflow as tf\n", + "import tensorflow_hub as hub\n", + "from datetime import datetime" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "cp5wfXDx5SPH", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "In addition to the standard libraries we imported above, we'll need to install BERT's python package." 
+ ] + }, + { + "metadata": { + "id": "jviywGyWyKsA", + "colab_type": "code", + "outputId": "166f3005-d219-404f-b201-2a0b75480360", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 51 + } + }, + "cell_type": "code", + "source": [ + "!pip install bert-tensorflow" + ], + "execution_count": 38, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Requirement already satisfied: bert-tensorflow in /usr/local/lib/python3.6/dist-packages (1.0.1)\n", + "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from bert-tensorflow) (1.11.0)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "hhbGEfwgdEtw", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "import bert\n", + "from bert import run_classifier\n", + "from bert import optimization\n", + "from bert import tokenization" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "KVB3eOcjxxm1", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Below, we'll set an output directory location to store our model output and checkpoints. This can be a local directory, in which case you'd set OUTPUT_DIR to the name of the directory you'd like to create. If you're running this code in Google's hosted Colab, the directory won't persist after the Colab session ends.\n", + "\n", + "Alternatively, if you're a GCP user, you can store output in a GCP bucket. To do that, set a directory name in OUTPUT_DIR and the name of the GCP bucket in the BUCKET field.\n", + "\n", + "Set DO_DELETE to rewrite the OUTPUT_DIR if it exists. Otherwise, Tensorflow will load existing model checkpoints from that directory (if they exist)." + ] + }, + { + "metadata": { + "id": "US_EAnICvP7f", + "colab_type": "code", + "outputId": "7780a032-31d4-4794-e6aa-664a5d2ae7dd", + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + } + }, + "cell_type": "code", + "source": [ + "# Set the output directory for saving model file\n", + "# Optionally, set a GCP bucket location\n", + "\n", + "OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:\"string\"}\n", + "#@markdown Whether or not to clear/delete the directory and create a new one\n", + "DO_DELETE = False #@param {type:\"boolean\"}\n", + "#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.\n", + "USE_BUCKET = True #@param {type:\"boolean\"}\n", + "BUCKET = 'BUCKET_NAME' #@param {type:\"string\"}\n", + "\n", + "if USE_BUCKET:\n", + " OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)\n", + " from google.colab import auth\n", + " auth.authenticate_user()\n", + "\n", + "if DO_DELETE:\n", + " try:\n", + " tf.gfile.DeleteRecursively(OUTPUT_DIR)\n", + " except:\n", + " # Doesn't matter if the directory didn't exist\n", + " pass\n", + "tf.gfile.MakeDirs(OUTPUT_DIR)\n", + "print('***** Model output directory: {} *****'.format(OUTPUT_DIR))\n" + ], + "execution_count": 40, + "outputs": [ + { + "output_type": "stream", + "text": [ + "***** Model output directory: gs://bert-tfhub/aclImdb_v1 *****\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "pmFYvkylMwXn", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Data" + ] + }, + { + "metadata": { + "id": "MC_w8SRqN0fr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "First, let's download the dataset, hosted by Stanford. 
The code below, which downloads, extracts, and imports the IMDB Large Movie Review Dataset, is borrowed from [this Tensorflow tutorial](https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub)." + ] + }, + { + "metadata": { + "id": "fom_ff20gyy6", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "from tensorflow import keras\n", + "import os\n", + "import re\n", + "\n", + "# Load all files from a directory in a DataFrame.\n", + "def load_directory_data(directory):\n", + " data = {}\n", + " data[\"sentence\"] = []\n", + " data[\"sentiment\"] = []\n", + " for file_path in os.listdir(directory):\n", + " with tf.gfile.GFile(os.path.join(directory, file_path), \"r\") as f:\n", + " data[\"sentence\"].append(f.read())\n", + " data[\"sentiment\"].append(re.match(\"\\d+_(\\d+)\\.txt\", file_path).group(1))\n", + " return pd.DataFrame.from_dict(data)\n", + "\n", + "# Merge positive and negative examples, add a polarity column and shuffle.\n", + "def load_dataset(directory):\n", + " pos_df = load_directory_data(os.path.join(directory, \"pos\"))\n", + " neg_df = load_directory_data(os.path.join(directory, \"neg\"))\n", + " pos_df[\"polarity\"] = 1\n", + " neg_df[\"polarity\"] = 0\n", + " return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)\n", + "\n", + "# Download and process the dataset files.\n", + "def download_and_load_datasets(force_download=False):\n", + " dataset = tf.keras.utils.get_file(\n", + " fname=\"aclImdb.tar.gz\", \n", + " origin=\"http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\", \n", + " extract=True)\n", + " \n", + " train_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"train\"))\n", + " test_df = load_dataset(os.path.join(os.path.dirname(dataset), \n", + " \"aclImdb\", \"test\"))\n", + " \n", + " return train_df, test_df\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "2abfwdn-g135", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "train, test = download_and_load_datasets()" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "XA8WHJgzhIZf", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "To keep training fast, we'll take a sample of 5000 train and test examples, respectively." 
+      ]
+    },
+    {
+      "metadata": {
+        "id": "lw_F488eixTV",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "train = train.sample(5000)\n",
+        "test = test.sample(5000)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "prRQM8pDi8xI",
+        "colab_type": "code",
+        "outputId": "34445cb8-2be0-4379-fdbc-7794091f6049",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "train.columns"
+      ],
+      "execution_count": 44,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "Index(['sentence', 'sentiment', 'polarity'], dtype='object')"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 44
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "sfRnHSz3iSXz",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "For us, our input data is the 'sentence' column and our label is the 'polarity' column (0, 1 for negative and positive, respectively)."
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IuMOGwFui4it",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "DATA_COLUMN = 'sentence'\n",
+        "LABEL_COLUMN = 'polarity'\n",
+        "# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'\n",
+        "label_list = [0, 1]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "V399W0rqNJ-Z",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "#Data Preprocessing\n",
+        "We'll need to transform our data into a format BERT understands. This involves two steps. First, we create `InputExample`'s using the constructor provided in the BERT library.\n",
+        "\n",
+        "- `text_a` is the text we want to classify, which in this case is the 'sentence' column in our DataFrame. \n",
+        "- `text_b` is used if we're training a model to understand the relationship between sentences (i.e. is `text_b` a translation of `text_a`? Is `text_b` an answer to the question asked by `text_a`?). This doesn't apply to our task, so we can leave `text_b` blank.\n",
+        "- `label` is the label for our example, i.e. True, False"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "p9gEt5SmM6i6",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "cell_type": "code",
+      "source": [
+        "# Use the InputExample class from BERT's run_classifier code to create examples from the data\n",
+        "train_InputExamples = train.apply(lambda x: bert.run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example\n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)\n",
+        "\n",
+        "test_InputExamples = test.apply(lambda x: bert.run_classifier.InputExample(guid=None, \n",
+        "                                                                   text_a = x[DATA_COLUMN], \n",
+        "                                                                   text_b = None, \n",
+        "                                                                   label = x[LABEL_COLUMN]), axis = 1)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "metadata": {
+        "id": "SCZWZtKxObjh",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Next, we need to preprocess our data so that it matches the data BERT was trained on. For this, we'll need to do a couple of things (but don't worry--this is also included in the Python library):\n",
+        "\n",
+        "\n",
+        "1. Lowercase our text (if we're using a BERT lowercase model)\n",
+        "2. Tokenize it (i.e. \"sally says hi\" -> [\"sally\", \"says\", \"hi\"])\n",
+        "3. Break words into WordPieces (i.e. \"calling\" -> [\"call\", \"##ing\"])\n",
+        "4. Map our words to indexes using a vocab file that BERT provides\n",
+        "5. Add special \"CLS\" and \"SEP\" tokens (see the [readme](https://github.com/google-research/bert))\n",
+        "6. Append \"index\" and \"segment\" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))\n",
+        "\n",
+        "Happily, we don't have to worry about most of these details.\n",
+        "\n",
+        "\n"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "qMWiDtpyQSoU",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "IhJSe0QHNG7U",
+        "colab_type": "code",
+        "outputId": "20b28cc7-3cb3-4ce6-bfff-a7847ce3bbaa",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "# This is a path to an uncased (all lowercase) version of BERT\n",
+        "BERT_MODEL_HUB = \"https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1\"\n",
+        "\n",
+        "def create_tokenizer_from_hub_module():\n",
+        "  \"\"\"Get the vocab file and casing info from the Hub module.\"\"\"\n",
+        "  with tf.Graph().as_default():\n",
+        "    bert_module = hub.Module(BERT_MODEL_HUB)\n",
+        "    tokenization_info = bert_module(signature=\"tokenization_info\", as_dict=True)\n",
+        "    with tf.Session() as sess:\n",
+        "      vocab_file, do_lower_case = sess.run([tokenization_info[\"vocab_file\"],\n",
+        "                                            tokenization_info[\"do_lower_case\"]])\n",
+        "      \n",
+        "  return bert.tokenization.FullTokenizer(\n",
+        "      vocab_file=vocab_file, do_lower_case=do_lower_case)\n",
+        "\n",
+        "tokenizer = create_tokenizer_from_hub_module()"
+      ],
+      "execution_count": 47,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "z4oFkhpZBDKm",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Great--we just learned that the BERT model we're using expects lowercase data (that's what's stored in tokenization_info[\"do_lower_case\"]) and we also loaded BERT's vocab file. We also created a tokenizer, which breaks words into word pieces:"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "dsBo6RCtQmwx",
+        "colab_type": "code",
+        "outputId": "9af8c917-90ec-4fe9-897b-79dc89ca88e1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 221
+        }
+      },
+      "cell_type": "code",
+      "source": [
+        "tokenizer.tokenize(\"This here's an example of using the BERT tokenizer\")"
+      ],
+      "execution_count": 48,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "['this',\n",
+              " 'here',\n",
+              " \"'\",\n",
+              " 's',\n",
+              " 'an',\n",
+              " 'example',\n",
+              " 'of',\n",
+              " 'using',\n",
+              " 'the',\n",
+              " 'bert',\n",
+              " 'token',\n",
+              " '##izer']"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 48
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "0OEzfFIt6GIc",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Using our tokenizer, we'll call `run_classifier.convert_examples_to_features` on our InputExamples to convert them into features BERT understands."
+ ] + }, + { + "metadata": { + "id": "LL5W8gEGRTAf", + "colab_type": "code", + "outputId": "65001dda-155b-48fc-b5fc-1e4cabc8dfbf", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1261 + } + }, + "cell_type": "code", + "source": [ + "# We'll set sequences to be at most 128 tokens long.\n", + "MAX_SEQ_LENGTH = 128\n", + "# Convert our train and test features to InputFeatures that BERT understands.\n", + "train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + "test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)" + ], + "execution_count": 49, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i ' m watching this on the sci - fi channel right now . it ' s so horrible i can ' t stop watching it ! i ' m a video ##grapher and this movie makes me sad . i feel bad for anyone associated with this movie . some of the camera work is good . most is very questionable . there are a few decent actors in the flick . too bad they ' re surrounded by what must have been the director ' s relatives . that ' s the only way they could have been qualified to be in a movie ! music was a little better than the acting . if you get around to watching this i hope it [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 1005 1049 3666 2023 2006 1996 16596 1011 10882 3149 2157 2085 1012 2009 1005 1055 2061 9202 1045 2064 1005 1056 2644 3666 2009 999 1045 1005 1049 1037 2678 18657 1998 2023 3185 3084 2033 6517 1012 1045 2514 2919 2005 3087 3378 2007 2023 3185 1012 2070 1997 1996 4950 2147 2003 2204 1012 2087 2003 2200 21068 1012 2045 2024 1037 2261 11519 5889 1999 1996 17312 1012 2205 2919 2027 1005 2128 5129 2011 2054 2442 2031 2042 1996 2472 1005 1055 9064 1012 2008 1005 1055 1996 2069 2126 2027 2071 2031 2042 4591 2000 2022 1999 1037 3185 999 2189 2001 1037 2210 2488 2084 1996 3772 1012 2065 2017 2131 2105 2000 3666 2023 1045 3246 2009 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i have been a fan of pushing dai ##sies since the very beginning . it is wonderful ##ly thought up , and bryan fuller has the most remarkable ideas for this show . < br / > < br / > it is unbelievable on how much tv has been needing a creative , original show like pushing dai ##sies . it is a huge relief to see a show , that is unlike the rest , where as , if you compared it to some of the newer shows , such as scrub ##s and house , you would see the similarities , and it does get ted ##ious at moments to see shows so close in identity . 
< br / > < br [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2031 2042 1037 5470 1997 6183 18765 14625 2144 1996 2200 2927 1012 2009 2003 6919 2135 2245 2039 1010 1998 8527 12548 2038 1996 2087 9487 4784 2005 2023 2265 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 2003 23653 2006 2129 2172 2694 2038 2042 11303 1037 5541 1010 2434 2265 2066 6183 18765 14625 1012 2009 2003 1037 4121 4335 2000 2156 1037 2265 1010 2008 2003 4406 1996 2717 1010 2073 2004 1010 2065 2017 4102 2009 2000 2070 1997 1996 10947 3065 1010 2107 2004 18157 2015 1998 2160 1010 2017 2052 2156 1996 12319 1010 1998 2009 2515 2131 6945 6313 2012 5312 2000 2156 3065 2061 2485 1999 4767 1012 1026 7987 1013 1028 1026 7987 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] this movie starts out promising ##ly , with an early scene in which frank morgan advises against gary cooper ' s marriage to his daughter , anita louise . frank morgan , playing an una ##bas ##hed gold - digger , loudly complain ##s to cooper about his perceived pen ##ury at the hands of his family - including his daughter , anita louise . i am a fan of all 3 actors . frank morgan is ( to my mind ) a hollywood treasure , cooper a legend , and louise a very lovely , versatile and under - appreciated actress seldom seen in the leading role . i also have nothing against teresa wright , and while not blessed with great range , she [SEP]\n", + "INFO:tensorflow:input_ids: 101 2023 3185 4627 2041 10015 2135 1010 2007 2019 2220 3496 1999 2029 3581 5253 25453 2114 5639 6201 1005 1055 3510 2000 2010 2684 1010 12918 8227 1012 3581 5253 1010 2652 2019 14477 22083 9072 2751 1011 28661 1010 9928 17612 2015 2000 6201 2055 2010 8690 7279 13098 2012 1996 2398 1997 2010 2155 1011 2164 2010 2684 1010 12918 8227 1012 1045 2572 1037 5470 1997 2035 1017 5889 1012 3581 5253 2003 1006 2000 2026 2568 1007 1037 5365 8813 1010 6201 1037 5722 1010 1998 8227 1037 2200 8403 1010 22979 1998 2104 1011 12315 3883 15839 2464 1999 1996 2877 2535 1012 1045 2036 2031 2498 2114 12409 6119 1010 1998 2096 2025 10190 2007 2307 2846 1010 2016 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i was over ##taken by the emotion . 
un ##for ##get ##table rendering of a wartime story which is unknown to most people . the performances were fault ##less and outstanding . [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2001 2058 25310 2011 1996 7603 1012 4895 29278 18150 10880 14259 1997 1037 12498 2466 2029 2003 4242 2000 2087 2111 1012 1996 4616 2020 6346 3238 1998 5151 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] soldier blue is a movie with pre ##tension ##s : pre ##tension ##s to be some sort of profound statement on man ' s inhuman ##ity to man , on the white man ' s exploitation of and brutality towards indigenous peoples ; a biting , un ##fl ##in ##ching and sar ##don ##ic commentary on the horrors of vietnam . well , sorry , but it fails mis ##era ##bly to be any of those things . what soldier blue actually is is per ##nic ##ious , tri ##te , badly made , dish ##ones ##t rubbish . < br / > < br / > another reviewer here hit the nail on the head in saying that it appears to be a hybrid of [SEP]\n", + "INFO:tensorflow:input_ids: 101 5268 2630 2003 1037 3185 2007 3653 29048 2015 1024 3653 29048 2015 2000 2022 2070 4066 1997 13769 4861 2006 2158 1005 1055 29582 3012 2000 2158 1010 2006 1996 2317 2158 1005 1055 14427 1997 1998 24083 2875 6284 7243 1025 1037 12344 1010 4895 10258 2378 8450 1998 18906 5280 2594 8570 2006 1996 22812 1997 5148 1012 2092 1010 3374 1010 2021 2009 11896 28616 6906 6321 2000 2022 2151 1997 2216 2477 1012 2054 5268 2630 2941 2003 2003 2566 8713 6313 1010 13012 2618 1010 6649 2081 1010 9841 21821 2102 29132 1012 1026 7987 1013 1028 1026 7987 1013 1028 2178 12027 2182 2718 1996 13774 2006 1996 2132 1999 3038 2008 2009 3544 2000 2022 1037 8893 1997 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Writing example 0 of 5000\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i just watched this today on tv . it was on abc ' s sunday afternoon movie . < br / > < br / > this wasn ' t a very good movie , but for a low budget independent film like this , it was okay . 
there is some suspense in it , but there are so many bad qualities that really bring the movie down . the script is pretty lame , and the plot elements aren ' t very realistic , such as the way a 911 operator would laugh and hang up when someone is reporting a murder . i don ' t know what the writer was thinking when they came up with that idea , but it isn [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2074 3427 2023 2651 2006 2694 1012 2009 2001 2006 5925 1005 1055 4465 5027 3185 1012 1026 7987 1013 1028 1026 7987 1013 1028 2023 2347 1005 1056 1037 2200 2204 3185 1010 2021 2005 1037 2659 5166 2981 2143 2066 2023 1010 2009 2001 3100 1012 2045 2003 2070 23873 1999 2009 1010 2021 2045 2024 2061 2116 2919 11647 2008 2428 3288 1996 3185 2091 1012 1996 5896 2003 3492 20342 1010 1998 1996 5436 3787 4995 1005 1056 2200 12689 1010 2107 2004 1996 2126 1037 19989 6872 2052 4756 1998 6865 2039 2043 2619 2003 7316 1037 4028 1012 1045 2123 1005 1056 2113 2054 1996 3213 2001 3241 2043 2027 2234 2039 2007 2008 2801 1010 2021 2009 3475 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] from hardly alien sounding lasers , to an elementary school style shuttle crash , \" night ##be ##ast \" is better classified as a far ##cic ##al mix of fake blood and bare chest . the almost pornographic style of the film seems to be a failed attempt to recover from a lack of co ##hesive or effective story . the acting however is not nearly as beast ##ly , many of the young , aspiring , actors ad ##mir ##ably showcase a hidden talent . particularly don lei ##fer ##t and jamie ze ##mare ##l , who shed a well needed sha ##rd of light on this otherwise terrible film . 
night ##be ##ast would have never shown up on set had he known the [SEP]\n", + "INFO:tensorflow:input_ids: 101 2013 6684 7344 9391 23965 1010 2000 2019 4732 2082 2806 10382 5823 1010 1000 2305 4783 14083 1000 2003 2488 6219 2004 1037 2521 19053 2389 4666 1997 8275 2668 1998 6436 3108 1012 1996 2471 26932 2806 1997 1996 2143 3849 2000 2022 1037 3478 3535 2000 8980 2013 1037 3768 1997 2522 21579 2030 4621 2466 1012 1996 3772 2174 2003 2025 3053 2004 6841 2135 1010 2116 1997 1996 2402 1010 22344 1010 5889 4748 14503 8231 13398 1037 5023 5848 1012 3391 2123 26947 7512 2102 1998 6175 27838 24376 2140 1010 2040 8328 1037 2092 2734 21146 4103 1997 2422 2006 2023 4728 6659 2143 1012 2305 4783 14083 2052 2031 2196 3491 2039 2006 2275 2018 2002 2124 1996 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] here we have the in ##imi ##table charlie chaplin for ##sa ##king his slap ##stick past to tackle the serious subject of anti - semi ##tism , and into ##ler ##ance in general . he portrays two characters - the sweet , innocent jewish barber - a war veteran , and the ravi ##ng and ruthless dictator , aden ##oid h ##yn ##kel . the jewish ghetto in this country is not safe for long , due to the w ##him ##s of h ##yn ##kel and his armed thugs , who routinely rough up its residents , or leave them alone , dependent upon his mood that day or week . 
the barber is among them , but is befriended by his former commanding officer [SEP]\n", + "INFO:tensorflow:input_ids: 101 2182 2057 2031 1996 1999 27605 10880 4918 23331 2005 3736 6834 2010 14308 21354 2627 2000 11147 1996 3809 3395 1997 3424 1011 4100 17456 1010 1998 2046 3917 6651 1999 2236 1012 2002 17509 2048 3494 1011 1996 4086 1010 7036 3644 13362 1011 1037 2162 8003 1010 1998 1996 16806 3070 1998 18101 21237 1010 16298 9314 1044 6038 11705 1012 1996 3644 17276 1999 2023 2406 2003 2025 3647 2005 2146 1010 2349 2000 1996 1059 14341 2015 1997 1044 6038 11705 1998 2010 4273 24106 1010 2040 19974 5931 2039 2049 3901 1010 2030 2681 2068 2894 1010 7790 2588 2010 6888 2008 2154 2030 2733 1012 1996 13362 2003 2426 2068 1010 2021 2003 23386 2011 2010 2280 7991 2961 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] i really hated this movie and it ' s the first movie written by stephen king that i didn ' t finish . i was truly disappointed , it was the worst crap i ' ve ever seen . what were you thinking making three hours out of it ? it may have a quite good story , but actors ? no . suspense ? no . romance ? no . horror ? no . it didn ' t have anything . < br / > < br / > it ' s got this strange , crazy science man with einstein - hair , the classic thing . not real at all . and a man keep getting younger all the time . 
it seems [SEP]\n", + "INFO:tensorflow:input_ids: 101 1045 2428 6283 2023 3185 1998 2009 1005 1055 1996 2034 3185 2517 2011 4459 2332 2008 1045 2134 1005 1056 3926 1012 1045 2001 5621 9364 1010 2009 2001 1996 5409 10231 1045 1005 2310 2412 2464 1012 2054 2020 2017 3241 2437 2093 2847 2041 1997 2009 1029 2009 2089 2031 1037 3243 2204 2466 1010 2021 5889 1029 2053 1012 23873 1029 2053 1012 7472 1029 2053 1012 5469 1029 2053 1012 2009 2134 1005 1056 2031 2505 1012 1026 7987 1013 1028 1026 7987 1013 1028 2009 1005 1055 2288 2023 4326 1010 4689 2671 2158 2007 15313 1011 2606 1010 1996 4438 2518 1012 2025 2613 2012 2035 1012 1998 1037 2158 2562 2893 3920 2035 1996 2051 1012 2009 3849 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: None\n", + "INFO:tensorflow:tokens: [CLS] story chinese tall story tells the story of righteous monk trip ##ita ##ka , who , along with his guardians monkey , sandy and pigs ##y make their journey west on a quest to recover ancient sutra ##s , finally , they reach the final leg of their journey in sha ##che city but all is not as it seems when the city is attacked by evil tree demons . monkey tries his best to battle them but is overwhelmed , knowing his master is in grave danger , he uses his trust ##y golden staff to thrust trip ##ita ##ka to safety . 
< br / > < br / > the monk ends up being knocked out when he land and when he wakes [SEP]\n", + "INFO:tensorflow:input_ids: 101 2466 2822 4206 2466 4136 1996 2466 1997 19556 8284 4440 6590 2912 1010 2040 1010 2247 2007 2010 14240 10608 1010 7525 1998 14695 2100 2191 2037 4990 2225 2006 1037 8795 2000 8980 3418 26567 2015 1010 2633 1010 2027 3362 1996 2345 4190 1997 2037 4990 1999 21146 5403 2103 2021 2035 2003 2025 2004 2009 3849 2043 1996 2103 2003 4457 2011 4763 3392 7942 1012 10608 5363 2010 2190 2000 2645 2068 2021 2003 13394 1010 4209 2010 3040 2003 1999 6542 5473 1010 2002 3594 2010 3404 2100 3585 3095 2000 7400 4440 6590 2912 2000 3808 1012 1026 7987 1013 1028 1026 7987 1013 1028 1996 8284 4515 2039 2108 6573 2041 2043 2002 2455 1998 2043 2002 17507 102\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 1 (id = 1)\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "ccp5trMwRtmr", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "#Creating a model\n", + "\n", + "Now that we've prepared our data, let's focus on building a model. `create_model` does just this below. First, it loads the BERT tf hub module again (this time to extract the computation graph). Next, it creates a single new layer that will be trained to adapt BERT to our sentiment task (i.e. classifying whether a movie review is positive or negative). This strategy of using a mostly trained model is called [fine-tuning](http://wiki.fast.ai/index.php/Fine_tuning)." 
+ ] + }, + { + "metadata": { + "id": "6o2a5ZIvRcJq", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,\n", + " num_labels):\n", + " \"\"\"Creates a classification model.\"\"\"\n", + "\n", + " bert_module = hub.Module(\n", + " BERT_MODEL_HUB,\n", + " trainable=True)\n", + " bert_inputs = dict(\n", + " input_ids=input_ids,\n", + " input_mask=input_mask,\n", + " segment_ids=segment_ids)\n", + " bert_outputs = bert_module(\n", + " inputs=bert_inputs,\n", + " signature=\"tokens\",\n", + " as_dict=True)\n", + "\n", + " # Use \"pooled_output\" for classification tasks on an entire sentence.\n", + " # Use \"sequence_output\" for token-level output.\n", + " output_layer = bert_outputs[\"pooled_output\"]\n", + "\n", + " hidden_size = output_layer.shape[-1].value\n", + "\n", + " # Create our own layer to tune for the sentiment data.\n", + " output_weights = tf.get_variable(\n", + " \"output_weights\", [num_labels, hidden_size],\n", + " initializer=tf.truncated_normal_initializer(stddev=0.02))\n", + "\n", + " output_bias = tf.get_variable(\n", + " \"output_bias\", [num_labels], initializer=tf.zeros_initializer())\n", + "\n", + " with tf.variable_scope(\"loss\"):\n", + "\n", + " # Dropout helps prevent overfitting\n", + " output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)\n", + "\n", + " logits = tf.matmul(output_layer, output_weights, transpose_b=True)\n", + " logits = tf.nn.bias_add(logits, output_bias)\n", + " log_probs = tf.nn.log_softmax(logits, axis=-1)\n", + "\n", + " # Convert labels into one-hot encoding\n", + " one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)\n", + "\n", + " predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))\n", + " # If we're predicting, we want the predicted labels and the probabilities.\n", + " if is_predicting:\n", + " return (predicted_labels, log_probs)\n", + "\n", + " # If we're in train/eval mode, compute loss between predicted and actual labels\n", + " per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)\n", + " loss = tf.reduce_mean(per_example_loss)\n", + " return (loss, predicted_labels, log_probs)\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "qpE0ZIDOCQzE", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we'll wrap our model function in a `model_fn_builder` function that adapts our model to work for training, evaluation, and prediction." 
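The builder is just a closure: the hyperparameters are bound once, and the Estimator later calls the inner function with `(features, labels, mode, params)` each time it needs to build a graph. A toy sketch of the same pattern (illustrative only, not part of the notebook):

```python
def model_fn_builder_sketch(learning_rate):
    """Toy illustration of the closure pattern used below."""
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        # learning_rate is captured from the enclosing call; the Estimator
        # supplies the other four arguments whenever it builds the graph.
        print("building graph for mode:", mode, "with lr:", learning_rate)
    return model_fn

fn = model_fn_builder_sketch(2e-5)  # hyperparameters bound once, up front
fn({}, None, "train", {})           # Estimator-style invocation later
```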
+ ] + }, + { + "metadata": { + "id": "FnH-AnOQ9KKW", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# model_fn_builder actually creates our model function\n", + "# using the passed parameters for num_labels, learning_rate, etc.\n", + "def model_fn_builder(num_labels, learning_rate, num_train_steps,\n", + " num_warmup_steps):\n", + " \"\"\"Returns `model_fn` closure for TPUEstimator.\"\"\"\n", + " def model_fn(features, labels, mode, params): # pylint: disable=unused-argument\n", + " \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n", + "\n", + " input_ids = features[\"input_ids\"]\n", + " input_mask = features[\"input_mask\"]\n", + " segment_ids = features[\"segment_ids\"]\n", + " label_ids = features[\"label_ids\"]\n", + "\n", + " is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)\n", + " \n", + " # TRAIN and EVAL\n", + " if not is_predicting:\n", + "\n", + " (loss, predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " train_op = bert.optimization.create_optimizer(\n", + " loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)\n", + "\n", + " # Calculate evaluation metrics. \n", + " def metric_fn(label_ids, predicted_labels):\n", + " accuracy = tf.metrics.accuracy(label_ids, predicted_labels)\n", + " f1_score = tf.contrib.metrics.f1_score(\n", + " label_ids,\n", + " predicted_labels)\n", + " auc = tf.metrics.auc(\n", + " label_ids,\n", + " predicted_labels)\n", + " recall = tf.metrics.recall(\n", + " label_ids,\n", + " predicted_labels)\n", + " precision = tf.metrics.precision(\n", + " label_ids,\n", + " predicted_labels) \n", + " true_pos = tf.metrics.true_positives(\n", + " label_ids,\n", + " predicted_labels)\n", + " true_neg = tf.metrics.true_negatives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_pos = tf.metrics.false_positives(\n", + " label_ids,\n", + " predicted_labels) \n", + " false_neg = tf.metrics.false_negatives(\n", + " label_ids,\n", + " predicted_labels)\n", + " return {\n", + " \"eval_accuracy\": accuracy,\n", + " \"f1_score\": f1_score,\n", + " \"auc\": auc,\n", + " \"precision\": precision,\n", + " \"recall\": recall,\n", + " \"true_positives\": true_pos,\n", + " \"true_negatives\": true_neg,\n", + " \"false_positives\": false_pos,\n", + " \"false_negatives\": false_neg\n", + " }\n", + "\n", + " eval_metrics = metric_fn(label_ids, predicted_labels)\n", + "\n", + " if mode == tf.estimator.ModeKeys.TRAIN:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " train_op=train_op)\n", + " else:\n", + " return tf.estimator.EstimatorSpec(mode=mode,\n", + " loss=loss,\n", + " eval_metric_ops=eval_metrics)\n", + " else:\n", + " (predicted_labels, log_probs) = create_model(\n", + " is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)\n", + "\n", + " predictions = {\n", + " 'probabilities': log_probs,\n", + " 'labels': predicted_labels\n", + " }\n", + " return tf.estimator.EstimatorSpec(mode, predictions=predictions)\n", + "\n", + " # Return the actual model function in the closure\n", + " return model_fn\n" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "OjwJ4bTeWXD8", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute train and warmup steps from batch size\n", + "# These hyperparameters are copied from this colab notebook 
(https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)\n", + "BATCH_SIZE = 32\n", + "LEARNING_RATE = 2e-5\n", + "NUM_TRAIN_EPOCHS = 3.0\n", + "# Warmup is a period of time where the learning rate \n", + "# is small and gradually increases--usually helps training.\n", + "WARMUP_PROPORTION = 0.1\n", + "# Model configs\n", + "SAVE_CHECKPOINTS_STEPS = 500\n", + "SAVE_SUMMARY_STEPS = 100" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "emHf9GhfWBZ_", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Compute # train and warmup steps from batch size\n", + "num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)\n", + "num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "oEJldMr3WYZa", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Specify output directory and number of checkpoint steps to save\n", + "run_config = tf.estimator.RunConfig(\n", + " model_dir=OUTPUT_DIR,\n", + " save_summary_steps=SAVE_SUMMARY_STEPS,\n", + " save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "q_WebpS1X97v", + "colab_type": "code", + "outputId": "1648932a-7391-49d3-8af7-52d514e226e8", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 156 + } + }, + "cell_type": "code", + "source": [ + "model_fn = model_fn_builder(\n", + " num_labels=len(label_list),\n", + " learning_rate=LEARNING_RATE,\n", + " num_train_steps=num_train_steps,\n", + " num_warmup_steps=num_warmup_steps)\n", + "\n", + "estimator = tf.estimator.Estimator(\n", + " model_fn=model_fn,\n", + " config=run_config,\n", + " params={\"batch_size\": BATCH_SIZE})\n" + ], + "execution_count": 55, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Using config: {'_model_dir': 'gs://bert-tfhub/aclImdb_v1', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true\n", + "graph_options {\n", + " rewrite_options {\n", + " meta_optimizer_iterations: ONE\n", + " }\n", + "}\n", + ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "NOO3RfG1DYLo", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Next we create an input builder function that takes our training feature set (`train_features`) and produces a generator. This is a pretty standard design pattern for working with TensorFlow [Estimators](https://www.tensorflow.org/guide/estimators)." + ] + }, + { + "metadata": { + "id": "1Pv2bAlOX_-K", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "# Create an input function for training. 
drop_remainder = True for using TPUs.\n", + "train_input_fn = bert.run_classifier.input_fn_builder(\n", + " features=train_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=True,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "t6Nukby2EB6-", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now we train our model! For me, using a Colab notebook running on Google's GPUs, my training time was about 14 minutes." + ] + }, + { + "metadata": { + "id": "nucD4gluYJmK", + "colab_type": "code", + "outputId": "5d728e72-4631-42bf-c48d-3f51d4b968ce", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 68 + } + }, + "cell_type": "code", + "source": [ + "print(f'Beginning Training!')\n", + "current_time = datetime.now()\n", + "estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)\n", + "print(\"Training took time \", datetime.now() - current_time)" + ], + "execution_count": 57, + "outputs": [ + { + "output_type": "stream", + "text": [ + "Beginning Training!\n", + "INFO:tensorflow:Skipping training since max_steps has already saved.\n", + "Training took time 0:00:00.759709\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "CmbLTVniARy3", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's use our test data to see how well our model did:" + ] + }, + { + "metadata": { + "id": "JIhejfpyJ8Bx", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "test_input_fn = run_classifier.input_fn_builder(\n", + " features=test_features,\n", + " seq_length=MAX_SEQ_LENGTH,\n", + " is_training=False,\n", + " drop_remainder=False)" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "PPVEXhNjYXC-", + "colab_type": "code", + "outputId": "dd5482cd-c558-465f-c854-ec11a0175316", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 445 + } + }, + "cell_type": "code", + "source": [ + "estimator.evaluate(input_fn=test_input_fn, steps=None)" + ], + "execution_count": 59, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n" + ], + "name": "stdout" + }, + { + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/gradients_impl.py:110: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.\n", + " \"Converting sparse IndexedSlices to a dense Tensor of unknown shape. 
\"\n" + ], + "name": "stderr" + }, + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Starting evaluation at 2019-02-12T21:04:20Z\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n", + "INFO:tensorflow:Finished evaluation at 2019-02-12-21:06:05\n", + "INFO:tensorflow:Saving dict for global step 468: auc = 0.86659324, eval_accuracy = 0.8664, f1_score = 0.8659711, false_negatives = 375.0, false_positives = 293.0, global_step = 468, loss = 0.51870537, precision = 0.880457, recall = 0.8519542, true_negatives = 2174.0, true_positives = 2158.0\n", + "INFO:tensorflow:Saving 'checkpoint_path' summary for global step 468: gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n" + ], + "name": "stdout" + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'auc': 0.86659324,\n", + " 'eval_accuracy': 0.8664,\n", + " 'f1_score': 0.8659711,\n", + " 'false_negatives': 375.0,\n", + " 'false_positives': 293.0,\n", + " 'global_step': 468,\n", + " 'loss': 0.51870537,\n", + " 'precision': 0.880457,\n", + " 'recall': 0.8519542,\n", + " 'true_negatives': 2174.0,\n", + " 'true_positives': 2158.0}" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 59 + } + ] + }, + { + "metadata": { + "id": "ueKsULteiz1B", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Now let's write code to make predictions on new sentences:" + ] + }, + { + "metadata": { + "id": "OsrbTD2EJTVl", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "def getPrediction(in_sentences):\n", + " labels = [\"Negative\", \"Positive\"]\n", + " input_examples = [run_classifier.InputExample(guid=\"\", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, \"\" is just a dummy label\n", + " input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)\n", + " predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)\n", + " predictions = estimator.predict(predict_input_fn)\n", + " return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "-thbodgih_VJ", + "colab_type": "code", + "colab": {} + }, + "cell_type": "code", + "source": [ + "pred_sentences = [\n", + " \"That movie was absolutely awful\",\n", + " \"The acting was a bit lacking\",\n", + " \"The film was creative and surprising\",\n", + " \"Absolutely fantastic!\"\n", + "]" + ], + "execution_count": 0, + "outputs": [] + }, + { + "metadata": { + "id": "QrZmvZySKQTm", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 649 + }, + "outputId": "3891fafb-a460-4eb8-fa6c-335a5bbc10e5" + }, + "cell_type": "code", + "source": [ + "predictions = getPrediction(pred_sentences)" + ], + "execution_count": 72, + "outputs": [ + { + "output_type": "stream", + "text": [ + "INFO:tensorflow:Writing example 0 of 4\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] that movie was absolutely awful [SEP]\n", + "INFO:tensorflow:input_ids: 101 2008 3185 2001 7078 9643 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the acting was a bit lacking [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 3772 2001 1037 2978 11158 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] the film was creative and surprising [SEP]\n", + "INFO:tensorflow:input_ids: 101 1996 2143 2001 5541 1998 11341 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:guid: \n", + "INFO:tensorflow:tokens: [CLS] absolutely fantastic ! 
[SEP]\n", + "INFO:tensorflow:input_ids: 101 7078 10392 999 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:label: 0 (id = 0)\n", + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Saver not created because there are no variables in the graph to restore\n", + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Restoring parameters from gs://bert-tfhub/aclImdb_v1/model.ckpt-468\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n" + ], + "name": "stdout" + } + ] + }, + { + "metadata": { + "id": "MXkRiEBUqN3n", + "colab_type": "text" + }, + "cell_type": "markdown", + "source": [ + "Voila! We have a sentiment classifier!" + ] + }, + { + "metadata": { + "id": "ERkTE8-7oQLZ", + "colab_type": "code", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 221 + }, + "outputId": "26c33224-dc2c-4b3d-f7b4-ac3ef0a58b27" + }, + "cell_type": "code", + "source": [ + "predictions" + ], + "execution_count": 73, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "[('That movie was absolutely awful',\n", + " array([-4.9142293e-03, -5.3180690e+00], dtype=float32),\n", + " 'Negative'),\n", + " ('The acting was a bit lacking',\n", + " array([-0.03325794, -3.4200459 ], dtype=float32),\n", + " 'Negative'),\n", + " ('The film was creative and surprising',\n", + " array([-5.3589125e+00, -4.7171740e-03], dtype=float32),\n", + " 'Positive'),\n", + " ('Absolutely fantastic!',\n", + " array([-5.0434084 , -0.00647258], dtype=float32),\n", + " 'Positive')]" + ] + }, + "metadata": { + "tags": [] + }, + "execution_count": 73 + } + ] + } + ] +} \ No newline at end of file diff --git a/baselines/models/roberta_wwm_large_ext/requirements.txt b/baselines/models/roberta_wwm_large_ext/requirements.txt new file mode 100644 index 0000000..357b5ea --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/requirements.txt @@ -0,0 +1,2 @@ +tensorflow >= 1.11.0 # CPU Version of TensorFlow. +# tensorflow-gpu >= 1.11.0 # GPU version of TensorFlow. diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier.py b/baselines/models/roberta_wwm_large_ext/run_classifier.py new file mode 100644 index 0000000..bf16886 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier.py @@ -0,0 +1,1585 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-09 23:37:20 +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +# Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +# Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. 
If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file,delimiter="\t", quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter=delimiter, quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + @classmethod + def _read_txt(cls, input_file): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = f.readlines() + lines = [] + for line in reader: + lines.append(line.strip().split("_!_")) + return lines +class THUCNewsProcessor(DataProcessor): + """Processor for the THUCNews data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base 
class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(14): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 or len(line) < 3: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class iFLYTEKDataProcessor(DataProcessor): + """Processor for the iFLYTEKData data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = [] + for i in range(119): + labels.append(str(i)) + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = None + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class InewsProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "train.txt")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_txt(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + labels = ["0", "1", "2"] + return labels + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[2]) + text_b = tokenization.convert_to_unicode(line[3]) + #if set_type == "test": + # label = "0" + #else: + # label = tokenization.convert_to_unicode(line[0]) + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + + return feature + +def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return [InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False)] + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + must_len = len(tokens_a) + 3 + extra_len = max_seq_length - must_len + feature_list = [] + if example.text_b and extra_len > 0: + extra_num = int((len(tokens_b) -1) / extra_len) + 1 + for num in range(extra_num): + max_len = min((num+1)*extra_len, 
len(tokens_b))
+      tokens_b_sub = tokens_b[num*extra_len: max_len]
+      feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example)
+      feature_list.append(feature)
+  else:
+    feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example)
+    feature_list.append(feature)
+  return feature_list
+
+
+def file_based_convert_examples_to_features_for_inews(
+    examples, label_list, max_seq_length, tokenizer, output_file):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+  num_example = 0
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 1000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature_list = convert_example_list_for_inews(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+    num_example += len(feature_list)
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    for feature in feature_list:
+      features["input_ids"] = create_int_feature(feature.input_ids)
+      features["input_mask"] = create_int_feature(feature.input_mask)
+      features["segment_ids"] = create_int_feature(feature.segment_ids)
+      features["label_ids"] = create_int_feature([feature.label_id])
+      features["is_real_example"] = create_int_feature(
+          [int(feature.is_real_example)])
+
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+  tf.logging.info("feature num: %s", num_example)
+  writer.close()
+
+class TnewsProcessor(DataProcessor):
+  """Processor for the TNEWS data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = tokenization.convert_to_unicode(line[3])
+      text_b = None
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class XnliProcessor(DataProcessor):
+  """Processor for the XNLI data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(
+        os.path.join(data_dir, "train.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "train-%d" % (i)
+      text_a = tokenization.convert_to_unicode(line[0])
+      text_b = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[2])
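+      # The raw XNLI training file labels contradictions as "contradictory",
+      # while dev/test use "contradiction"; normalize so every split shares
+      # the label set returned by get_labels().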
+      if label == tokenization.convert_to_unicode("contradictory"):
+        label = tokenization.convert_to_unicode("contradiction")
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "dev-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    lines = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "test-%d" % (i)
+      language = tokenization.convert_to_unicode(line[0])
+      if language != tokenization.convert_to_unicode(self.language):
+        continue
+      text_a = tokenization.convert_to_unicode(line[6])
+      text_b = tokenization.convert_to_unicode(line[7])
+      label = tokenization.convert_to_unicode(line[1])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+  def get_labels(self):
+    """See base class."""
+    return ["contradiction", "entailment", "neutral"]
+
+class LCQMCProcessor(DataProcessor):
+  """Processor for the LCQMC sentence-pair classification data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+    # dev_0827.tsv
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+    # return ["-1","0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    print("length of lines:", len(lines))
+    for (i, line) in enumerate(lines):
+      # print('#i:',i,line)
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[2])
+        text_a = tokenization.convert_to_unicode(line[0])
+        text_b = tokenization.convert_to_unicode(line[1])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:
+        print('###error.i:', i, line)
+    return examples
+
+class JDCOMMENTProcessor(DataProcessor):
+  """Processor for the internal data set.
sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "jd_train.csv"),",", "\""), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "jd_dev.csv"),",", "\""), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "jd_test.csv"),",", "\""), "test") + + def get_labels(self): + """See base class.""" + return ["1", "2", "3", "4", "5"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + #print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[0]) + text_a = tokenization.convert_to_unicode(line[1]) + text_b = tokenization.convert_to_unicode(line[2]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + +class BQProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = tokenization.convert_to_unicode(line[2]) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + 
if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+ # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = 
create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
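+  # get_pooled_output() returns a [batch_size, hidden_size] Tensor: the final
+  # hidden state at the [CLS] position, passed through a dense+tanh pooling
+  # layer inside modeling.BertModel.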
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, 
label_ids, logits, is_real_example]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
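+# It mirrors file_based_convert_examples_to_features above, but keeps the
+# resulting `InputFeatures` in memory instead of writing a TFRecord file.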
+def convert_examples_to_features(examples, label_list, max_seq_length,
+                                 tokenizer):
+  """Convert a set of `InputExample`s to a list of `InputFeatures`."""
+
+  features = []
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 10000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenizer)
+
+    features.append(feature)
+  return features
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  processors = {
+      "cola": ColaProcessor,
+      "mnli": MnliProcessor,
+      "mrpc": MrpcProcessor,
+      "xnli": XnliProcessor,
+      "tnews": TnewsProcessor,
+      "jdcomment": JDCOMMENTProcessor,
+      "inews": InewsProcessor,
+      "thucnews": THUCNewsProcessor,
+      "lcqmc": LCQMCProcessor,
+      "bq": BQProcessor,
+      "iflydata": iFLYTEKDataProcessor
+  }
+
+  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
+                                                FLAGS.init_checkpoint)
+
+  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train`, `do_eval` or `do_predict` must be True.")
+
+  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
+
+  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
+    raise ValueError(
+        "Cannot use sequence length %d because the BERT model "
+        "was only trained up to sequence length %d" %
+        (FLAGS.max_seq_length, bert_config.max_position_embeddings))
+
+  tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+
+  label_list = processor.get_labels()
+
+  tokenizer = tokenization.FullTokenizer(
+      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)
+
+  tpu_cluster_resolver = None
+  if FLAGS.use_tpu and FLAGS.tpu_name:
+    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
+        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
+
+  is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
+  run_config = tf.contrib.tpu.RunConfig(
+      cluster=tpu_cluster_resolver,
+      master=FLAGS.master,
+      model_dir=FLAGS.output_dir,
+      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
+      tpu_config=tf.contrib.tpu.TPUConfig(
+          iterations_per_loop=FLAGS.iterations_per_loop,
+          num_shards=FLAGS.num_tpu_cores,
+          per_host_input_for_training=is_per_host))
+
+  train_examples = None
+  num_train_steps = None
+  num_warmup_steps = None
+  if FLAGS.do_train:
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    num_train_steps = int(
+        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
+    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)
+
+  model_fn = model_fn_builder(
+      bert_config=bert_config,
+      num_labels=len(label_list),
+      init_checkpoint=FLAGS.init_checkpoint,
+      learning_rate=FLAGS.learning_rate,
+      num_train_steps=num_train_steps,
+      num_warmup_steps=num_warmup_steps,
+      use_tpu=FLAGS.use_tpu,
+      use_one_hot_embeddings=FLAGS.use_tpu)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
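+  # With use_tpu=False, TPUEstimator behaves like a standard Estimator and
+  # still reads the train/eval/predict batch-size flags passed below.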
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + ## dev dataset + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "dev.tf_record") + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
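+    # (TPU evaluation loops are compiled for a fixed number of iterations,
+    # so eval_steps cannot be left as None when use_tpu is set.)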
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints in output_dir; you can then pick the one with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_roberta_wwm_large_ext.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+    ## test dataset
+    eval_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_eval_examples = len(eval_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on. These do NOT count towards the metric (all tf.metrics
+      # support a per-instance weight, and these get a weight of 0.0).
+      while len(eval_examples) % FLAGS.eval_batch_size != 0:
+        eval_examples.append(PaddingInputExample())
+
+    eval_file = os.path.join(FLAGS.output_dir, "test.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file)
+
+    tf.logging.info("***** Running evaluation *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(eval_examples), num_actual_eval_examples,
+                    len(eval_examples) - num_actual_eval_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)
+
+    # This tells the estimator to run through the entire set.
+    eval_steps = None
+    # However, if running eval on the TPU, you will need to specify the
+    # number of steps.
+    if FLAGS.use_tpu:
+      assert len(eval_examples) % FLAGS.eval_batch_size == 0
+      eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)
+
+    eval_drop_remainder = True if FLAGS.use_tpu else False
+    eval_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=eval_drop_remainder)
+
+    #######################################################################################################################
+    # Evaluate all checkpoints in output_dir; you can then pick the one with the best dev accuracy.
+    steps_and_files = []
+    filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
+    for filename in filenames:
+      if filename.endswith(".index"):
+        ckpt_name = filename[:-6]
+        cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
+        global_step = int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    output_eval_file = os.path.join(FLAGS.data_dir, "test_results_roberta_wwm_large_ext.txt")
+    print("output_eval_file:", output_eval_file)
+    tf.logging.info("output_eval_file:" + output_eval_file)
+    with tf.gfile.GFile(output_eval_file, "w") as writer:
+      for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps, checkpoint_path=filename)
+
+        tf.logging.info("***** Eval results %s *****" % (filename))
+        writer.write("***** Eval results %s *****\n" % (filename))
+        for key in sorted(result.keys()):
+          tf.logging.info("  %s = %s", key, str(result[key]))
+          writer.write("%s = %s\n" % (key, str(result[key])))
+    #######################################################################################################################
+
+  if FLAGS.do_predict:
+    predict_examples = processor.get_test_examples(FLAGS.data_dir)
+    num_actual_predict_examples = len(predict_examples)
+    if FLAGS.use_tpu:
+      # TPU requires a fixed batch size for all batches, therefore the number
+      # of examples must be a multiple of the batch size, or else examples
+      # will get dropped. So we pad with fake examples which are ignored
+      # later on.
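+      # The padded examples are dropped again below: the writer stops after
+      # num_actual_predict_examples lines when emitting test_results.tsv.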
+      while len(predict_examples) % FLAGS.predict_batch_size != 0:
+        predict_examples.append(PaddingInputExample())
+
+    predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(predict_examples, label_list,
+                                                        FLAGS.max_seq_length, tokenizer,
+                                                        predict_file)
+    else:
+      file_based_convert_examples_to_features(predict_examples, label_list,
+                                              FLAGS.max_seq_length, tokenizer,
+                                              predict_file)
+
+    tf.logging.info("***** Running prediction *****")
+    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
+                    len(predict_examples), num_actual_predict_examples,
+                    len(predict_examples) - num_actual_predict_examples)
+    tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)
+
+    predict_drop_remainder = True if FLAGS.use_tpu else False
+    predict_input_fn = file_based_input_fn_builder(
+        input_file=predict_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=predict_drop_remainder)
+
+    result = estimator.predict(input_fn=predict_input_fn)
+
+    output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv")
+    with tf.gfile.GFile(output_predict_file, "w") as writer:
+      num_written_lines = 0
+      tf.logging.info("***** Predict results *****")
+      for (i, prediction) in enumerate(result):
+        probabilities = prediction["probabilities"]
+        if i >= num_actual_predict_examples:
+          break
+        output_line = "\t".join(
+            str(class_probability)
+            for class_probability in probabilities) + "\n"
+        writer.write(output_line)
+        num_written_lines += 1
+    assert num_written_lines == num_actual_predict_examples
+
+
+if __name__ == "__main__":
+  flags.mark_flag_as_required("data_dir")
+  flags.mark_flag_as_required("task_name")
+  flags.mark_flag_as_required("vocab_file")
+  flags.mark_flag_as_required("bert_config_file")
+  flags.mark_flag_as_required("output_dir")
+  tf.app.run()
diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_bq.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_bq.sh
new file mode 100644
index 0000000..7f056d8
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/run_classifier_bq.sh
@@ -0,0 +1,67 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date: 2019-11-04 09:56:36
+# @Last Modified by: bo.shi
+# @Last Modified time: 2019-11-11 09:57:21
+
+TASK_NAME="bq"
+MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16"
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export CUDA_VISIBLE_DEVICES="0"
+export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model
+export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME
+export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets
+
+# download and unzip dataset
+if [ ! -d $GLUE_DATA_DIR ]; then
+  mkdir -p $GLUE_DATA_DIR
+  echo "makedir $GLUE_DATA_DIR"
+fi
+cd $GLUE_DATA_DIR
+if [ ! -d $TASK_NAME ]; then
+  mkdir $TASK_NAME
+  echo "makedir $GLUE_DATA_DIR/$TASK_NAME"
+fi
+cd $TASK_NAME
+if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then
+  rm *
+  wget https://storage.googleapis.com/chineseglue/tasks/bq.zip
+  unzip bq.zip
+  rm bq.zip
+else
+  echo "data exists"
+fi
+echo "Finish download dataset."
+
+# download model
+if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then
+  mkdir -p $ROBERTA_WWM_LARGE_DIR
+  echo "makedir $ROBERTA_WWM_LARGE_DIR"
+fi
+cd $ROBERTA_WWM_LARGE_DIR
+if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ !
-f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_iflydata.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_iflydata.sh new file mode 100644 index 0000000..550a9a1 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_iflydata.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:34 + +TASK_NAME="iflydata" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_inews.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_inews.sh new file mode 100644 index 0000000..d52efd3 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_inews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:38 + +TASK_NAME="inews" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_lcqmc.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_lcqmc.sh new file mode 100644 index 0000000..d85348f --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_lcqmc.sh @@ -0,0 +1,63 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:42 + +TASK_NAME="lcqmc" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_thucnews.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_thucnews.sh new file mode 100644 index 0000000..d9188a4 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_thucnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:47 + +TASK_NAME="thucnews" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_tnews.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_tnews.sh new file mode 100644 index 0000000..7be5b7b --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_tnews.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:50 + +TASK_NAME="tnews" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_with_tfhub.py b/baselines/models/roberta_wwm_large_ext/run_classifier_with_tfhub.py new file mode 100644 index 0000000..9d2f80f --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_with_tfhub.py @@ -0,0 +1,314 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner with TF-Hub.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import optimization +import run_classifier +import tokenization +import tensorflow as tf +import tensorflow_hub as hub + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "bert_hub_module_handle", None, + "Handle for the BERT TF-Hub module.") + + +def create_model(is_training, input_ids, input_mask, segment_ids, labels, + num_labels, bert_hub_module_handle): + """Creates a classification model.""" + tags = set() + if is_training: + tags.add("train") + bert_module = hub.Module(bert_hub_module_handle, tags=tags, trainable=True) + bert_inputs = dict( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids) + bert_outputs = bert_module( + inputs=bert_inputs, + signature="tokens", + as_dict=True) + + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use + # bert_outputs["sequence_output"] instead. + output_layer = bert_outputs["pooled_output"] + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + logits = tf.matmul(output_layer, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + + +def model_fn_builder(num_labels, learning_rate, num_train_steps, + num_warmup_steps, use_tpu, bert_hub_module_handle): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, probabilities) = create_model( + is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, + bert_hub_module_handle) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + 
loss=total_loss, + train_op=train_op) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy(label_ids, predictions) + loss = tf.metrics.mean(per_example_loss) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics) + elif mode == tf.estimator.ModeKeys.PREDICT: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions={"probabilities": probabilities}) + else: + raise ValueError( + "Only TRAIN, EVAL and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def create_tokenizer_from_hub_module(bert_hub_module_handle): + """Get the vocab file and casing info from the Hub module.""" + with tf.Graph().as_default(): + bert_module = hub.Module(bert_hub_module_handle) + tokenization_info = bert_module(signature="tokenization_info", as_dict=True) + with tf.Session() as sess: + vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], + tokenization_info["do_lower_case"]]) + return tokenization.FullTokenizer( + vocab_file=vocab_file, do_lower_case=do_lower_case) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": run_classifier.ColaProcessor, + "mnli": run_classifier.MnliProcessor, + "mrpc": run_classifier.MrpcProcessor, + } + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + tf.gfile.MakeDirs(FLAGS.output_dir) + + task_name = FLAGS.task_name.lower() + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = create_tokenizer_from_hub_module(FLAGS.bert_hub_module_handle) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + num_labels=len(label_list), + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + bert_hub_module_handle=FLAGS.bert_hub_module_handle) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
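+  # NOTE (editorial sketch, not part of the original runner): the step budget
+  # above is plain arithmetic and easy to sanity-check offline; the helper
+  # below mirrors it (the example sizes in the trailing comment are made up).
+  def _step_budget(num_examples, batch_size, epochs, warmup_proportion=0.1):
+    """Mirrors the num_train_steps / num_warmup_steps computation above."""
+    steps = int(num_examples / batch_size * epochs)
+    return steps, int(steps * warmup_proportion)
+  # e.g. _step_budget(10000, 32, 3.0) -> (937, 93)
+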
+ estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_features = run_classifier.convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = run_classifier.input_fn_builder( + features=train_features, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_features = run_classifier.convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. + if FLAGS.use_tpu: + # Eval will be slightly WRONG on the TPU because it will truncate + # the last batch. + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = run_classifier.input_fn_builder( + features=eval_features, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + if FLAGS.do_predict: + predict_examples = processor.get_test_examples(FLAGS.data_dir) + if FLAGS.use_tpu: + # Discard batch remainder if running on TPU + n = len(predict_examples) + predict_examples = predict_examples[:(n - n % FLAGS.predict_batch_size)] + + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + run_classifier.file_based_convert_examples_to_features( + predict_examples, label_list, FLAGS.max_seq_length, tokenizer, + predict_file) + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + predict_input_fn = run_classifier.file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=FLAGS.use_tpu) + + result = estimator.predict(input_fn=predict_input_fn) + + output_predict_file = os.path.join(FLAGS.output_dir, "test_results.tsv") + with tf.gfile.GFile(output_predict_file, "w") as writer: + tf.logging.info("***** Predict results *****") + for prediction in result: + probabilities = prediction["probabilities"] + output_line = "\t".join( + str(class_probability) + for class_probability in probabilities) + "\n" + writer.write(output_line) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + 
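+  # NOTE (editorial sketch, not part of the original file): on TPU the eval
+  # and predict paths above must know their batch count up front, so the test
+  # set is trimmed to a whole number of batches. The same arithmetic alone:
+  def _trim_to_batch_multiple(examples, batch_size):
+    n = len(examples)
+    return examples[:n - n % batch_size]
+  # len(_trim_to_batch_multiple(list(range(103)), 8)) -> 96
+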
flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("bert_hub_module_handle") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/run_classifier_xnli.sh b/baselines/models/roberta_wwm_large_ext/run_classifier_xnli.sh new file mode 100644 index 0000000..e7a87c6 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_classifier_xnli.sh @@ -0,0 +1,67 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:57:55 + +TASK_NAME="xnli" +MODEL_NAME="chinese_roberta_wwm_large_ext_L-24_H-1024_A-16" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export ROBERTA_WWM_LARGE_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $ROBERTA_WWM_LARGE_DIR ]; then + mkdir -p $ROBERTA_WWM_LARGE_DIR + echo "makedir $ROBERTA_WWM_LARGE_DIR" +fi +cd $ROBERTA_WWM_LARGE_DIR +if [ ! -f "bert_config.json" ] || [ ! -f "vocab.txt" ] || [ ! -f "bert_model.ckpt.index" ] || [ ! -f "bert_model.ckpt.meta" ] || [ ! -f "bert_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + unzip chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip + rm chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --vocab_file=$ROBERTA_WWM_LARGE_DIR/vocab.txt \ + --bert_config_file=$ROBERTA_WWM_LARGE_DIR/bert_config.json \ + --init_checkpoint=$ROBERTA_WWM_LARGE_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=3.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_ner.py b/baselines/models/roberta_wwm_large_ext/run_ner.py new file mode 100644 index 0000000..85cf82f --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_ner.py @@ -0,0 +1,844 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import modeling +import optimization +import tokenization +import tensorflow as tf +from sklearn.metrics import f1_score, precision_score, recall_score +from tensorflow.python.ops import math_ops +import tf_metrics +import pickle +import codecs +import sys + +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +flags = tf.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string( + "data_dir", None, + "The input datadir.", +) + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model." +) + +flags.DEFINE_string( + "task_name", None, "The name of the task to train." +) + +flags.DEFINE_string( + "token_name", "full", "The name of the task to train." +) + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written." +) + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model)." +) + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text." +) + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization." +) + +flags.DEFINE_bool( + "do_train", False, + "Whether to run training." +) +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool("do_predict", False, "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + label: (Optional) string. The label of the example. This should be + specified for train and dev examples, but not for test examples. 
+ """ + self.guid = guid + self.text = text + self.label = label + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, input_ids, input_mask, segment_ids, label_ids, label_mask): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_ids = label_ids + self.label_mask = label_mask + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_data(cls, input_file): + """Reads a BIO data.""" + with open(input_file) as f: + lines = [] + words = [] + labels = [] + for line in f: + contends = line.strip() + word = line.strip().split(' ')[0] + label = line.strip().split(' ')[-1] + if contends.startswith("-DOCSTART-"): + words.append('') + continue + if len(contends) == 0 and words[-1] == '.': + l = ' '.join([label for label in labels if len(label) > 0]) + w = ' '.join([word for word in words if len(word) > 0]) + lines.append([l, w]) + words = [] + labels = [] + continue + if len(contends) == 0: + continue + words.append(word) + labels.append(label) + return lines + + +class NerProcessor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "train.txt")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "dev.txt")), "dev" + ) + + def get_test_examples(self, data_dir): + return self._create_example( + self._read_data(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + # return ["I-MISC", "I-PER", "I-ORG", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + return ["B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[0]) + examples.append(InputExample(guid=guid, text=text, label=label)) + return examples + +class WeiboNERProcessor(DataProcessor): + def __init_(self): + self.labels = set() + + def get_train_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train" + ) + + def get_dev_examples(self, data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev" + ) + + def get_test_examples(self,data_dir): + return self._create_example( + self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test") + + + def get_labels(self): + return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM', 'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + # return ['B-PER.NOM', 'I-PER.NOM', 'B-LOC.NAM', 'B-PER.NAM', 'I-PER.NAM', 'B-GPE.NAM', 'I-GPE.NAM', 'B-ORG.NAM', 'I-ORG.NAM', 'B-ORG.NOM', 'I-ORG.NOM', 'I-LOC.NAM', 'B-LOC.NOM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"] + + def _create_example(self, lines, set_type): + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % 
+class WeiboNERProcessor(DataProcessor):
+  def __init__(self):
+    self.labels = set()
+
+  def get_train_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "weiboNER.conll.train")), "train")
+
+  def get_dev_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "weiboNER.conll.dev")), "dev")
+
+  def get_test_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "weiboNER.conll.test")), "test")
+
+  def get_labels(self):
+    return ['I-PER.NOM', 'I-PER.NAM', 'I-GPE.NAM', 'I-ORG.NAM', 'I-ORG.NOM',
+            'I-LOC.NAM', 'I-LOC.NOM', "O", "X", "[CLS]", "[SEP]"]
+
+  def _create_example(self, lines, set_type):
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "%s-%s" % (set_type, i)
+      text = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[0])
+      examples.append(InputExample(guid=guid, text=text, label=label))
+    return examples
+
+  def _read_raw(self, input_file):
+    with codecs.open(input_file, 'r', encoding='utf-8') as f:
+      lines = []
+      words = []
+      labels = []
+      for line in f:
+        contends = line.strip()
+        tokens = contends.split()
+        if len(tokens) == 2:
+          words.append(tokens[0])
+          label = tokens[-1]
+          # Fold B- tags into I- tags; see the sketch above.
+          if label[0] == 'B':
+            label = "I" + label[1:]
+          labels.append(label)
+        else:
+          if len(contends) == 0 and len(words) > 0:
+            label = []
+            word = []
+            for l, w in zip(labels, words):
+              if len(l) > 0 and len(w) > 0:
+                label.append(l)
+                word.append(w)
+            lines.append([' '.join(label), ' '.join(word)])
+            words = []
+            labels = []
+            continue
+          if contends.startswith("-DOCSTART-"):
+            continue
+      # Flush the last sentence if the file does not end with a blank line.
+      if len(words) > 0:
+        lines.append([' '.join(labels), ' '.join(words)])
+      return lines
+
+
+class MsraNERProcessor(DataProcessor):
+  def __init__(self):
+    self.labels = set()
+
+  def get_train_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "train1.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "testright1.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    return self._create_example(
+        self._read_raw(os.path.join(data_dir, "testright1.txt")), "test")
+
+  def get_labels(self):
+    return ['B-PERSON', 'I-PERSON', 'B-LOCATION', 'I-LOCATION',
+            'B-ORGANIZATION', 'I-ORGANIZATION', "O", "[CLS]", "[SEP]", "X"]
+
+  def _create_example(self, lines, set_type):
+    examples = []
+    for (i, line) in enumerate(lines):
+      guid = "%s-%s" % (set_type, i)
+      text = tokenization.convert_to_unicode(line[1])
+      label = tokenization.convert_to_unicode(line[0])
+      examples.append(InputExample(guid=guid, text=text, label=label))
+    return examples
+
+  def _read_raw(self, input_file):
+    """Reads MSRA "word/tag" data; nr=person, ns=location, nt=organization."""
+    with codecs.open(input_file, 'r', encoding='utf-8') as f:
+      lines = []
+      chars = []
+      labels = []
+      len_count = []
+      for line in f:
+        contends = line.strip()
+        tokens = contends.split()
+        for token in tokens:
+          word, label = token.split('/')
+          if label == "nr":
+            chars = chars + list(word)
+            labels = labels + ['B-PERSON'] + ['I-PERSON'] * (len(word) - 1)
+          elif label == "ns":
+            chars = chars + list(word)
+            labels = labels + ['B-LOCATION'] + ['I-LOCATION'] * (len(word) - 1)
+          elif label == "nt":
+            chars = chars + list(word)
+            labels = labels + ['B-ORGANIZATION'] + ['I-ORGANIZATION'] * (len(word) - 1)
+          else:
+            assert label == "o"
+            chars = chars + list(word)
+            labels = labels + ["O"] * len(word)
+        lines.append([' '.join(labels), ' '.join(chars)])
+        len_count.append(len(chars))
+        chars = []
+        labels = []
+      return lines
+
+
+def write_tokens(tokens, mode):
+  """Appends the non-padding tokens of a test example to token_test.txt."""
+  if mode == "test":
+    path = os.path.join(FLAGS.output_dir, "token_" + mode + ".txt")
+    wf = open(path, 'a')
+    for token in tokens:
+      if token != "**NULL**":
+        wf.write(token + '\n')
+    wf.close()
+
+
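+# NOTE (editorial sketch, not part of the original file): convert_single_example
+# below aligns word-level labels with WordPiece sub-tokens: the first piece of
+# a word keeps the word's label and trailing pieces get "X", which label_mask
+# then drops from the loss. The alignment step in isolation (`tokenize` is any
+# word -> pieces function):
+def _align_labels(words, word_labels, tokenize):
+  """Returns (sub_tokens, sub_labels); only the first piece keeps the label."""
+  tokens, labels = [], []
+  for word, label in zip(words, word_labels):
+    pieces = tokenize(word)
+    tokens.extend(pieces)
+    labels.extend([label] + ["X"] * (len(pieces) - 1))
+  return tokens, labels
+# _align_labels(["Washington"], ["B-LOC"], lambda w: ["Wash", "##ington"])
+# -> (["Wash", "##ington"], ["B-LOC", "X"])
+
+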
+def convert_single_example(ex_index, example, label_list, max_seq_length,
+                           tokenizer, output_dir, mode):
+  """Converts a single `InputExample` into a single `InputFeatures`."""
+  label_map = {}
+  # Label ids start at 1; id 0 is reserved for padding positions.
+  for (i, label) in enumerate(label_list, 1):
+    label_map[label] = i
+
+  if not os.path.exists(os.path.join(output_dir, 'label2id.pkl')):
+    with open(os.path.join(output_dir, 'label2id.pkl'), 'wb') as w:
+      pickle.dump(label_map, w)
+  textlist = example.text.split(' ')
+  labellist = example.label.split(' ')
+  tokens = []
+  labels = []
+  label_mask = []
+  for i, word in enumerate(textlist):
+    token = tokenizer.tokenize(word)
+    tokens.extend(token)
+    label_1 = labellist[i]
+    for m in range(len(token)):
+      if m == 0:
+        labels.append(label_1)
+      else:
+        labels.append("X")
+
+  # Reserve two positions for [CLS] and [SEP].
+  if len(tokens) >= max_seq_length - 1:
+    tokens = tokens[0:(max_seq_length - 2)]
+    labels = labels[0:(max_seq_length - 2)]
+  ntokens = []
+  segment_ids = []
+  label_ids = []
+  ntokens.append("[CLS]")
+  segment_ids.append(0)
+  # [CLS] gets its own label id but is masked out of the loss.
+  label_ids.append(label_map["[CLS]"])
+  label_mask.append(0)  # not predicted or trained on
+  for i, token in enumerate(tokens):
+    ntokens.append(token)
+    segment_ids.append(0)
+    label_ids.append(label_map[labels[i]])
+    if labels[i] == 'X':
+      label_mask.append(0)
+    else:
+      label_mask.append(1)
+  ntokens.append("[SEP]")
+  segment_ids.append(0)
+  label_mask.append(0)
+  # [SEP] likewise carries a label id but contributes no loss.
+  label_ids.append(label_map["[SEP]"])
+  input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+  input_mask = [1] * len(input_ids)
+  # Pad up to max_seq_length; padding positions get label id 0 and mask 0,
+  # so they contribute nothing to training or evaluation.
+  while len(input_ids) < max_seq_length:
+    input_ids.append(0)
+    input_mask.append(0)
+    segment_ids.append(0)
+    label_ids.append(0)
+    ntokens.append("**NULL**")
+    label_mask.append(0)
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+  assert len(label_ids) == max_seq_length
+  assert len(label_mask) == max_seq_length
+
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("tokens: %s" % " ".join(
+        [tokenization.printable_text(x) for x in tokens]))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
+    tf.logging.info("label_mask: %s" % " ".join([str(x) for x in label_mask]))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_ids=label_ids,
+      label_mask=label_mask)
+  write_tokens(ntokens, mode)
+  return feature
+
+
+def file_based_convert_examples_to_features(
+    examples, label_list, max_seq_length, tokenizer, output_file, output_dir,
+    mode=None):
+  """Converts a set of `InputExample`s to a TFRecord file."""
+  writer = tf.python_io.TFRecordWriter(output_file)
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 5000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenizer, output_dir, mode)
+
+    def create_int_feature(values):
+      return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_int_feature(feature.input_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+    features["label_ids"] = create_int_feature(feature.label_ids)
+    features["label_mask"] = create_int_feature(feature.label_mask)
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training, drop_remainder):
+  name_to_features = {
+      # All features are padded to seq_length up front, so shapes are static.
"input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_mask": tf.FixedLenFeature([seq_length], tf.int64), + } + + def _decode_record(record, name_to_features): + example = tf.parse_single_example(record, name_to_features) + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + return example + + def input_fn(params): + batch_size = params["batch_size"] + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + d = d.apply(tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder + )) + return d + + return input_fn + + +def create_model(bert_config, is_training, input_ids, input_mask, label_mask, + segment_ids, labels, num_labels, use_one_hot_embeddings): + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings + ) + + output_layer = model.get_sequence_output() + + hidden_size = output_layer.shape[-1].value + + output_weight = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02) + ) + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer() + ) + with tf.variable_scope("loss"): + if is_training: + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + output_layer = tf.reshape(output_layer, [-1, hidden_size]) + logits = tf.matmul(output_layer, output_weight, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [-1, FLAGS.max_seq_length, num_labels]) + # mask = tf.cast(input_mask,tf.float32) + # loss = tf.contrib.seq2seq.sequence_loss(logits,labels,mask) + # return (loss, logits, predict) + ########################################################################## + log_probs = tf.nn.log_softmax(logits, axis=-1) + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + mask = tf.cast(label_mask, tf.float32) + mask_example_loss = per_example_loss * mask + loss = tf.reduce_sum(mask_example_loss) + probabilities = tf.nn.softmax(logits, axis=-1) + predict = tf.argmax(probabilities, axis=-1) + return (loss, mask_example_loss, logits, predict) + ########################################################################## + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + def model_fn(features, labels, mode, params): + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + label_mask = features["label_mask"] + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (total_loss, per_example_loss, logits, predicts) = create_model( + bert_config, is_training, input_ids, input_mask, label_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + tvars = 
tf.trainable_variables() + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(tvars, + init_checkpoint) + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + if use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + tf.logging.info("**** Trainable Variables ****") + + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + hook_dict = {} + hook_dict['loss'] = total_loss + hook_dict['global_steps'] = tf.train.get_or_create_global_step() + logging_hook = tf.train.LoggingTensorHook( + hook_dict, every_n_iter=200) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn, + training_hooks=[logging_hook]) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(per_example_loss, label_ids, logits): + # def metric_fn(label_ids, logits): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + # labels = [] + # for i, x in enumerate() + predict_labels = [] + # for i in range(1, num_labels - 4): + # predict_labels.append(i) + # precision = tf_metrics.precision(label_ids, predictions, num_labels, predict_labels, average="macro") + # recall = tf_metrics.recall(label_ids, predictions, num_labels, predict_labels, average="macro") + # f = tf_metrics.f1(label_ids, predictions, num_labels, predict_labels, average="macro") + + precision = tf_metrics.precision(label_ids, predictions, num_labels, average="macro") + recall = tf_metrics.recall(label_ids, predictions, num_labels, average="macro") + f = tf_metrics.f1(label_ids, predictions, num_labels, average="macro") + + # + return { + "eval_precision": precision, + "eval_recall": recall, + "eval_f": f, + # "eval_loss": loss, + } + + eval_metrics = (metric_fn, [per_example_loss, label_ids, logits]) + # eval_metrics = (metric_fn, [label_ids, logits]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predicts, scaffold_fn=scaffold_fn + ) + return output_spec + + return model_fn + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + processors = { + "ner": NerProcessor, + "weiboner": WeiboNERProcessor, + "msraner": MsraNERProcessor + } + # if not FLAGS.do_train and not FLAGS.do_eval: + # raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + if not os.path.exists(FLAGS.output_dir): + os.mkdir(FLAGS.output_dir) + task_name = FLAGS.task_name.lower() + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + processor = 
processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + + if FLAGS.do_train: + train_examples = processor.get_train_examples(FLAGS.data_dir) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + print(num_train_steps) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list) + 1, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file, FLAGS.output_dir) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + if FLAGS.do_eval: + eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file, FLAGS.output_dir) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d", len(eval_examples)) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + eval_steps = None + if FLAGS.use_tpu: + eval_steps = int(len(eval_examples) / FLAGS.eval_batch_size) + eval_drop_remainder = True if FLAGS.use_tpu else False + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + + if 
FLAGS.do_predict: + + pred_tags = [] + true_tags = [] + + token_path = os.path.join(FLAGS.output_dir, "token_test.txt") + label_file = os.path.join(FLAGS.output_dir, "label2id.pkl") + label_masks = [] + with open(label_file, "rb") as rf: + label2id = pickle.load(rf) + id2label = {value: key for key, value in label2id.items()} + if os.path.exists(token_path): + os.remove(token_path) + predict_examples = processor.get_test_examples(FLAGS.data_dir) + ground_truth_file = os.path.join(FLAGS.output_dir, "ground_truth.txt") + with open(ground_truth_file, 'w') as writer: + for ex_index, example in enumerate(predict_examples): + feature = convert_single_example(ex_index, example, label_list, FLAGS.max_seq_length, tokenizer, FLAGS.output_dir, "test") + line = [] + for i, id in enumerate(feature.label_ids): + if feature.label_mask[i] == 1: + line.append(id2label[id]) + true_tags.append(id2label[id]) + # output_line = " ".join(id2label[id] for id in feature.label_ids if id != 0) + "\n" + output_line = " ".join(line) + "\n" + writer.write(output_line) + label_masks.append(feature.label_mask) + predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record") + file_based_convert_examples_to_features(predict_examples, label_list, + FLAGS.max_seq_length, tokenizer, + predict_file, FLAGS.output_dir, mode="test") + + tf.logging.info("***** Running prediction*****") + tf.logging.info(" Num examples = %d", len(predict_examples)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + if FLAGS.use_tpu: + # Warning: According to tpu_estimator.py Prediction on TPU is an + # experimental feature and hence not supported here + raise ValueError("Prediction in TPU not supported") + predict_drop_remainder = True if FLAGS.use_tpu else False + predict_input_fn = file_based_input_fn_builder( + input_file=predict_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=predict_drop_remainder) + + result = estimator.predict(input_fn=predict_input_fn) + output_predict_file = os.path.join(FLAGS.output_dir, "label_test.txt") + + with open(output_predict_file, 'w') as writer: + for i, prediction in enumerate(result): + line = [] + for j, x in enumerate(prediction): + if label_masks[i][j] == 0: + continue + else: + line.append(id2label[x]) + # writer.write(id2label[x] + "\n") + pred_tags.append(id2label[x]) + output_line = " ".join(line) + "\n" + # # output_line = " ".join(id2label[id] for id in prediction if id != 0) + "\n" + writer.write(output_line) + # evaluate(true_tags, pred_tags, verbose=True) + # evaluate(true_tags, pred_tags) + + tmp = codecs.open(os.path.join(FLAGS.output_dir, "tmp"), 'w', 'utf8') + with codecs.open(ground_truth_file, 'r', 'utf8') as ft, codecs.open(output_predict_file, 'r', 'utf8') as fg: + for lt, lg in zip(ft, fg): + for tl, tg in zip(lt.strip().split(), lg.strip().split()): + print('\t'.join([" ", tl, tg]), file=tmp) + tmp.close() + cmd = "python %s -d '\t' < %s > %s" % \ + (os.path.join(os.getcwd(), "conlleval.py"), \ + os.path.join(FLAGS.output_dir, "tmp"), \ + os.path.join(FLAGS.data_dir, "test_results_roberta_wwm_large_ext.txt")) + os.system(cmd) + + +if __name__ == "__main__": + flags.mark_flag_as_required("data_dir") + flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/run_ner_msra.sh b/baselines/models/roberta_wwm_large_ext/run_ner_msra.sh new file mode 100644 
index 0000000..c235729 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_ner_msra.sh @@ -0,0 +1,20 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16 +export GLUE_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets/ +TASK_NAME="msraner" + +python run_ner.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=false \ + --do_predict=true \ + --data_dir=$GLUE_DIR/$TASK_NAME \ + --vocab_file=$BERT_BASE_DIR/vocab.txt \ + --bert_config_file=$BERT_BASE_DIR/bert_config.json \ + --init_checkpoint=$BERT_BASE_DIR/bert_model.ckpt \ + --max_seq_length=256 \ + --train_batch_size=8 \ + --learning_rate=2e-5 \ + --num_train_epochs=5.0 \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ diff --git a/baselines/models/roberta_wwm_large_ext/run_pretraining.py b/baselines/models/roberta_wwm_large_ext/run_pretraining.py new file mode 100644 index 0000000..b118f62 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_pretraining.py @@ -0,0 +1,493 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run masked LM/next sentence masked_lm pre-training for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import modeling +import optimization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string( + "input_file", None, + "Input TF example files (can be a glob or comma separated).") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded. Must match data generation.") + +flags.DEFINE_integer( + "max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence. 
" + "Must match data generation.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_integer("num_train_steps", 100000, "Number of training steps.") + +flags.DEFINE_integer("num_warmup_steps", 10000, "Number of warmup steps.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer("max_eval_steps", 100, "Maximum number of eval steps.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + masked_lm_positions = features["masked_lm_positions"] + masked_lm_ids = features["masked_lm_ids"] + masked_lm_weights = features["masked_lm_weights"] + next_sentence_labels = features["next_sentence_labels"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + (masked_lm_loss, + masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output( + bert_config, model.get_sequence_output(), model.get_embedding_table(), + masked_lm_positions, masked_lm_ids, masked_lm_weights) + + (next_sentence_loss, next_sentence_example_loss, + next_sentence_log_probs) = get_next_sentence_output( + bert_config, model.get_pooled_output(), next_sentence_labels) + + total_loss = masked_lm_loss + next_sentence_loss + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + 
tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + + def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels): + """Computes the loss and accuracy of the model.""" + masked_lm_log_probs = tf.reshape(masked_lm_log_probs, + [-1, masked_lm_log_probs.shape[-1]]) + masked_lm_predictions = tf.argmax( + masked_lm_log_probs, axis=-1, output_type=tf.int32) + masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1]) + masked_lm_ids = tf.reshape(masked_lm_ids, [-1]) + masked_lm_weights = tf.reshape(masked_lm_weights, [-1]) + masked_lm_accuracy = tf.metrics.accuracy( + labels=masked_lm_ids, + predictions=masked_lm_predictions, + weights=masked_lm_weights) + masked_lm_mean_loss = tf.metrics.mean( + values=masked_lm_example_loss, weights=masked_lm_weights) + + next_sentence_log_probs = tf.reshape( + next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]]) + next_sentence_predictions = tf.argmax( + next_sentence_log_probs, axis=-1, output_type=tf.int32) + next_sentence_labels = tf.reshape(next_sentence_labels, [-1]) + next_sentence_accuracy = tf.metrics.accuracy( + labels=next_sentence_labels, predictions=next_sentence_predictions) + next_sentence_mean_loss = tf.metrics.mean( + values=next_sentence_example_loss) + + return { + "masked_lm_accuracy": masked_lm_accuracy, + "masked_lm_loss": masked_lm_mean_loss, + "next_sentence_accuracy": next_sentence_accuracy, + "next_sentence_loss": next_sentence_mean_loss, + } + + eval_metrics = (metric_fn, [ + masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids, + masked_lm_weights, next_sentence_example_loss, + next_sentence_log_probs, next_sentence_labels + ]) + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + eval_metrics=eval_metrics, + scaffold_fn=scaffold_fn) + else: + raise ValueError("Only TRAIN and EVAL modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def get_masked_lm_output(bert_config, input_tensor, output_weights, positions, + label_ids, label_weights): + """Get loss and log probs for the masked LM.""" + input_tensor = gather_indexes(input_tensor, positions) + + with tf.variable_scope("cls/predictions"): + # We apply one more non-linear transformation before the output layer. + # This matrix is not used after pre-training. 
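+    # NOTE (editorial sketch, not part of the original file): with the tied
+    # output weights used below, the LM head reduces to a matmul against the
+    # embedding table plus a per-token bias. A NumPy reference of the same
+    # shape algebra (names invented for illustration):
+    def _lm_logits_reference(hidden, embedding_table, bias):
+      """hidden [n, H] x embedding_table [V, H] -> logits [n, V]."""
+      import numpy as np  # local import; sketch only
+      return np.matmul(hidden, embedding_table.T) + bias
+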
+ with tf.variable_scope("transform"): + input_tensor = tf.layers.dense( + input_tensor, + units=bert_config.hidden_size, + activation=modeling.get_activation(bert_config.hidden_act), + kernel_initializer=modeling.create_initializer( + bert_config.initializer_range)) + input_tensor = modeling.layer_norm(input_tensor) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + output_bias = tf.get_variable( + "output_bias", + shape=[bert_config.vocab_size], + initializer=tf.zeros_initializer()) + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + label_ids = tf.reshape(label_ids, [-1]) + label_weights = tf.reshape(label_weights, [-1]) + + one_hot_labels = tf.one_hot( + label_ids, depth=bert_config.vocab_size, dtype=tf.float32) + + # The `positions` tensor might be zero-padded (if the sequence is too + # short to have the maximum number of predictions). The `label_weights` + # tensor has a value of 1.0 for every real prediction and 0.0 for the + # padding predictions. + per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1]) + numerator = tf.reduce_sum(label_weights * per_example_loss) + denominator = tf.reduce_sum(label_weights) + 1e-5 + loss = numerator / denominator + + return (loss, per_example_loss, log_probs) + + +def get_next_sentence_output(bert_config, input_tensor, labels): + """Get loss and log probs for the next sentence prediction.""" + + # Simple binary classification. Note that 0 is "next sentence" and 1 is + # "random sentence". This weight matrix is not used after pre-training. + with tf.variable_scope("cls/seq_relationship"): + output_weights = tf.get_variable( + "output_weights", + shape=[2, bert_config.hidden_size], + initializer=modeling.create_initializer(bert_config.initializer_range)) + output_bias = tf.get_variable( + "output_bias", shape=[2], initializer=tf.zeros_initializer()) + + logits = tf.matmul(input_tensor, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + labels = tf.reshape(labels, [-1]) + one_hot_labels = tf.one_hot(labels, depth=2, dtype=tf.float32) + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return (loss, per_example_loss, log_probs) + + +def gather_indexes(sequence_tensor, positions): + """Gathers the vectors at the specific positions over a minibatch.""" + sequence_shape = modeling.get_shape_list(sequence_tensor, expected_rank=3) + batch_size = sequence_shape[0] + seq_length = sequence_shape[1] + width = sequence_shape[2] + + flat_offsets = tf.reshape( + tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1]) + flat_positions = tf.reshape(positions + flat_offsets, [-1]) + flat_sequence_tensor = tf.reshape(sequence_tensor, + [batch_size * seq_length, width]) + output_tensor = tf.gather(flat_sequence_tensor, flat_positions) + return output_tensor + + +def input_fn_builder(input_files, + max_seq_length, + max_predictions_per_seq, + is_training, + num_cpu_threads=4): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + name_to_features = { + "input_ids": + tf.FixedLenFeature([max_seq_length], tf.int64), + "input_mask": + tf.FixedLenFeature([max_seq_length], tf.int64), + "segment_ids": + 
tf.FixedLenFeature([max_seq_length], tf.int64), + "masked_lm_positions": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_ids": + tf.FixedLenFeature([max_predictions_per_seq], tf.int64), + "masked_lm_weights": + tf.FixedLenFeature([max_predictions_per_seq], tf.float32), + "next_sentence_labels": + tf.FixedLenFeature([1], tf.int64), + } + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = tf.data.Dataset.from_tensor_slices(tf.constant(input_files)) + d = d.repeat() + d = d.shuffle(buffer_size=len(input_files)) + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_cpu_threads, len(input_files)) + + # `sloppy` mode means that the interleaving is not exact. This adds + # even more randomness to the training pipeline. + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + d = d.shuffle(buffer_size=100) + else: + d = tf.data.TFRecordDataset(input_files) + # Since we evaluate for a fixed number of steps we don't want to encounter + # out-of-range exceptions. + d = d.repeat() + + # We must `drop_remainder` on training because the TPU requires fixed + # size dimensions. For eval, we assume we are evaluating on the CPU or GPU + # and we *don't* want to drop the remainder, otherwise we wont cover + # every sample. + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_cpu_threads, + drop_remainder=True)) + return d + + return input_fn + + +def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
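+  # NOTE (editorial sketch, not part of the original file): get_masked_lm_output
+  # above averages per-position losses with label_weights, so padded prediction
+  # slots contribute nothing; the 1e-5 in the denominator guards an all-pad
+  # batch. The same weighted mean in plain Python:
+  def _weighted_mean(losses, weights):
+    num = sum(l * w for l, w in zip(losses, weights))
+    return num / (sum(weights) + 1e-5)
+  # _weighted_mean([2.0, 3.0, 9.9], [1.0, 1.0, 0.0]) -> ~2.5
+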
+ for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + if not FLAGS.do_train and not FLAGS.do_eval: + raise ValueError("At least one of `do_train` or `do_eval` must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) + + tf.logging.info("*** Input Files ***") + for input_file in input_files: + tf.logging.info(" %s" % input_file) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=FLAGS.num_train_steps, + num_warmup_steps=FLAGS.num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + + if FLAGS.do_train: + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + train_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=True) + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps) + + if FLAGS.do_eval: + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + eval_input_fn = input_fn_builder( + input_files=input_files, + max_seq_length=FLAGS.max_seq_length, + max_predictions_per_seq=FLAGS.max_predictions_per_seq, + is_training=False) + + result = estimator.evaluate( + input_fn=eval_input_fn, steps=FLAGS.max_eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/run_squad.py b/baselines/models/roberta_wwm_large_ext/run_squad.py new file mode 100644 index 0000000..edd4c3e --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/run_squad.py @@ -0,0 +1,1283 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Run BERT on SQuAD 1.1 and SQuAD 2.0.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import json +import math +import os +import random +import modeling +import optimization +import tokenization +import six +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters +flags.DEFINE_string("train_file", None, + "SQuAD json for training. E.g., train-v1.1.json") + +flags.DEFINE_string( + "predict_file", None, + "SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 384, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_integer( + "doc_stride", 128, + "When splitting up a long document into chunks, how much stride to " + "take between chunks.") + +flags.DEFINE_integer( + "max_query_length", 64, + "The maximum number of tokens for the question. Questions longer than " + "this will be truncated to this length.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_predict", False, "Whether to run eval on the dev set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("predict_batch_size", 8, + "Total batch size for predictions.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_integer( + "n_best_size", 20, + "The total number of n-best predictions to generate in the " + "nbest_predictions.json output file.") + +flags.DEFINE_integer( + "max_answer_length", 30, + "The maximum length of an answer that can be generated. 
This is needed " + "because the start and end predictions are not conditioned on one another.") + + flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + + tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + + tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located. If not " + "specified, we will attempt to automatically detect the zone from " + "metadata.") + + tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + + tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + + flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + + flags.DEFINE_bool( + "verbose_logging", False, + "If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.") + + flags.DEFINE_bool( + "version_2_with_negative", False, + "If true, the SQuAD examples contain some that do not have an answer.") + + flags.DEFINE_float( + "null_score_diff_threshold", 0.0, + "If null_score - best_non_null is greater than the threshold, predict null.") + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (tokenization.printable_text(self.qas_id)) + s += ", question_text: %s" % ( + tokenization.printable_text(self.question_text)) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.end_position: + s += ", end_position: %d" % (self.end_position) + if self.is_impossible: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " 
" or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + + if FLAGS.version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - + 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join( + doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + tokenization.whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + tf.logging.warning("Could not find answer: '%s' vs. 
'%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + unique_id = 1000000000 + + for (example_index, example) in enumerate(examples): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in query_tokens: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + start_position = None + end_position = None + if is_training and not example.is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and example.is_impossible: + start_position = 0 + end_position = 0 + + if example_index < 20: + tf.logging.info("*** Example ***") + tf.logging.info("unique_id: %s" % (unique_id)) + tf.logging.info("example_index: %s" % (example_index)) + tf.logging.info("doc_span_index: %s" % (doc_span_index)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("token_to_orig_map: %s" % " ".join( + ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)])) + tf.logging.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context) + ])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and example.is_impossible: + tf.logging.info("impossible example") + if is_training and not example.is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + tf.logging.info("start_position: %d" % (start_position)) + tf.logging.info("end_position: %d" % (end_position)) + tf.logging.info( + "answer: %s" % (tokenization.printable_text(answer_text))) + + feature = InputFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + start_position=start_position, + end_position=end_position, + is_impossible=example.is_impossible) + + # Run callback + output_fn(feature) + + unique_id += 1 + + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + + # The SQuAD annotations are character based. We first project them to + # whitespace-tokenized words. But then after WordPiece tokenization, we can + # often find a "better match". For example: + # + # Question: What year was John Smith born? + # Context: The leader was John Smith (1895-1943). + # Answer: 1895 + # + # The original whitespace-tokenized answer will be "(1895-1943).". However, + # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match + # the exact answer, 1895. + # + # However, this is not always possible. Consider the following: + # + # Question: What country is the top exporter of electronics? + # Context: The Japanese electronics industry is the largest in the world.
+ # Answer: Japan + # + # In this case, the annotator chose "Japan" as a character sub-span of + # the word "Japanese". Since our WordPiece tokenizer does not split + # "Japanese", we just use "Japanese" as the annotation. This is fairly rare + # in SQuAD, but does happen. + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + return (start_logits, end_logits) + + +def model_fn_builder(bert_config, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for 
TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + unique_ids = features["unique_ids"] + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + (start_logits, end_logits) = create_model( + bert_config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings) + + tvars = tf.trainable_variables() + + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + seq_length = modeling.get_shape_list(input_ids)[1] + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = features["start_positions"] + end_positions = features["end_positions"] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.PREDICT: + predictions = { + "unique_ids": unique_ids, + "start_logits": start_logits, + "end_logits": end_logits, + } + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + raise ValueError( + "Only TRAIN and PREDICT modes are supported: %s" % (mode)) + + return output_spec + + return model_fn + + +def input_fn_builder(input_file, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports 
tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file): + """Write final predictions to the json file and log-odds of null if needed.""" + tf.logging.info("Writing predictions to: %s" % (output_prediction_file)) + tf.logging.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if FLAGS.version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions.
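+ # E.g. a candidate whose start index falls inside the question has no entry + # in token_to_orig_map and is skipped, as is any span that runs backwards or + # is longer than max_answer_length.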
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + + if FLAGS.version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # if we didn't include the empty option in the n-best, include it + if FLAGS.version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", start_logit=null_start_logit, + end_logit=null_end_logit)) + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure.
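+ # The nonce entry ("empty" with zero logits) is only added when no valid + # prediction survived the filters above, so a real answer is never displaced.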
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not FLAGS.version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > FLAGS.null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if FLAGS.version_2_with_negative: + with tf.gfile.GFile(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases, in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned.
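+ # Concretely, _strip_spaces("Steve Smith's") returns ("SteveSmith's", + # {0: 0, 1: 1, ..., 4: 4, 5: 6, ...}): index 5 of the stripped string maps + # past the removed space, and the same mapping built from the tokenized text + # lets us carry the `pred_text` offsets back into `orig_text`.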
+ tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if FLAGS.verbose_logging: + tf.logging.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if FLAGS.verbose_logging: + tf.logging.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in six.iteritems(tok_ns_to_s_map): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if FLAGS.verbose_logging: + tf.logging.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +class FeatureWriter(object): + """Writes InputFeatures to a TF example file.""" + + def __init__(self, filename, is_training): + self.filename = filename + self.is_training = is_training + self.num_features = 0 + self._writer = tf.python_io.TFRecordWriter(filename) + + def process_feature(self, feature): + """Write an InputFeature to the TFRecordWriter as a tf.train.Example.""" + self.num_features += 1 + + def create_int_feature(values): + feature = tf.train.Feature( + int64_list=tf.train.Int64List(value=list(values))) + return feature + + features = collections.OrderedDict() + features["unique_ids"] = create_int_feature([feature.unique_id]) + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + + if self.is_training: + features["start_positions"] = create_int_feature([feature.start_position]) + features["end_positions"] = create_int_feature([feature.end_position]) + impossible = 
0 + if feature.is_impossible: + impossible = 1 + features["is_impossible"] = create_int_feature([impossible]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + self._writer.write(tf_example.SerializeToString()) + + def close(self): + self._writer.close() + + +def validate_flags_or_throw(bert_config): + """Validate the input FLAGS or throw an exception.""" + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_predict: + raise ValueError("At least one of `do_train` or `do_predict` must be True.") + + if FLAGS.do_train: + if not FLAGS.train_file: + raise ValueError( + "If `do_train` is True, then `train_file` must be specified.") + if FLAGS.do_predict: + if not FLAGS.predict_file: + raise ValueError( + "If `do_predict` is True, then `predict_file` must be specified.") + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + if FLAGS.max_seq_length <= FLAGS.max_query_length + 3: + raise ValueError( + "The max_seq_length (%d) must be greater than max_query_length " + "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length)) + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + validate_flags_or_throw(bert_config) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + train_examples = read_squad_examples( + input_file=FLAGS.train_file, is_training=True) + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + # Pre-shuffle the input to avoid having to make a very large shuffle + # buffer in the `input_fn`. + rng = random.Random(12345) + rng.shuffle(train_examples) + + model_fn = model_fn_builder( + bert_config=bert_config, + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + # We write to a temporary file to avoid storing very large constant tensors + # in memory.
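+ # FeatureWriter streams each feature to train.tf_record as it is produced, + # and the input_fn built below reads the records back, so the features are + # never all held in memory at once.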
+ train_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "train.tf_record"), + is_training=True) + convert_examples_to_features( + examples=train_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=True, + output_fn=train_writer.process_feature) + train_writer.close() + + tf.logging.info("***** Running training *****") + tf.logging.info(" Num orig examples = %d", len(train_examples)) + tf.logging.info(" Num split examples = %d", train_writer.num_features) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + del train_examples + + train_input_fn = input_fn_builder( + input_file=train_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_predict: + eval_examples = read_squad_examples( + input_file=FLAGS.predict_file, is_training=False) + + eval_writer = FeatureWriter( + filename=os.path.join(FLAGS.output_dir, "eval.tf_record"), + is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + convert_examples_to_features( + examples=eval_examples, + tokenizer=tokenizer, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature) + eval_writer.close() + + tf.logging.info("***** Running predictions *****") + tf.logging.info(" Num orig examples = %d", len(eval_examples)) + tf.logging.info(" Num split examples = %d", len(eval_features)) + tf.logging.info(" Batch size = %d", FLAGS.predict_batch_size) + + all_results = [] + + predict_input_fn = input_fn_builder( + input_file=eval_writer.filename, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False) + + # If running eval on the TPU, you will need to specify the number of + # steps. 
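+ # On CPU/GPU, `estimator.predict` simply runs until the eval dataset is + # exhausted; the input_fn above uses is_training=False, so the dataset is + # neither repeated nor shuffled, and drop_remainder=False keeps every example.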
+ all_results = [] + for result in estimator.predict( + predict_input_fn, yield_single_examples=True): + if len(all_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(all_results))) + unique_id = int(result["unique_ids"]) + start_logits = [float(x) for x in result["start_logits"].flat] + end_logits = [float(x) for x in result["end_logits"].flat] + all_results.append( + RawResult( + unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + output_prediction_file = os.path.join(FLAGS.output_dir, "predictions.json") + output_nbest_file = os.path.join(FLAGS.output_dir, "nbest_predictions.json") + output_null_log_odds_file = os.path.join(FLAGS.output_dir, "null_odds.json") + + write_predictions(eval_examples, eval_features, all_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + FLAGS.do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file) + + +if __name__ == "__main__": + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() diff --git a/baselines/models/roberta_wwm_large_ext/tf_metrics.py b/baselines/models/roberta_wwm_large_ext/tf_metrics.py new file mode 100644 index 0000000..7ccacd4 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tf_metrics.py @@ -0,0 +1,215 @@ +""" +Multiclass +from: +https://github.com/guillaumegenthial/tf_metrics/blob/master/tf_metrics/__init__.py + +""" + +__author__ = "Guillaume Genthial" + +import numpy as np +import tensorflow as tf +from tensorflow.python.ops.metrics_impl import _streaming_confusion_matrix + + +def precision(labels, predictions, num_classes, pos_indices=None, + weights=None, average='micro'): + """Multi-class precision metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + pr, _, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + op, _, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (pr, op) + + +def recall(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + """Multi-class recall metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, re, _ = metrics_from_confusion_matrix( + cm, pos_indices, average=average) + _, op, _ = metrics_from_confusion_matrix( + op, pos_indices, average=average) + return (re, op) + + +def f1(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro'): + return fbeta(labels, predictions, num_classes, pos_indices, weights, + average) + + +def fbeta(labels, predictions, num_classes, pos_indices=None, weights=None, + average='micro', beta=1): + """Multi-class fbeta metric for Tensorflow + Parameters + ---------- + labels : Tensor of tf.int32 or tf.int64 + The true labels + predictions : Tensor of tf.int32 or tf.int64 + The predictions, same shape as labels + num_classes : int + The number of classes + pos_indices : list of int, optional + The indices of the positive classes, default is all + weights : Tensor of tf.int32, optional + Mask, must be of compatible shape with labels + average : str, optional + 'micro': counts the total number of true positives, false + positives, and false negatives for the classes in + `pos_indices` and infer the metric from it. + 'macro': will compute the metric separately for each class in + `pos_indices` and average. Will not account for class + imbalance. + 'weighted': will compute the metric separately for each class in + `pos_indices` and perform a weighted average by the total + number of true labels for each class. 
+ beta : int, optional + Weight of precision in harmonic mean + Returns + ------- + tuple of (scalar float Tensor, update_op) + """ + cm, op = _streaming_confusion_matrix( + labels, predictions, num_classes, weights) + _, _, fbeta = metrics_from_confusion_matrix( + cm, pos_indices, average=average, beta=beta) + _, _, op = metrics_from_confusion_matrix( + op, pos_indices, average=average, beta=beta) + return (fbeta, op) + + +def safe_div(numerator, denominator): + """Safe division, return 0 if denominator is 0""" + numerator, denominator = tf.to_float(numerator), tf.to_float(denominator) + zeros = tf.zeros_like(numerator, dtype=numerator.dtype) + denominator_is_zero = tf.equal(denominator, zeros) + return tf.where(denominator_is_zero, zeros, numerator / denominator) + + +def pr_re_fbeta(cm, pos_indices, beta=1): + """Uses a confusion matrix to compute precision, recall and fbeta""" + num_classes = cm.shape[0] + neg_indices = [i for i in range(num_classes) if i not in pos_indices] + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, neg_indices] = 0 + diag_sum = tf.reduce_sum(tf.diag_part(cm * cm_mask)) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[:, neg_indices] = 0 + tot_pred = tf.reduce_sum(cm * cm_mask) + + cm_mask = np.ones([num_classes, num_classes]) + cm_mask[neg_indices, :] = 0 + tot_gold = tf.reduce_sum(cm * cm_mask) + + pr = safe_div(diag_sum, tot_pred) + re = safe_div(diag_sum, tot_gold) + fbeta = safe_div((1. + beta**2) * pr * re, beta**2 * pr + re) + + return pr, re, fbeta + + +def metrics_from_confusion_matrix(cm, pos_indices=None, average='micro', + beta=1): + """Precision, Recall and F1 from the confusion matrix + Parameters + ---------- + cm : tf.Tensor of type tf.int32, of shape (num_classes, num_classes) + The streaming confusion matrix. + pos_indices : list of int, optional + The indices of the positive classes + beta : int, optional + Weight of precision in harmonic mean + average : str, optional + 'micro', 'macro' or 'weighted' + """ + num_classes = cm.shape[0] + if pos_indices is None: + pos_indices = [i for i in range(num_classes)] + + if average == 'micro': + return pr_re_fbeta(cm, pos_indices, beta) + elif average in {'macro', 'weighted'}: + precisions, recalls, fbetas, n_golds = [], [], [], [] + for idx in pos_indices: + pr, re, fbeta = pr_re_fbeta(cm, [idx], beta) + precisions.append(pr) + recalls.append(re) + fbetas.append(fbeta) + cm_mask = np.zeros([num_classes, num_classes]) + cm_mask[idx, :] = 1 + n_golds.append(tf.to_float(tf.reduce_sum(cm * cm_mask))) + + if average == 'macro': + pr = tf.reduce_mean(precisions) + re = tf.reduce_mean(recalls) + fbeta = tf.reduce_mean(fbetas) + return pr, re, fbeta + if average == 'weighted': + n_gold = tf.reduce_sum(n_golds) + pr_sum = sum(p * n for p, n in zip(precisions, n_golds)) + pr = safe_div(pr_sum, n_gold) + re_sum = sum(r * n for r, n in zip(recalls, n_golds)) + re = safe_div(re_sum, n_gold) + fbeta_sum = sum(f * n for f, n in zip(fbetas, n_golds)) + fbeta = safe_div(fbeta_sum, n_gold) + return pr, re, fbeta + + else: + raise NotImplementedError() \ No newline at end of file diff --git a/baselines/models/roberta_wwm_large_ext/tokenization.py b/baselines/models/roberta_wwm_large_ext/tokenization.py new file mode 100644 index 0000000..0ee1359 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tokenization.py @@ -0,0 +1,399 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata +import six +import tensorflow as tf + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-trained. If this error is wrong, please " + "just comment out this check." % (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python 2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python 2 and Python 3, but in one case + # it's a Unicode string and in the other it's a byte string.
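+ # E.g. printable_text(u"caf\u00e9") returns the str "café" on Python 3, + # while on Python 2 a unicode input is first encoded to UTF-8 bytes.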
+ if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python 2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + index = 0 + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + vocab[token] = index + index += 1 + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenization.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia).
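+ # E.g. u"ah\u535A\u63A8zz" becomes u"ah \u535A  \u63A8 zz", which later + # splits into ["ah", "\u535A", "\u63A8", "zz"] (see test_chinese in + # tokenization_test.py).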
+ text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "Chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer`. + + Returns: + A list of wordpiece tokens.
+ """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models/roberta_wwm_large_ext/tokenization_test.py b/baselines/models/roberta_wwm_large_ext/tokenization_test.py new file mode 100644 index 0000000..0afaedd --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tokenization_test.py @@ -0,0 +1,137 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile +import tokenization +import six +import tensorflow as tf + + +class TokenizationTest(tf.test.TestCase): + + def test_full_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing", "," + ] + with tempfile.NamedTemporaryFile(delete=False) as vocab_writer: + if six.PY2: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + else: + vocab_writer.write("".join( + [x + "\n" for x in vocab_tokens]).encode("utf-8")) + + vocab_file = vocab_writer.name + + tokenizer = tokenization.FullTokenizer(vocab_file) + os.unlink(vocab_file) + + tokens = tokenizer.tokenize(u"UNwant\u00E9d,running") + self.assertAllEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9]) + + def test_chinese(self): + tokenizer = tokenization.BasicTokenizer() + + self.assertAllEqual( + tokenizer.tokenize(u"ah\u535A\u63A8zz"), + [u"ah", u"\u535A", u"\u63A8", u"zz"]) + + def test_basic_tokenizer_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=True) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["hello", "!", "how", "are", "you", "?"]) + self.assertAllEqual(tokenizer.tokenize(u"H\u00E9llo"), ["hello"]) + + def test_basic_tokenizer_no_lower(self): + tokenizer = tokenization.BasicTokenizer(do_lower_case=False) + + self.assertAllEqual( + tokenizer.tokenize(u" \tHeLLo!how \n Are yoU? "), + ["HeLLo", "!", "how", "Are", "yoU", "?"]) + + def test_wordpiece_tokenizer(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = tokenization.WordpieceTokenizer(vocab=vocab) + + self.assertAllEqual(tokenizer.tokenize(""), []) + + self.assertAllEqual( + tokenizer.tokenize("unwanted running"), + ["un", "##want", "##ed", "runn", "##ing"]) + + self.assertAllEqual( + tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + + def test_convert_tokens_to_ids(self): + vocab_tokens = [ + "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn", + "##ing" + ] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + + self.assertAllEqual( + tokenization.convert_tokens_to_ids( + vocab, ["un", "##want", "##ed", "runn", "##ing"]), [7, 4, 5, 8, 9]) + + def test_is_whitespace(self): + self.assertTrue(tokenization._is_whitespace(u" ")) + self.assertTrue(tokenization._is_whitespace(u"\t")) + self.assertTrue(tokenization._is_whitespace(u"\r")) + self.assertTrue(tokenization._is_whitespace(u"\n")) + self.assertTrue(tokenization._is_whitespace(u"\u00A0")) + + self.assertFalse(tokenization._is_whitespace(u"A")) + self.assertFalse(tokenization._is_whitespace(u"-")) + + def test_is_control(self): + self.assertTrue(tokenization._is_control(u"\u0005")) + + self.assertFalse(tokenization._is_control(u"A")) + self.assertFalse(tokenization._is_control(u" ")) + self.assertFalse(tokenization._is_control(u"\t")) + self.assertFalse(tokenization._is_control(u"\r")) + self.assertFalse(tokenization._is_control(u"\U0001F4A9")) + + def test_is_punctuation(self): + self.assertTrue(tokenization._is_punctuation(u"-")) + self.assertTrue(tokenization._is_punctuation(u"$")) + self.assertTrue(tokenization._is_punctuation(u"`")) + 
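+    # "$" and "`" fall outside the Unicode P* categories; _is_punctuation
+    # still accepts them because all non-alphanumeric ASCII is treated as
+    # punctuation (see tokenization.py).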
self.assertTrue(tokenization._is_punctuation(u".")) + + self.assertFalse(tokenization._is_punctuation(u"A")) + self.assertFalse(tokenization._is_punctuation(u" ")) + + +if __name__ == "__main__": + tf.test.main() diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_inews.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_inews.sh new file mode 100755 index 0000000..2edab12 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_inews.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="inews" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/oberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=512 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470 diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_jdcomment.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_jdcomment.sh new file mode 100755 index 0000000..4b88a95 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_jdcomment.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="jdcomment" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME} +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --task_name=$TASK_NAME \ + --do_train=true \ + --do_eval=true \ + --data_dir=$DATA_DIR \ + --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \ + --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \ + --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \ + --max_seq_length=128 \ + --train_batch_size=32 \ + --learning_rate=2e-5 \ + --num_train_epochs=8.0 \ + --output_dir=$OUTPUT_DIR \ + --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.230.1.2:8470 diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_lcqmc.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_lcqmc.sh new file mode 100755 index 0000000..c9fe0e0 --- /dev/null +++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_lcqmc.sh @@ -0,0 +1,21 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="lcqmc" +export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_$TASK_NAME +export 
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470
diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_thucnews.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_thucnews.sh
new file mode 100755
index 0000000..9f690c0
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_thucnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="thucnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_tnews.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_tnews.sh
new file mode 100755
index 0000000..39d3245
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_tnews.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="tnews"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=128 \
+  --train_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=3.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://172.18.0.2:8470
diff --git a/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_xnli.sh b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_xnli.sh
new file mode 100755
index 0000000..9dd4c2a
--- /dev/null
+++ b/baselines/models/roberta_wwm_large_ext/tpu/run_classifier_xnli.sh
@@ -0,0 +1,21 @@
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+CURRENT_TIME=$(date "+%Y%m%d-%H%M%S")
+TASK_NAME="xnli"
+export PREV_TRAINED_MODEL_DIR=gs://models_zxw/prev_trained_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16
+export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME
+export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/roberta-wwm-ext-large/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/tpu/$TASK_NAME/$CURRENT_TIME
+
+python $CURRENT_DIR/../run_classifier.py \
+  --task_name=$TASK_NAME \
+  --do_train=true \
+  --do_eval=true \
+  --data_dir=$DATA_DIR \
+  --vocab_file=$PREV_TRAINED_MODEL_DIR/vocab.txt \
+  --bert_config_file=$PREV_TRAINED_MODEL_DIR/bert_config.json \
+  --init_checkpoint=$PREV_TRAINED_MODEL_DIR/bert_model.ckpt \
+  --max_seq_length=512 \
+  --train_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=8.0 \
+  --output_dir=$OUTPUT_DIR \
+  --num_tpu_cores=8 --use_tpu=True --tpu_name=grpc://10.1.101.2:8470
diff --git a/baselines/models/xlnet/__init__.py b/baselines/models/xlnet/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/baselines/models/xlnet/classifier_utils.py b/baselines/models/xlnet/classifier_utils.py
new file mode 100644
index 0000000..f6394d0
--- /dev/null
+++ b/baselines/models/xlnet/classifier_utils.py
@@ -0,0 +1,246 @@
+from absl import flags
+
+import re
+import numpy as np
+
+import tensorflow as tf
+from data_utils import SEP_ID, CLS_ID
+
+FLAGS = flags.FLAGS
+
+SEG_ID_A = 0
+SEG_ID_B = 1
+SEG_ID_CLS = 2
+SEG_ID_SEP = 3
+SEG_ID_PAD = 4
+
+class PaddingInputExample(object):
+  """Fake example so the num input examples is a multiple of the batch size.
+  When running eval/predict on the TPU, we need to pad the number of examples
+  to be a multiple of the batch size, because the TPU requires a fixed batch
+  size. The alternative is to drop the last batch, which is bad because it
+  means the entire output data won't be generated.
+  We use this class instead of `None` because treating `None` as padding
+  batches could cause silent errors.
+  """
+
+
+class InputFeatures(object):
+  """A single set of features of data."""
+
+  def __init__(self,
+               input_ids,
+               input_mask,
+               segment_ids,
+               label_id,
+               is_real_example=True):
+    self.input_ids = input_ids
+    self.input_mask = input_mask
+    self.segment_ids = segment_ids
+    self.label_id = label_id
+    self.is_real_example = is_real_example
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+  """Truncates a sequence pair in place to the maximum length."""
+
+  # This is a simple heuristic which will always truncate the longer sequence
+  # one token at a time. This makes more sense than truncating an equal
+  # percent of tokens from each, since if one sequence is very short then each
+  # token that's truncated likely contains more information than a longer
+  # sequence.
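+  # Illustrative example: with max_length=6, len(tokens_a)=2 and
+  # len(tokens_b)=7, tokens_b is popped three times (it stays the longer
+  # sequence throughout) until the total length is 2 + 4 = 6.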
+ while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenize_fn): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[1] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + if label_list is not None: + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenize_fn(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenize_fn(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for two [SEP] & one [CLS] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for one [SEP] & one [CLS] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[:max_seq_length - 2] + + tokens = [] + segment_ids = [] + for token in tokens_a: + tokens.append(token) + segment_ids.append(SEG_ID_A) + tokens.append(SEP_ID) + segment_ids.append(SEG_ID_A) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(SEG_ID_B) + tokens.append(SEP_ID) + segment_ids.append(SEG_ID_B) + + tokens.append(CLS_ID) + segment_ids.append(SEG_ID_CLS) + + input_ids = tokens + + # The mask has 0 for real tokens and 1 for padding tokens. Only real + # tokens are attended to. + input_mask = [0] * len(input_ids) + + # Zero-pad up to the sequence length. + if len(input_ids) < max_seq_length: + delta_len = max_seq_length - len(input_ids) + input_ids = [0] * delta_len + input_ids + input_mask = [1] * delta_len + input_mask + segment_ids = [SEG_ID_PAD] * delta_len + segment_ids + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + if label_list is not None: + label_id = label_map[example.label] + else: + label_id = example.label + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: {} (id = {})".format(example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id) + return feature + +def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, + tokenizer, example): + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. 
+    # Account for two [SEP] & one [CLS] with "- 3"
+    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+  else:
+    # Account for one [SEP] & one [CLS] with "- 2"
+    if len(tokens_a) > max_seq_length - 2:
+      tokens_a = tokens_a[:max_seq_length - 2]
+
+  tokens = []
+  segment_ids = []
+  for token in tokens_a:
+    tokens.append(token)
+    segment_ids.append(SEG_ID_A)
+  tokens.append(SEP_ID)
+  segment_ids.append(SEG_ID_A)
+
+  if tokens_b:
+    for token in tokens_b:
+      tokens.append(token)
+      segment_ids.append(SEG_ID_B)
+    tokens.append(SEP_ID)
+    segment_ids.append(SEG_ID_B)
+
+  tokens.append(CLS_ID)
+  segment_ids.append(SEG_ID_CLS)
+
+  input_ids = tokens
+
+  # The mask has 0 for real tokens and 1 for padding tokens. Only real
+  # tokens are attended to.
+  input_mask = [0] * len(input_ids)
+
+  # Zero-pad up to the sequence length.
+  if len(input_ids) < max_seq_length:
+    delta_len = max_seq_length - len(input_ids)
+    input_ids = [0] * delta_len + input_ids
+    input_mask = [1] * delta_len + input_mask
+    segment_ids = [SEG_ID_PAD] * delta_len + segment_ids
+
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+
+  if label_map is not None:
+    label_id = label_map[example.label]
+  else:
+    label_id = example.label
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label: {} (id = {})".format(example.label, label_id))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_id=label_id)
+  return feature
+
+def convert_example_list_for_inews(ex_index, example, label_list, max_seq_length,
+                                   tokenizer):
+  """Converts a single `InputExample` into a list of `InputFeatures`."""
+
+  if isinstance(example, PaddingInputExample):
+    return [InputFeatures(
+        input_ids=[0] * max_seq_length,
+        # 1 marks padding in the XLNet mask convention used above, so a
+        # padding example is masked out at every position.
+        input_mask=[1] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)]
+
+  label_map = {}
+  for (i, label) in enumerate(label_list):
+    label_map[label] = i
+
+  tokens_a = tokenizer(example.text_a)
+  tokens_b = None
+  if example.text_b:
+    tokens_b = tokenizer(example.text_b)
+  must_len = len(tokens_a) + 3
+  extra_len = max_seq_length - must_len
+  feature_list = []
+  if example.text_b and extra_len > 0:
+    extra_num = int((len(tokens_b) - 1) / extra_len) + 1
+    for num in range(extra_num):
+      max_len = min((num + 1) * extra_len, len(tokens_b))
+      tokens_b_sub = tokens_b[num * extra_len: max_len]
+      feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length, tokenizer, example)
+      feature_list.append(feature)
+  else:
+    feature = convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map, max_seq_length, tokenizer, example)
+    feature_list.append(feature)
+  return feature_list
diff --git a/baselines/models/xlnet/cmrc2018_evaluate_drcd.py b/baselines/models/xlnet/cmrc2018_evaluate_drcd.py
new file mode 100644
index 0000000..b6f0688
--- /dev/null
+++ b/baselines/models/xlnet/cmrc2018_evaluate_drcd.py
@@ -0,0 +1,151 @@
+# -*- coding: utf-8 -*-
+'''
+Evaluation script for CMRC 2018
+version: v5
+Note:
+v5 formatted output, add usage description
+v4 fixed segmentation issues
+'''
+from __future__ import print_function
+from collections import Counter, OrderedDict
+import string
+import re
+import argparse
+import json
+import sys
+# NOTE: this script is Python 2 only (reload(sys)/setdefaultencoding and
+# ur'' literals are not valid Python 3).
+reload(sys)
+sys.setdefaultencoding('utf8')
+import nltk
+
+# segment mixed Chinese and English text
+def mixed_segmentation(in_str, rm_punc=False):
+    in_str = str(in_str).decode('utf-8').lower().strip()
+    segs_out = []
+    temp_str = ""
+    sp_char = ['-',':','_','*','^','/','\\','~','`','+','=',
+               ',','。',':','?','!','“','”',';','’','《','》','……','·','、',
+               '「','」','(',')','-','~','『','』']
+    for char in in_str:
+        if rm_punc and char in sp_char:
+            continue
+        if re.search(ur'[\u4e00-\u9fa5]', char) or char in sp_char:
+            if temp_str != "":
+                ss = nltk.word_tokenize(temp_str)
+                segs_out.extend(ss)
+                temp_str = ""
+            segs_out.append(char)
+        else:
+            temp_str += char
+
+    # handle the last segment
+    if temp_str != "":
+        ss = nltk.word_tokenize(temp_str)
+        segs_out.extend(ss)
+
+    return segs_out
+
+
+# remove punctuation
+def remove_punctuation(in_str):
+    in_str = str(in_str).decode('utf-8').lower().strip()
+    sp_char = ['-',':','_','*','^','/','\\','~','`','+','=',
+               ',','。',':','?','!','“','”',';','’','《','》','……','·','、',
+               '「','」','(',')','-','~','『','』']
+    out_segs = []
+    for char in in_str:
+        if char in sp_char:
+            continue
+        else:
+            out_segs.append(char)
+    return ''.join(out_segs)
+
+
+# find the longest common substring
+def find_lcs(s1, s2):
+    m = [[0 for i in range(len(s2)+1)] for j in range(len(s1)+1)]
+    mmax = 0
+    p = 0
+    for i in range(len(s1)):
+        for j in range(len(s2)):
+            if s1[i] == s2[j]:
+                m[i+1][j+1] = m[i][j]+1
+                if m[i+1][j+1] > mmax:
+                    mmax = m[i+1][j+1]
+                    p = i+1
+    return s1[p-mmax:p], mmax
+
+
+def evaluate(ground_truth_file, prediction_file):
+    f1 = 0
+    em = 0
+    total_count = 0
+    skip_count = 0
+    for instance in ground_truth_file["data"]:
+        #context_id = instance['context_id'].strip()
+        #context_text = instance['context_text'].strip()
+        for para in instance["paragraphs"]:
+            for qas in para['qas']:
+                total_count += 1
+                query_id = qas['id'].strip()
+                query_text = qas['question'].strip()
+                answers = [x["text"] for x in qas['answers']]
+
+                if query_id not in prediction_file:
+                    sys.stderr.write('Unanswered question: {}\n'.format(query_id))
+                    skip_count += 1
+                    continue
+
+                prediction = str(prediction_file[query_id]).decode('utf-8')
+                f1 += calc_f1_score(answers, prediction)
+                em += calc_em_score(answers, prediction)
+
+    f1_score = 100.0 * f1 / total_count
+    em_score = 100.0 * em / total_count
+    return f1_score, em_score, total_count, skip_count
+
+
+def calc_f1_score(answers, prediction):
+    f1_scores = []
+    for ans in answers:
+        ans_segs = mixed_segmentation(ans, rm_punc=True)
+        prediction_segs = mixed_segmentation(prediction, rm_punc=True)
+        lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
+        if lcs_len == 0:
+            f1_scores.append(0)
+            continue
+        precision = 1.0*lcs_len/len(prediction_segs)
+        recall = 1.0*lcs_len/len(ans_segs)
+        f1 = (2*precision*recall)/(precision+recall)
+        f1_scores.append(f1)
+    return max(f1_scores)
+
+
+def calc_em_score(answers, prediction):
+    em = 0
+    for ans in answers:
+        ans_ = remove_punctuation(ans)
+        prediction_ = remove_punctuation(prediction)
+        if ans_ == prediction_:
+            em = 1
+            break
+    return em
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Evaluation Script for CMRC 2018')
+    parser.add_argument('dataset_file', help='Official dataset file')
+    parser.add_argument('prediction_file', help='Your prediction File')
+    args = parser.parse_args()
+    ground_truth_file = json.load(open(args.dataset_file, 'rb'))
+    prediction_file = json.load(open(args.prediction_file, 'rb'))
+    F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file)
+    AVG = (EM+F1)*0.5
+    output_result = OrderedDict()
+    output_result['AVERAGE'] = '%.3f' % AVG
+    output_result['F1'] = '%.3f' % F1
+    output_result['EM'] = '%.3f' % EM
+    output_result['TOTAL'] = TOTAL
+    output_result['SKIP'] = SKIP
+    output_result['FILE'] = args.prediction_file
+    print(json.dumps(output_result))
+
diff --git a/baselines/models/xlnet/data_utils.py b/baselines/models/xlnet/data_utils.py
new file mode 100644
index 0000000..ae9073f
--- /dev/null
+++ b/baselines/models/xlnet/data_utils.py
@@ -0,0 +1,915 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+import random
+
+from absl import flags
+import absl.logging as _logging  # pylint: disable=unused-import
+
+import numpy as np
+
+
+import tensorflow as tf
+
+from prepro_utils import preprocess_text, encode_ids
+import sentencepiece as spm
+
+
+special_symbols = {
+    "<unk>"  : 0,
+    "<s>"    : 1,
+    "</s>"   : 2,
+    "<cls>"  : 3,
+    "<sep>"  : 4,
+    "<pad>"  : 5,
+    "<mask>" : 6,
+    "<eod>"  : 7,
+    "<eop>"  : 8,
+}
+
+VOCAB_SIZE = 32000
+UNK_ID = special_symbols["<unk>"]
+CLS_ID = special_symbols["<cls>"]
+SEP_ID = special_symbols["<sep>"]
+MASK_ID = special_symbols["<mask>"]
+EOD_ID = special_symbols["<eod>"]
+
+
+def _int64_feature(values):
+  return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
+
+
+def _float_feature(values):
+  return tf.train.Feature(float_list=tf.train.FloatList(value=values))
+
+
+def format_filename(prefix, bsz_per_host, seq_len, bi_data, suffix,
+                    mask_alpha=5, mask_beta=1, reuse_len=None, uncased=False,
+                    fixed_num_predict=None):
+  """Formats a record/tfrecord filename from the given options."""
+  if reuse_len is None:
+    reuse_len_str = ""
+  else:
+    reuse_len_str = "reuse-{}.".format(reuse_len)
+  if not uncased:
+    uncased_str = ""
+  else:
+    uncased_str = "uncased."
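+  # For instance, prefix="record_info-train-0-0", bsz_per_host=32,
+  # seq_len=512, reuse_len=256, uncased=False, bi_data=True, mask_alpha=6,
+  # mask_beta=1, fixed_num_predict=85 and suffix="json" yield
+  # "record_info-train-0-0.bsz-32.seqlen-512.reuse-256.bi.alpha-6.beta-1.fnp-85.json".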
+  if bi_data:
+    bi_data_str = "bi"
+  else:
+    bi_data_str = "uni"
+  if fixed_num_predict is not None:
+    fnp_str = "fnp-{}.".format(fixed_num_predict)
+  else:
+    fnp_str = ""
+
+  file_name = "{}.bsz-{}.seqlen-{}.{}{}{}.alpha-{}.beta-{}.{}{}".format(
+      prefix, bsz_per_host, seq_len, reuse_len_str, uncased_str, bi_data_str,
+      mask_alpha, mask_beta, fnp_str, suffix)
+
+  return file_name
+
+
+def _create_data(idx, input_paths):
+  # Load sentence-piece model
+  sp = spm.SentencePieceProcessor()
+  sp.Load(FLAGS.sp_path)
+
+  input_shards = []
+  total_line_cnt = 0
+  for input_path in input_paths:
+    input_data, sent_ids = [], []
+    sent_id, line_cnt = True, 0
+    tf.logging.info("Processing %s", input_path)
+    for line in tf.gfile.Open(input_path):
+      if line_cnt % 100000 == 0:
+        tf.logging.info("Loading line %d", line_cnt)
+      line_cnt += 1
+
+      if not line.strip():
+        if FLAGS.use_eod:
+          sent_id = not sent_id
+          cur_sent = [EOD_ID]
+        else:
+          continue
+      else:
+        if FLAGS.from_raw_text:
+          cur_sent = preprocess_text(line.strip(), lower=FLAGS.uncased)
+          cur_sent = encode_ids(sp, cur_sent)
+        else:
+          cur_sent = list(map(int, line.strip().split()))
+
+      input_data.extend(cur_sent)
+      sent_ids.extend([sent_id] * len(cur_sent))
+      sent_id = not sent_id
+
+    tf.logging.info("Finished with line %d", line_cnt)
+    if line_cnt == 0:
+      continue
+
+    input_data = np.array(input_data, dtype=np.int64)
+    sent_ids = np.array(sent_ids, dtype=np.bool)
+
+    total_line_cnt += line_cnt
+    input_shards.append((input_data, sent_ids))
+
+  tf.logging.info("[Task %d] Total number of lines: %d", idx, total_line_cnt)
+
+  tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")
+
+  filenames, num_batch = [], 0
+
+  # Randomly shuffle input shards (with a fixed but distinct random seed)
+  np.random.seed(100 * FLAGS.task + FLAGS.pass_id)
+
+  perm_indices = np.random.permutation(len(input_shards))
+  tf.logging.info("Using perm indices %s for pass %d",
+                  perm_indices.tolist(), FLAGS.pass_id)
+
+  input_data_list, sent_ids_list = [], []
+  prev_sent_id = None
+  for perm_idx in perm_indices:
+    input_data, sent_ids = input_shards[perm_idx]
+    # make sure that `sent_ids[0] == not prev_sent_id`
+    if prev_sent_id is not None and sent_ids[0] == prev_sent_id:
+      sent_ids = np.logical_not(sent_ids)
+
+    # append to temporary list
+    input_data_list.append(input_data)
+    sent_ids_list.append(sent_ids)
+
+    # update `prev_sent_id`
+    prev_sent_id = sent_ids[-1]
+
+  input_data = np.concatenate(input_data_list)
+  sent_ids = np.concatenate(sent_ids_list)
+
+  file_name, cur_num_batch = create_tfrecords(
+      save_dir=tfrecord_dir,
+      basename="{}-{}-{}".format(FLAGS.split, idx, FLAGS.pass_id),
+      data=[input_data, sent_ids],
+      bsz_per_host=FLAGS.bsz_per_host,
+      seq_len=FLAGS.seq_len,
+      bi_data=FLAGS.bi_data,
+      sp=sp,
+  )
+
+  filenames.append(file_name)
+  num_batch += cur_num_batch
+
+  record_info = {
+      "filenames": filenames,
+      "num_batch": num_batch
+  }
+
+  return record_info
+
+
+def create_data(_):
+  # Validate FLAGS
+  assert FLAGS.bsz_per_host % FLAGS.num_core_per_host == 0
+  if not FLAGS.use_tpu:
+    FLAGS.num_core_per_host = 1  # forced to be one
+
+  # Make workdirs
+  if not tf.gfile.Exists(FLAGS.save_dir):
+    tf.gfile.MakeDirs(FLAGS.save_dir)
+
+  tfrecord_dir = os.path.join(FLAGS.save_dir, "tfrecords")
+  if not tf.gfile.Exists(tfrecord_dir):
+    tf.gfile.MakeDirs(tfrecord_dir)
+
+  # Create and dump corpus_info from task 0
+  if FLAGS.task == 0:
+    corpus_info = {
+        "vocab_size": VOCAB_SIZE,
+        "bsz_per_host": FLAGS.bsz_per_host,
+        "num_core_per_host": FLAGS.num_core_per_host,
+        "seq_len": FLAGS.seq_len,
+        "reuse_len": FLAGS.reuse_len,
+        "uncased": FLAGS.uncased,
+        "bi_data": FLAGS.bi_data,
+        "mask_alpha": FLAGS.mask_alpha,
+        "mask_beta": FLAGS.mask_beta,
+        "num_predict": FLAGS.num_predict,
+        "use_eod": FLAGS.use_eod,
+        "sp_path": FLAGS.sp_path,
+        "input_glob": FLAGS.input_glob,
+    }
+    corpus_info_path = os.path.join(FLAGS.save_dir, "corpus_info.json")
+    with tf.gfile.Open(corpus_info_path, "w") as fp:
+      json.dump(corpus_info, fp)
+
+  # Split the work into FLAGS.num_task interleaved shards
+  file_paths = sorted(tf.gfile.Glob(FLAGS.input_glob))
+  tf.logging.info("Use glob: %s", FLAGS.input_glob)
+  tf.logging.info("Find %d files: %s", len(file_paths), file_paths)
+
+  task_file_paths = file_paths[FLAGS.task::FLAGS.num_task]
+  if not task_file_paths:
+    tf.logging.info("Exit: task %d has no file to process.", FLAGS.task)
+    return
+
+  tf.logging.info("Task %d process %d files: %s",
+                  FLAGS.task, len(task_file_paths), task_file_paths)
+  record_info = _create_data(FLAGS.task, task_file_paths)
+
+  record_prefix = "record_info-{}-{}-{}".format(
+      FLAGS.split, FLAGS.task, FLAGS.pass_id)
+  record_name = format_filename(
+      prefix=record_prefix,
+      bsz_per_host=FLAGS.bsz_per_host,
+      seq_len=FLAGS.seq_len,
+      mask_alpha=FLAGS.mask_alpha,
+      mask_beta=FLAGS.mask_beta,
+      reuse_len=FLAGS.reuse_len,
+      bi_data=FLAGS.bi_data,
+      suffix="json",
+      uncased=FLAGS.uncased,
+      fixed_num_predict=FLAGS.num_predict)
+  record_info_path = os.path.join(tfrecord_dir, record_name)
+
+  with tf.gfile.Open(record_info_path, "w") as fp:
+    json.dump(record_info, fp)
+
+
+def batchify(data, bsz_per_host, sent_ids=None):
+  num_step = len(data) // bsz_per_host
+  data = data[:bsz_per_host * num_step]
+  data = data.reshape(bsz_per_host, num_step)
+  if sent_ids is not None:
+    sent_ids = sent_ids[:bsz_per_host * num_step]
+    sent_ids = sent_ids.reshape(bsz_per_host, num_step)
+
+  if sent_ids is not None:
+    return data, sent_ids
+  return data
+
+
+def _split_a_and_b(data, sent_ids, begin_idx, tot_len, extend_target=False):
+  """Split two segments from `data` starting from the index `begin_idx`."""
+
+  data_len = data.shape[0]
+  if begin_idx + tot_len >= data_len:
+    tf.logging.info("[_split_a_and_b] returns None: "
+                    "begin_idx %d + tot_len %d >= data_len %d",
+                    begin_idx, tot_len, data_len)
+    return None
+
+  end_idx = begin_idx + 1
+  cut_points = []
+  while end_idx < data_len:
+    if sent_ids[end_idx] != sent_ids[end_idx - 1]:
+      if end_idx - begin_idx >= tot_len: break
+      cut_points.append(end_idx)
+    end_idx += 1
+
+  a_begin = begin_idx
+  if len(cut_points) == 0 or random.random() < 0.5:
+    label = 0
+    if len(cut_points) == 0:
+      a_end = end_idx
+    else:
+      a_end = random.choice(cut_points)
+
+    b_len = max(1, tot_len - (a_end - a_begin))
+    # (zihang): `data_len - 1` to account for extend_target
+    b_begin = random.randint(0, data_len - 1 - b_len)
+    b_end = b_begin + b_len
+    while b_begin > 0 and sent_ids[b_begin - 1] == sent_ids[b_begin]:
+      b_begin -= 1
+    # (zihang): `data_len - 1` to account for extend_target
+    while b_end < data_len - 1 and sent_ids[b_end - 1] == sent_ids[b_end]:
+      b_end += 1
+
+    new_begin = a_end
+  else:
+    label = 1
+    a_end = random.choice(cut_points)
+    b_begin = a_end
+    b_end = end_idx
+
+    new_begin = b_end
+
+  while a_end - a_begin + b_end - b_begin > tot_len:
+    if a_end - a_begin > b_end - b_begin:
+      # delete the right side only for the LM objective
+      a_end -= 1
+    else:
+      b_end -= 1
+
+  ret = [data[a_begin: a_end], data[b_begin: b_end], label, new_begin]
+
+  if extend_target:
+    if a_end >= data_len or b_end >= data_len:
+      tf.logging.info("[_split_a_and_b] returns None: "
+                      "a_end %d or b_end %d >= data_len %d",
+                      a_end, b_end, data_len)
+      return None
+    a_target = data[a_begin + 1: a_end + 1]
+    b_target = data[b_begin: b_end + 1]
+    ret.extend([a_target, b_target])
+
+  return ret
+
+
+def _is_start_piece(piece):
+  special_pieces = set(list('!"#$%&\"()*+,-./:;?@[\\]^_`{|}~'))
+  if (piece.startswith("▁") or piece.startswith("<")
+      or piece in special_pieces):
+    return True
+  else:
+    return False
+
+
+def _sample_mask(sp, seg, reverse=False, max_gram=5, goal_num_predict=None):
+  """Sample `goal_num_predict` tokens for partial prediction.
+  About `mask_beta` tokens are chosen in a context of `mask_alpha` tokens."""
+
+  seg_len = len(seg)
+  mask = np.array([False] * seg_len, dtype=np.bool)
+
+  num_predict = 0
+
+  ngrams = np.arange(1, max_gram + 1, dtype=np.int64)
+  pvals = 1. / np.arange(1, max_gram + 1)
+  pvals /= pvals.sum(keepdims=True)
+
+  if reverse:
+    seg = np.flip(seg, 0)
+
+  cur_len = 0
+  while cur_len < seg_len:
+    if goal_num_predict is not None and num_predict >= goal_num_predict: break
+
+    n = np.random.choice(ngrams, p=pvals)
+    if goal_num_predict is not None:
+      n = min(n, goal_num_predict - num_predict)
+    ctx_size = (n * FLAGS.mask_alpha) // FLAGS.mask_beta
+    l_ctx = np.random.choice(ctx_size)
+    r_ctx = ctx_size - l_ctx
+
+    # Find the start position of a complete token
+    beg = cur_len + l_ctx
+    while beg < seg_len and not _is_start_piece(sp.IdToPiece(seg[beg].item())):
+      beg += 1
+    if beg >= seg_len:
+      break
+
+    # Find the end position of the n-gram (start pos of the n+1-th gram);
+    # each later position must be tested, hence seg[end] rather than seg[beg]
+    end = beg + 1
+    cnt_ngram = 1
+    while end < seg_len:
+      if _is_start_piece(sp.IdToPiece(seg[end].item())):
+        cnt_ngram += 1
+        if cnt_ngram > n:
+          break
+      end += 1
+    if end >= seg_len:
+      break
+
+    # Update
+    mask[beg:end] = True
+    num_predict += end - beg
+
+    cur_len = end + r_ctx
+
+  while goal_num_predict is not None and num_predict < goal_num_predict:
+    i = np.random.randint(seg_len)
+    if not mask[i]:
+      mask[i] = True
+      num_predict += 1
+
+  if reverse:
+    mask = np.flip(mask, 0)
+
+  return mask
+
+
+def create_tfrecords(save_dir, basename, data, bsz_per_host, seq_len,
+                     bi_data, sp):
+  data, sent_ids = data[0], data[1]
+
+  num_core = FLAGS.num_core_per_host
+  bsz_per_core = bsz_per_host // num_core
+
+  if bi_data:
+    assert bsz_per_host % (2 * FLAGS.num_core_per_host) == 0
+    fwd_data, fwd_sent_ids = batchify(data, bsz_per_host // 2, sent_ids)
+
+    fwd_data = fwd_data.reshape(num_core, 1, bsz_per_core // 2, -1)
+    fwd_sent_ids = fwd_sent_ids.reshape(num_core, 1, bsz_per_core // 2, -1)
+
+    bwd_data = fwd_data[:, :, :, ::-1]
+    bwd_sent_ids = fwd_sent_ids[:, :, :, ::-1]
+
+    data = np.concatenate(
+        [fwd_data, bwd_data], 1).reshape(bsz_per_host, -1)
+    sent_ids = np.concatenate(
+        [fwd_sent_ids, bwd_sent_ids], 1).reshape(bsz_per_host, -1)
+  else:
+    data, sent_ids = batchify(data, bsz_per_host, sent_ids)
+
+  tf.logging.info("Raw data shape %s.", data.shape)
+
+  file_name = format_filename(
+      prefix=basename,
+      bsz_per_host=bsz_per_host,
+      seq_len=seq_len,
+      bi_data=bi_data,
+      suffix="tfrecords",
+      mask_alpha=FLAGS.mask_alpha,
+      mask_beta=FLAGS.mask_beta,
+      reuse_len=FLAGS.reuse_len,
+      uncased=FLAGS.uncased,
+      fixed_num_predict=FLAGS.num_predict
+  )
+  save_path = os.path.join(save_dir, file_name)
+  record_writer = tf.python_io.TFRecordWriter(save_path)
+  tf.logging.info("Start writing %s.", save_path)
+
+  num_batch = 0
+  reuse_len = FLAGS.reuse_len
+
+  # [sep] x 2 + [cls]
+  assert reuse_len < seq_len - 3
+
+  data_len =
data.shape[1] + sep_array = np.array([SEP_ID], dtype=np.int64) + cls_array = np.array([CLS_ID], dtype=np.int64) + + i = 0 + while i + seq_len <= data_len: + if num_batch % 500 == 0: + tf.logging.info("Processing batch %d", num_batch) + + all_ok = True + features = [] + for idx in range(bsz_per_host): + inp = data[idx, i: i + reuse_len] + tgt = data[idx, i + 1: i + reuse_len + 1] + + results = _split_a_and_b( + data[idx], + sent_ids[idx], + begin_idx=i + reuse_len, + tot_len=seq_len - reuse_len - 3, + extend_target=True) + if results is None: + tf.logging.info("Break out with seq idx %d", i) + all_ok = False + break + + # unpack the results + (a_data, b_data, label, _, a_target, b_target) = tuple(results) + + # sample ngram spans to predict + reverse = bi_data and (idx // (bsz_per_core // 2)) % 2 == 1 + if FLAGS.num_predict is None: + num_predict_0 = num_predict_1 = None + else: + num_predict_1 = FLAGS.num_predict // 2 + num_predict_0 = FLAGS.num_predict - num_predict_1 + mask_0 = _sample_mask(sp, inp, reverse=reverse, + goal_num_predict=num_predict_0) + mask_1 = _sample_mask(sp, np.concatenate([a_data, sep_array, b_data, + sep_array, cls_array]), + reverse=reverse, goal_num_predict=num_predict_1) + + # concatenate data + cat_data = np.concatenate([inp, a_data, sep_array, b_data, + sep_array, cls_array]) + seg_id = ([0] * (reuse_len + a_data.shape[0]) + [0] + + [1] * b_data.shape[0] + [1] + [2]) + assert cat_data.shape[0] == seq_len + assert mask_0.shape[0] == seq_len // 2 + assert mask_1.shape[0] == seq_len // 2 + + # the last two CLS's are not used, just for padding purposes + tgt = np.concatenate([tgt, a_target, b_target, cls_array, cls_array]) + assert tgt.shape[0] == seq_len + + is_masked = np.concatenate([mask_0, mask_1], 0) + if FLAGS.num_predict is not None: + assert np.sum(is_masked) == FLAGS.num_predict + + feature = { + "input": _int64_feature(cat_data), + "is_masked": _int64_feature(is_masked), + "target": _int64_feature(tgt), + "seg_id": _int64_feature(seg_id), + "label": _int64_feature([label]), + } + features.append(feature) + + if all_ok: + assert len(features) == bsz_per_host + for feature in features: + example = tf.train.Example(features=tf.train.Features(feature=feature)) + record_writer.write(example.SerializeToString()) + num_batch += 1 + else: + break + + i += reuse_len + + record_writer.close() + tf.logging.info("Done writing %s. 
Num of batches: %d", save_path, num_batch)
+
+  return save_path, num_batch
+
+
+################
+# get_input_fn #
+################
+def _convert_example(example, use_bfloat16):
+  """Cast int64 into int32 and float32 to bfloat16 if use_bfloat16."""
+  for key in list(example.keys()):
+    val = example[key]
+    if tf.keras.backend.is_sparse(val):
+      val = tf.sparse.to_dense(val)
+    if val.dtype == tf.int64:
+      val = tf.cast(val, tf.int32)
+    if use_bfloat16 and val.dtype == tf.float32:
+      val = tf.cast(val, tf.bfloat16)
+
+    example[key] = val
+
+
+def parse_files_to_dataset(parser, file_names, split, num_batch, num_hosts,
+                           host_id, num_core_per_host, bsz_per_core):
+  # list of file paths
+  num_files = len(file_names)
+  num_files_per_host = num_files // num_hosts
+  my_start_file_id = host_id * num_files_per_host
+  my_end_file_id = (host_id + 1) * num_files_per_host
+  if host_id == num_hosts - 1:
+    my_end_file_id = num_files
+  file_paths = file_names[my_start_file_id: my_end_file_id]
+  tf.logging.info("Host %d handles %d files", host_id, len(file_paths))
+
+  assert split == "train"
+  dataset = tf.data.Dataset.from_tensor_slices(file_paths)
+
+  # file-level shuffle
+  if len(file_paths) > 1:
+    dataset = dataset.shuffle(len(file_paths))
+
+  # Note: we cannot perform sample-level shuffle here because this will
+  # violate the consecutive requirement of the data stream.
+  dataset = tf.data.TFRecordDataset(dataset)
+
+  # (zihang): since we are doing online preprocessing, the parsed result of
+  # the same input at each time will be different. Thus, caching processed
+  # data is not helpful. It will use a lot of memory and lead to container
+  # OOM. So, change to cache non-parsed raw data instead.
+  dataset = dataset.cache().map(parser).repeat()
+  dataset = dataset.batch(bsz_per_core, drop_remainder=True)
+  dataset = dataset.prefetch(num_core_per_host * bsz_per_core)
+
+  return dataset
+
+
+def _local_perm(inputs, targets, is_masked, perm_size, seq_len):
+  """
+  Sample a permutation of the factorization order, and create an
+  attention mask accordingly.
+
+  Args:
+    inputs: int64 Tensor in shape [seq_len], input ids.
+    targets: int64 Tensor in shape [seq_len], target ids.
+    is_masked: bool Tensor in shape [seq_len]. True means being selected
+      for partial prediction.
+    perm_size: the length of the longest permutation. Could be set to be
+      reuse_len. Should not be larger than reuse_len or there will be
+      data leaks.
+    seq_len: int, sequence length.
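+
+  Returns:
+    perm_mask: float32 Tensor in shape [seq_len, seq_len]; 1 at [i, j] means
+      position i cannot attend to position j.
+    new_targets: int64 Tensor in shape [seq_len], the realigned targets.
+    target_mask: float32 Tensor in shape [seq_len]; 1 marks positions that
+      carry a prediction loss.
+    inputs_k: int64 Tensor in shape [seq_len], content-stream inputs.
+    inputs_q: float32 Tensor in shape [seq_len], query-stream mask (equal to
+      target_mask).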
+ """ + + # Generate permutation indices + index = tf.range(seq_len, dtype=tf.int64) + index = tf.transpose(tf.reshape(index, [-1, perm_size])) + index = tf.random_shuffle(index) + index = tf.reshape(tf.transpose(index), [-1]) + + # `perm_mask` and `target_mask` + # non-functional tokens + non_func_tokens = tf.logical_not(tf.logical_or( + tf.equal(inputs, SEP_ID), + tf.equal(inputs, CLS_ID))) + + non_mask_tokens = tf.logical_and(tf.logical_not(is_masked), non_func_tokens) + masked_or_func_tokens = tf.logical_not(non_mask_tokens) + + # Set the permutation indices of non-masked (& non-funcional) tokens to the + # smallest index (-1): + # (1) they can be seen by all other positions + # (2) they cannot see masked positions, so there won"t be information leak + smallest_index = -tf.ones([seq_len], dtype=tf.int64) + rev_index = tf.where(non_mask_tokens, smallest_index, index) + + # Create `target_mask`: non-funcional and maksed tokens + # 1: use mask as input and have loss + # 0: use token (or [SEP], [CLS]) as input and do not have loss + target_tokens = tf.logical_and(masked_or_func_tokens, non_func_tokens) + target_mask = tf.cast(target_tokens, tf.float32) + + # Create `perm_mask` + # `target_tokens` cannot see themselves + self_rev_index = tf.where(target_tokens, rev_index, rev_index + 1) + + # 1: cannot attend if i <= j and j is not non-masked (masked_or_func_tokens) + # 0: can attend if i > j or j is non-masked + perm_mask = tf.logical_and( + self_rev_index[:, None] <= rev_index[None, :], + masked_or_func_tokens) + perm_mask = tf.cast(perm_mask, tf.float32) + + # new target: [next token] for LM and [curr token] (self) for PLM + new_targets = tf.concat([inputs[0: 1], targets[: -1]], + axis=0) + + # construct inputs_k + inputs_k = inputs + + # construct inputs_q + inputs_q = target_mask + + return perm_mask, new_targets, target_mask, inputs_k, inputs_q + + +def get_dataset(params, num_hosts, num_core_per_host, split, file_names, + num_batch, seq_len, reuse_len, perm_size, mask_alpha, + mask_beta, use_bfloat16=False, num_predict=None): + + bsz_per_core = params["batch_size"] + if num_hosts > 1: + host_id = params["context"].current_host + else: + host_id = 0 + + #### Function used to parse tfrecord + def parser(record): + """function used to parse tfrecord.""" + + record_spec = { + "input": tf.FixedLenFeature([seq_len], tf.int64), + "target": tf.FixedLenFeature([seq_len], tf.int64), + "seg_id": tf.FixedLenFeature([seq_len], tf.int64), + "label": tf.FixedLenFeature([1], tf.int64), + "is_masked": tf.FixedLenFeature([seq_len], tf.int64), + } + + # retrieve serialized example + example = tf.parse_single_example( + serialized=record, + features=record_spec) + + inputs = example.pop("input") + target = example.pop("target") + is_masked = tf.cast(example.pop("is_masked"), tf.bool) + + non_reuse_len = seq_len - reuse_len + assert perm_size <= reuse_len and perm_size <= non_reuse_len + + perm_mask_0, target_0, target_mask_0, input_k_0, input_q_0 = _local_perm( + inputs[:reuse_len], + target[:reuse_len], + is_masked[:reuse_len], + perm_size, + reuse_len) + + perm_mask_1, target_1, target_mask_1, input_k_1, input_q_1 = _local_perm( + inputs[reuse_len:], + target[reuse_len:], + is_masked[reuse_len:], + perm_size, + non_reuse_len) + + perm_mask_0 = tf.concat([perm_mask_0, tf.ones([reuse_len, non_reuse_len])], + axis=1) + perm_mask_1 = tf.concat([tf.zeros([non_reuse_len, reuse_len]), perm_mask_1], + axis=1) + perm_mask = tf.concat([perm_mask_0, perm_mask_1], axis=0) + target = tf.concat([target_0, 
target_1], axis=0) + target_mask = tf.concat([target_mask_0, target_mask_1], axis=0) + input_k = tf.concat([input_k_0, input_k_1], axis=0) + input_q = tf.concat([input_q_0, input_q_1], axis=0) + + if num_predict is not None: + indices = tf.range(seq_len, dtype=tf.int64) + bool_target_mask = tf.cast(target_mask, tf.bool) + indices = tf.boolean_mask(indices, bool_target_mask) + + ##### extra padding due to CLS/SEP introduced after prepro + actual_num_predict = tf.shape(indices)[0] + pad_len = num_predict - actual_num_predict + + ##### target_mapping + target_mapping = tf.one_hot(indices, seq_len, dtype=tf.float32) + paddings = tf.zeros([pad_len, seq_len], dtype=target_mapping.dtype) + target_mapping = tf.concat([target_mapping, paddings], axis=0) + example["target_mapping"] = tf.reshape(target_mapping, + [num_predict, seq_len]) + + ##### target + target = tf.boolean_mask(target, bool_target_mask) + paddings = tf.zeros([pad_len], dtype=target.dtype) + target = tf.concat([target, paddings], axis=0) + example["target"] = tf.reshape(target, [num_predict]) + + ##### target mask + target_mask = tf.concat( + [tf.ones([actual_num_predict], dtype=tf.float32), + tf.zeros([pad_len], dtype=tf.float32)], + axis=0) + example["target_mask"] = tf.reshape(target_mask, [num_predict]) + else: + example["target"] = tf.reshape(target, [seq_len]) + example["target_mask"] = tf.reshape(target_mask, [seq_len]) + + # reshape back to fixed shape + example["perm_mask"] = tf.reshape(perm_mask, [seq_len, seq_len]) + example["input_k"] = tf.reshape(input_k, [seq_len]) + example["input_q"] = tf.reshape(input_q, [seq_len]) + + _convert_example(example, use_bfloat16) + + for k, v in example.items(): + tf.logging.info("%s: %s", k, v) + + return example + + # Get dataset + dataset = parse_files_to_dataset( + parser=parser, + file_names=file_names, + split=split, + num_batch=num_batch, + num_hosts=num_hosts, + host_id=host_id, + num_core_per_host=num_core_per_host, + bsz_per_core=bsz_per_core) + + return dataset + + +def get_input_fn( + tfrecord_dir, + split, + bsz_per_host, + seq_len, + reuse_len, + bi_data, + num_hosts=1, + num_core_per_host=1, + perm_size=None, + mask_alpha=None, + mask_beta=None, + uncased=False, + num_passes=None, + use_bfloat16=False, + num_predict=None): + + # Merge all record infos into a single one + record_glob_base = format_filename( + prefix="record_info-{}-*".format(split), + bsz_per_host=bsz_per_host, + seq_len=seq_len, + bi_data=bi_data, + suffix="json", + mask_alpha=mask_alpha, + mask_beta=mask_beta, + reuse_len=reuse_len, + uncased=uncased, + fixed_num_predict=num_predict) + + record_info = {"num_batch": 0, "filenames": []} + + tfrecord_dirs = tfrecord_dir.split(",") + tf.logging.info("Use the following tfrecord dirs: %s", tfrecord_dirs) + + for idx, record_dir in enumerate(tfrecord_dirs): + record_glob = os.path.join(record_dir, record_glob_base) + tf.logging.info("[%d] Record glob: %s", idx, record_glob) + + record_paths = sorted(tf.gfile.Glob(record_glob)) + tf.logging.info("[%d] Num of record info path: %d", + idx, len(record_paths)) + + cur_record_info = {"num_batch": 0, "filenames": []} + + for record_info_path in record_paths: + if num_passes is not None: + record_info_name = os.path.basename(record_info_path) + fields = record_info_name.split(".")[0].split("-") + pass_id = int(fields[-1]) + if len(fields) == 5 and pass_id >= num_passes: + tf.logging.info("Skip pass %d: %s", pass_id, record_info_name) + continue + + with tf.gfile.Open(record_info_path, "r") as fp: + info = json.load(fp) 
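+      # `info` is the dict written by create_data above, i.e.
+      # {"filenames": [<tfrecord basenames>], "num_batch": <int>}.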
+      if num_passes is not None:
+        eff_num_passes = min(num_passes, len(info["filenames"]))
+        ratio = eff_num_passes / len(info["filenames"])
+        cur_record_info["num_batch"] += int(info["num_batch"] * ratio)
+        cur_record_info["filenames"] += info["filenames"][:eff_num_passes]
+      else:
+        cur_record_info["num_batch"] += info["num_batch"]
+        cur_record_info["filenames"] += info["filenames"]
+
+    # overwrite directory for `cur_record_info`
+    new_filenames = []
+    for filename in cur_record_info["filenames"]:
+      basename = os.path.basename(filename)
+      new_filename = os.path.join(record_dir, basename)
+      new_filenames.append(new_filename)
+    cur_record_info["filenames"] = new_filenames
+
+    tf.logging.info("[Dir %d] Number of chosen batches: %s",
+                    idx, cur_record_info["num_batch"])
+    tf.logging.info("[Dir %d] Number of chosen files: %s",
+                    idx, len(cur_record_info["filenames"]))
+    tf.logging.info(cur_record_info["filenames"])
+
+    # add `cur_record_info` to global `record_info`
+    record_info["num_batch"] += cur_record_info["num_batch"]
+    record_info["filenames"] += cur_record_info["filenames"]
+
+  tf.logging.info("Total number of batches: %d",
+                  record_info["num_batch"])
+  tf.logging.info("Total number of files: %d",
+                  len(record_info["filenames"]))
+  tf.logging.info(record_info["filenames"])
+
+  def input_fn(params):
+    """docs."""
+    assert params["batch_size"] * num_core_per_host == bsz_per_host
+
+    dataset = get_dataset(
+        params=params,
+        num_hosts=num_hosts,
+        num_core_per_host=num_core_per_host,
+        split=split,
+        file_names=record_info["filenames"],
+        num_batch=record_info["num_batch"],
+        seq_len=seq_len,
+        reuse_len=reuse_len,
+        perm_size=perm_size,
+        mask_alpha=mask_alpha,
+        mask_beta=mask_beta,
+        use_bfloat16=use_bfloat16,
+        num_predict=num_predict)
+
+    return dataset
+
+  return input_fn, record_info
+
+
+if __name__ == "__main__":
+  FLAGS = flags.FLAGS
+  flags.DEFINE_bool("use_tpu", True, help="whether to use TPUs")
+  flags.DEFINE_integer("bsz_per_host", 32, help="batch size per host.")
+  flags.DEFINE_integer("num_core_per_host", 8, help="num TPU cores per host.")
+
+  flags.DEFINE_integer("seq_len", 512,
+                       help="Sequence length.")
+  flags.DEFINE_integer("reuse_len", 256,
+                       help="Number of tokens that can be reused as memory. "
+                       "Could be half of `seq_len`.")
+  flags.DEFINE_bool("uncased", True, help="Use uncased inputs or not.")
+  flags.DEFINE_bool("bi_data", True,
+                    help="whether to create bidirectional data")
+  flags.DEFINE_integer("mask_alpha", default=6,
+                       help="How many tokens to form a group.")
+  flags.DEFINE_integer("mask_beta", default=1,
+                       help="How many tokens to mask within each group.")
+  flags.DEFINE_bool("use_eod", True,
+                    help="whether to append EOD at the end of a doc.")
+  flags.DEFINE_bool("from_raw_text", True,
+                    help="Whether the input is raw text or encoded ids.")
+  flags.DEFINE_integer("num_predict", default=85,
+                       help="Num of tokens to predict.")
+
+  flags.DEFINE_string("input_glob", "data/example/*.txt",
+                      help="Input file glob.")
+  flags.DEFINE_string("sp_path", "", help="Path to the sentence piece model.")
+  flags.DEFINE_string("save_dir", "proc_data/example",
+                      help="Directory for saving the processed data.")
+  flags.DEFINE_enum("split", "train", ["train", "dev", "test"],
+                    help="Save the data as which split.")
+
+  flags.DEFINE_integer("pass_id", 0, help="ID of the current pass. "
+                       "Different passes sample different negative segments.")
+  flags.DEFINE_integer("num_task", 1, help="Number of total tasks.")
+  flags.DEFINE_integer("task", 0, help="The Task ID.
This value is used when " + "using multiple workers to identify each worker.") + + tf.logging.set_verbosity(tf.logging.INFO) + tf.app.run(create_data) diff --git a/baselines/models/xlnet/function_builder.py b/baselines/models/xlnet/function_builder.py new file mode 100644 index 0000000..3d09335 --- /dev/null +++ b/baselines/models/xlnet/function_builder.py @@ -0,0 +1,362 @@ +"""doc.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools +import os +import tensorflow as tf +import modeling +import xlnet + + +def construct_scalar_host_call( + monitor_dict, + model_dir, + prefix="", + reduce_fn=None): + """ + Construct host calls to monitor training progress on TPUs. + """ + + metric_names = list(monitor_dict.keys()) + + def host_call_fn(global_step, *args): + """actual host call function.""" + step = global_step[0] + with tf.contrib.summary.create_file_writer( + logdir=model_dir, filename_suffix=".host_call").as_default(): + with tf.contrib.summary.always_record_summaries(): + for i, name in enumerate(metric_names): + if reduce_fn is None: + scalar = args[i][0] + else: + scalar = reduce_fn(args[i]) + with tf.contrib.summary.record_summaries_every_n_global_steps( + 100, global_step=step): + tf.contrib.summary.scalar(prefix + name, scalar, step=step) + + return tf.contrib.summary.all_summary_ops() + + global_step_tensor = tf.reshape(tf.train.get_or_create_global_step(), [1]) + other_tensors = [tf.reshape(monitor_dict[key], [1]) for key in metric_names] + + return host_call_fn, [global_step_tensor] + other_tensors + + +def two_stream_loss(FLAGS, features, labels, mems, is_training): + """Pretraining loss with two-stream attention Transformer-XL.""" + + #### Unpack input + mem_name = "mems" + mems = mems.get(mem_name, None) + + inp_k = tf.transpose(features["input_k"], [1, 0]) + inp_q = tf.transpose(features["input_q"], [1, 0]) + + seg_id = tf.transpose(features["seg_id"], [1, 0]) + + inp_mask = None + perm_mask = tf.transpose(features["perm_mask"], [1, 2, 0]) + + if FLAGS.num_predict is not None: + # [num_predict x tgt_len x bsz] + target_mapping = tf.transpose(features["target_mapping"], [1, 2, 0]) + else: + target_mapping = None + + # target for LM loss + tgt = tf.transpose(features["target"], [1, 0]) + + # target mask for LM loss + tgt_mask = tf.transpose(features["target_mask"], [1, 0]) + + # construct xlnet config and save to model_dir + xlnet_config = xlnet.XLNetConfig(FLAGS=FLAGS) + xlnet_config.to_json(os.path.join(FLAGS.model_dir, "config.json")) + + # construct run config from FLAGS + run_config = xlnet.create_run_config(is_training, False, FLAGS) + + xlnet_model = xlnet.XLNetModel( + xlnet_config=xlnet_config, + run_config=run_config, + input_ids=inp_k, + seg_ids=seg_id, + input_mask=inp_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + inp_q=inp_q) + + output = xlnet_model.get_sequence_output() + new_mems = {mem_name: xlnet_model.get_new_memory()} + lookup_table = xlnet_model.get_embedding_table() + + initializer = xlnet_model.get_initializer() + + with tf.variable_scope("model", reuse=tf.AUTO_REUSE): + # LM loss + lm_loss = modeling.lm_loss( + hidden=output, + target=tgt, + n_token=xlnet_config.n_token, + d_model=xlnet_config.d_model, + initializer=initializer, + lookup_table=lookup_table, + tie_weight=True, + bi_data=run_config.bi_data, + use_tpu=run_config.use_tpu) + + #### Quantity to monitor + monitor_dict = {} + + if FLAGS.use_bfloat16: + tgt_mask = tf.cast(tgt_mask, 
tf.float32) + lm_loss = tf.cast(lm_loss, tf.float32) + + total_loss = tf.reduce_sum(lm_loss * tgt_mask) / tf.reduce_sum(tgt_mask) + monitor_dict["total_loss"] = total_loss + + return total_loss, new_mems, monitor_dict + + +def get_loss(FLAGS, features, labels, mems, is_training): + """Pretraining loss with two-stream attention Transformer-XL.""" + if FLAGS.use_bfloat16: + with tf.tpu.bfloat16_scope(): + return two_stream_loss(FLAGS, features, labels, mems, is_training) + else: + return two_stream_loss(FLAGS, features, labels, mems, is_training) + + +def get_classification_loss( + FLAGS, features, n_class, is_training): + """Loss for downstream classification tasks.""" + + bsz_per_core = tf.shape(features["input_ids"])[0] + + inp = tf.transpose(features["input_ids"], [1, 0]) + seg_id = tf.transpose(features["segment_ids"], [1, 0]) + inp_mask = tf.transpose(features["input_mask"], [1, 0]) + label = tf.reshape(features["label_ids"], [bsz_per_core]) + + xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) + run_config = xlnet.create_run_config(is_training, True, FLAGS) + + xlnet_model = xlnet.XLNetModel( + xlnet_config=xlnet_config, + run_config=run_config, + input_ids=inp, + seg_ids=seg_id, + input_mask=inp_mask) + + summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) + + with tf.variable_scope("model", reuse=tf.AUTO_REUSE): + + if FLAGS.cls_scope is not None and FLAGS.cls_scope: + cls_scope = "classification_{}".format(FLAGS.cls_scope) + else: + cls_scope = "classification_{}".format(FLAGS.task_name.lower()) + + per_example_loss, logits = modeling.classification_loss( + hidden=summary, + labels=label, + n_class=n_class, + initializer=xlnet_model.get_initializer(), + scope=cls_scope, + return_logits=True) + + total_loss = tf.reduce_mean(per_example_loss) + + return total_loss, per_example_loss, logits + + +def get_regression_loss( + FLAGS, features, is_training): + """Loss for downstream regression tasks.""" + + bsz_per_core = tf.shape(features["input_ids"])[0] + + inp = tf.transpose(features["input_ids"], [1, 0]) + seg_id = tf.transpose(features["segment_ids"], [1, 0]) + inp_mask = tf.transpose(features["input_mask"], [1, 0]) + label = tf.reshape(features["label_ids"], [bsz_per_core]) + + xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) + run_config = xlnet.create_run_config(is_training, True, FLAGS) + + xlnet_model = xlnet.XLNetModel( + xlnet_config=xlnet_config, + run_config=run_config, + input_ids=inp, + seg_ids=seg_id, + input_mask=inp_mask) + + summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) + + with tf.variable_scope("model", reuse=tf.AUTO_REUSE): + per_example_loss, logits = modeling.regression_loss( + hidden=summary, + labels=label, + initializer=xlnet_model.get_initializer(), + scope="regression_{}".format(FLAGS.task_name.lower()), + return_logits=True) + + total_loss = tf.reduce_mean(per_example_loss) + + return total_loss, per_example_loss, logits + + +def get_qa_outputs(FLAGS, features, is_training): + """Loss for downstream span-extraction QA tasks such as SQuAD.""" + + inp = tf.transpose(features["input_ids"], [1, 0]) + seg_id = tf.transpose(features["segment_ids"], [1, 0]) + inp_mask = tf.transpose(features["input_mask"], [1, 0]) + + seq_len = tf.shape(inp)[0] + + xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) + run_config = xlnet.create_run_config(is_training, True, FLAGS) + + xlnet_model = xlnet.XLNetModel( + xlnet_config=xlnet_config, + run_config=run_config, + 
input_ids=inp, + seg_ids=seg_id, + input_mask=inp_mask) + output = xlnet_model.get_sequence_output() + initializer = xlnet_model.get_initializer() + + return_dict = {} + + # invalid position mask such as query and special symbols (PAD, SEP, CLS) + p_mask = features["p_mask"] + + # logit of the start position + with tf.variable_scope("start_logits"): + start_logits = tf.layers.dense( + output, + 1, + kernel_initializer=initializer) + start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0]) + start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask + start_log_probs = tf.nn.log_softmax(start_logits_masked, -1) + + # logit of the end position + with tf.variable_scope("end_logits"): + if is_training: + # during training, compute the end logits based on the + # ground truth of the start position + + start_positions = tf.reshape(features["start_positions"], [-1]) + start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1, + dtype=tf.float32) + start_features = tf.einsum("lbh,bl->bh", output, start_index) + start_features = tf.tile(start_features[None], [seq_len, 1, 1]) + end_logits = tf.layers.dense( + tf.concat([output, start_features], axis=-1), xlnet_config.d_model, + kernel_initializer=initializer, activation=tf.tanh, name="dense_0") + end_logits = tf.contrib.layers.layer_norm( + end_logits, begin_norm_axis=-1) + + end_logits = tf.layers.dense( + end_logits, 1, + kernel_initializer=initializer, + name="dense_1") + end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0]) + end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask + end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) + else: + # during inference, compute the end logits based on beam search + + start_top_log_probs, start_top_index = tf.nn.top_k( + start_log_probs, k=FLAGS.start_n_top) + start_index = tf.one_hot(start_top_index, + depth=seq_len, axis=-1, dtype=tf.float32) + start_features = tf.einsum("lbh,bkl->bkh", output, start_index) + end_input = tf.tile(output[:, :, None], + [1, 1, FLAGS.start_n_top, 1]) + start_features = tf.tile(start_features[None], + [seq_len, 1, 1, 1]) + end_input = tf.concat([end_input, start_features], axis=-1) + end_logits = tf.layers.dense( + end_input, + xlnet_config.d_model, + kernel_initializer=initializer, + activation=tf.tanh, + name="dense_0") + end_logits = tf.contrib.layers.layer_norm(end_logits, + begin_norm_axis=-1) + end_logits = tf.layers.dense( + end_logits, + 1, + kernel_initializer=initializer, + name="dense_1") + end_logits = tf.reshape(end_logits, [seq_len, -1, FLAGS.start_n_top]) + end_logits = tf.transpose(end_logits, [1, 2, 0]) + end_logits_masked = end_logits * ( + 1 - p_mask[:, None]) - 1e30 * p_mask[:, None] + end_log_probs = tf.nn.log_softmax(end_logits_masked, -1) + end_top_log_probs, end_top_index = tf.nn.top_k( + end_log_probs, k=FLAGS.end_n_top) + end_top_log_probs = tf.reshape( + end_top_log_probs, + [-1, FLAGS.start_n_top * FLAGS.end_n_top]) + end_top_index = tf.reshape( + end_top_index, + [-1, FLAGS.start_n_top * FLAGS.end_n_top]) + + if is_training: + return_dict["start_log_probs"] = start_log_probs + return_dict["end_log_probs"] = end_log_probs + else: + return_dict["start_top_log_probs"] = start_top_log_probs + return_dict["start_top_index"] = start_top_index + return_dict["end_top_log_probs"] = end_top_log_probs + return_dict["end_top_index"] = end_top_index + + return return_dict + + +def get_race_loss(FLAGS, features, is_training): + """Loss for downstream multi-choice QA tasks such as RACE.""" + + bsz_per_core = 
tf.shape(features["input_ids"])[0] + + def _transform_features(feature): + out = tf.reshape(feature, [bsz_per_core, 4, -1]) + out = tf.transpose(out, [2, 0, 1]) + out = tf.reshape(out, [-1, bsz_per_core * 4]) + return out + + inp = _transform_features(features["input_ids"]) + seg_id = _transform_features(features["segment_ids"]) + inp_mask = _transform_features(features["input_mask"]) + label = tf.reshape(features["label_ids"], [bsz_per_core]) + + xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path) + run_config = xlnet.create_run_config(is_training, True, FLAGS) + + xlnet_model = xlnet.XLNetModel( + xlnet_config=xlnet_config, + run_config=run_config, + input_ids=inp, + seg_ids=seg_id, + input_mask=inp_mask) + summary = xlnet_model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj) + + with tf.variable_scope("logits"): + logits = tf.layers.dense(summary, 1, + kernel_initializer=xlnet_model.get_initializer()) + logits = tf.reshape(logits, [bsz_per_core, 4]) + + one_hot_target = tf.one_hot(label, 4) + per_example_loss = -tf.reduce_sum( + tf.nn.log_softmax(logits) * one_hot_target, -1) + total_loss = tf.reduce_mean(per_example_loss) + + return total_loss, per_example_loss, logits + diff --git a/baselines/models/xlnet/gpu_utils.py b/baselines/models/xlnet/gpu_utils.py new file mode 100644 index 0000000..e131019 --- /dev/null +++ b/baselines/models/xlnet/gpu_utils.py @@ -0,0 +1,69 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf + +def assign_to_gpu(gpu=0, ps_dev="/device:CPU:0"): + def _assign(op): + node_def = op if isinstance(op, tf.NodeDef) else op.node_def + if node_def.op == "Variable": + return ps_dev + else: + return "/gpu:%d" % gpu + return _assign + + +def average_grads_and_vars(tower_grads_and_vars): + def average_dense(grad_and_vars): + if len(grad_and_vars) == 1: + return grad_and_vars[0][0] + + grad = grad_and_vars[0][0] + for g, _ in grad_and_vars[1:]: + grad += g + return grad / len(grad_and_vars) + + def average_sparse(grad_and_vars): + if len(grad_and_vars) == 1: + return grad_and_vars[0][0] + + indices = [] + values = [] + for g, _ in grad_and_vars: + indices += [g.indices] + values += [g.values] + indices = tf.concat(indices, 0) + values = tf.concat(values, 0) / len(grad_and_vars) + return tf.IndexedSlices(values, indices, grad_and_vars[0][0].dense_shape) + + average_grads_and_vars = [] + for grad_and_vars in zip(*tower_grads_and_vars): + if grad_and_vars[0][0] is None: + grad = None + elif isinstance(grad_and_vars[0][0], tf.IndexedSlices): + grad = average_sparse(grad_and_vars) + else: + grad = average_dense(grad_and_vars) + # Keep in mind that the Variables are redundant because they are shared + # across towers. So .. we will just return the first tower's pointer to + # the Variable. + v = grad_and_vars[0][1] + grad_and_var = (grad, v) + average_grads_and_vars.append(grad_and_var) + return average_grads_and_vars + + +def load_from_checkpoint(saver, logdir): + sess = tf.get_default_session() + ckpt = tf.train.get_checkpoint_state(logdir) + if ckpt and ckpt.model_checkpoint_path: + if os.path.isabs(ckpt.model_checkpoint_path): + # Restores from checkpoint with absolute path. + saver.restore(sess, ckpt.model_checkpoint_path) + else: + # Restores from checkpoint with relative path. 
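+      # The `checkpoint` state file may store `model_checkpoint_path`
+      # relative to the directory it lives in, so the path is resolved
+      # against `logdir` before restoring.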
+ saver.restore(sess, os.path.join(logdir, ckpt.model_checkpoint_path)) + return True + return False diff --git a/baselines/models/xlnet/model_utils.py b/baselines/models/xlnet/model_utils.py new file mode 100644 index 0000000..fd8d6d8 --- /dev/null +++ b/baselines/models/xlnet/model_utils.py @@ -0,0 +1,399 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import os +import re +import numpy as np +import six +from os.path import join +from six.moves import zip + +from absl import flags + +import tensorflow as tf + + +def configure_tpu(FLAGS): + if FLAGS.use_tpu: + tpu_cluster = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + master = tpu_cluster.get_master() + else: + tpu_cluster = None + master = FLAGS.master + + session_config = tf.ConfigProto(allow_soft_placement=True) + # Uncomment the following line if you hope to monitor GPU RAM growth + # session_config.gpu_options.allow_growth = True + + if FLAGS.use_tpu: + strategy = None + tf.logging.info('Use TPU without distribute strategy.') + elif FLAGS.num_core_per_host == 1: + strategy = None + tf.logging.info('Single device mode.') + else: + strategy = tf.contrib.distribute.MirroredStrategy( + num_gpus=FLAGS.num_core_per_host) + tf.logging.info('Use MirroredStrategy with %d devices.', + strategy.num_replicas_in_sync) + + per_host_input = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + master=master, + model_dir=FLAGS.model_dir, + session_config=session_config, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations, + num_shards=FLAGS.num_hosts * FLAGS.num_core_per_host, + per_host_input_for_training=per_host_input), + keep_checkpoint_max=FLAGS.max_save, + save_checkpoints_secs=None, + save_checkpoints_steps=FLAGS.save_steps, + train_distribute=strategy + ) + return run_config + + +def init_from_checkpoint(FLAGS, global_vars=False): + tvars = tf.global_variables() if global_vars else tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if FLAGS.init_checkpoint is not None: + if FLAGS.init_checkpoint.endswith("latest"): + ckpt_dir = os.path.dirname(FLAGS.init_checkpoint) + init_checkpoint = tf.train.latest_checkpoint(ckpt_dir) + else: + init_checkpoint = FLAGS.init_checkpoint + + tf.logging.info("Initialize from the ckpt {}".format(init_checkpoint)) + + (assignment_map, initialized_variable_names + ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if FLAGS.use_tpu: + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + # Log customized initialization + tf.logging.info("**** Global Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + return scaffold_fn + + +def get_train_op(FLAGS, total_loss, grads_and_vars=None): + global_step = tf.train.get_or_create_global_step() + + # increase the learning rate linearly + if FLAGS.warmup_steps > 0: + warmup_lr = (tf.cast(global_step, tf.float32) + / tf.cast(FLAGS.warmup_steps, tf.float32) + * FLAGS.learning_rate) + else: + warmup_lr = 0.0 + + # decay the learning rate + if FLAGS.decay_method == "poly": + decay_lr = 
tf.train.polynomial_decay( + FLAGS.learning_rate, + global_step=global_step - FLAGS.warmup_steps, + decay_steps=FLAGS.train_steps - FLAGS.warmup_steps, + end_learning_rate=FLAGS.learning_rate * FLAGS.min_lr_ratio) + elif FLAGS.decay_method == "cos": + decay_lr = tf.train.cosine_decay( + FLAGS.learning_rate, + global_step=global_step - FLAGS.warmup_steps, + decay_steps=FLAGS.train_steps - FLAGS.warmup_steps, + alpha=FLAGS.min_lr_ratio) + else: + raise ValueError(FLAGS.decay_method) + + learning_rate = tf.where(global_step < FLAGS.warmup_steps, + warmup_lr, decay_lr) + + if (FLAGS.weight_decay > 0 and not FLAGS.use_tpu and + FLAGS.num_core_per_host > 1): + raise ValueError("Do not support `weight_decay > 0` with multi-gpu " + "training so far.") + + if FLAGS.weight_decay == 0: + optimizer = tf.train.AdamOptimizer( + learning_rate=learning_rate, + epsilon=FLAGS.adam_epsilon) + else: + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + epsilon=FLAGS.adam_epsilon, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], + weight_decay_rate=FLAGS.weight_decay) + + if FLAGS.use_tpu: + optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer) + + if grads_and_vars is None: + grads_and_vars = optimizer.compute_gradients(total_loss) + gradients, variables = zip(*grads_and_vars) + clipped, gnorm = tf.clip_by_global_norm(gradients, FLAGS.clip) + + if getattr(FLAGS, "lr_layer_decay_rate", 1.0) != 1.0: + n_layer = 0 + for i in range(len(clipped)): + m = re.search(r"model/transformer/layer_(\d+?)/", variables[i].name) + if not m: continue + n_layer = max(n_layer, int(m.group(1)) + 1) + + for i in range(len(clipped)): + for l in range(n_layer): + if "model/transformer/layer_{}/".format(l) in variables[i].name: + abs_rate = FLAGS.lr_layer_decay_rate ** (n_layer - 1 - l) + clipped[i] *= abs_rate + tf.logging.info("Apply mult {:.4f} to layer-{} grad of {}".format( + abs_rate, l, variables[i].name)) + break + + train_op = optimizer.apply_gradients( + zip(clipped, variables), global_step=global_step) + + # Manually increment `global_step` for AdamWeightDecayOptimizer + if FLAGS.weight_decay > 0: + new_global_step = global_step + 1 + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + + return train_op, learning_rate, gnorm + + +def clean_ckpt(_): + input_ckpt = FLAGS.clean_input_ckpt + output_model_dir = FLAGS.clean_output_model_dir + + tf.reset_default_graph() + + var_list = tf.contrib.framework.list_variables(input_ckpt) + var_values, var_dtypes = {}, {} + for (name, shape) in var_list: + if not name.startswith("global_step") and "adam" not in name.lower(): + var_values[name] = None + tf.logging.info("Include {}".format(name)) + else: + tf.logging.info("Exclude {}".format(name)) + + tf.logging.info("Loading from {}".format(input_ckpt)) + reader = tf.contrib.framework.load_checkpoint(input_ckpt) + for name in var_values: + tensor = reader.get_tensor(name) + var_dtypes[name] = tensor.dtype + var_values[name] = tensor + + with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE): + tf_vars = [ + tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[v]) + for v in var_values + ] + placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] + assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] + global_step = tf.Variable( + 0, name="global_step", trainable=False, dtype=tf.int64) + saver = tf.train.Saver(tf.all_variables()) + + if not tf.gfile.Exists(output_model_dir): + tf.gfile.MakeDirs(output_model_dir) + + # 
Build a graph that contains only the kept variables and assign them the values loaded from the input checkpoint.
+  with tf.Session() as sess:
+    sess.run(tf.initialize_all_variables())
+    for p, assign_op, (name, value) in zip(placeholders, assign_ops,
+                                           six.iteritems(var_values)):
+      sess.run(assign_op, {p: value})
+
+    # Use the built saver to save the cleaned checkpoint.
+    saver.save(sess, join(output_model_dir, "model.ckpt"),
+               global_step=global_step)
+
+
+def avg_checkpoints(model_dir, output_model_dir, last_k):
+  tf.reset_default_graph()
+
+  checkpoint_state = tf.train.get_checkpoint_state(model_dir)
+  checkpoints = checkpoint_state.all_model_checkpoint_paths[- last_k:]
+  var_list = tf.contrib.framework.list_variables(checkpoints[0])
+  var_values, var_dtypes = {}, {}
+  for (name, shape) in var_list:
+    if not name.startswith("global_step"):
+      var_values[name] = np.zeros(shape)
+  for checkpoint in checkpoints:
+    reader = tf.contrib.framework.load_checkpoint(checkpoint)
+    for name in var_values:
+      tensor = reader.get_tensor(name)
+      var_dtypes[name] = tensor.dtype
+      var_values[name] += tensor
+    tf.logging.info("Read from checkpoint %s", checkpoint)
+  for name in var_values:  # Average.
+    var_values[name] /= len(checkpoints)
+
+  with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
+    tf_vars = [
+        tf.get_variable(v, shape=var_values[v].shape, dtype=var_dtypes[v])
+        for v in var_values
+    ]
+  placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars]
+  assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]
+  global_step = tf.Variable(
+      0, name="global_step", trainable=False, dtype=tf.int64)
+  saver = tf.train.Saver(tf.all_variables())
+
+  # Build a graph consisting only of the variables and set them to the
+  # averaged values.
+  with tf.Session() as sess:
+    sess.run(tf.initialize_all_variables())
+    for p, assign_op, (name, value) in zip(placeholders, assign_ops,
+                                           six.iteritems(var_values)):
+      sess.run(assign_op, {p: value})
+    # Use the built saver to save the averaged checkpoint.
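+    # For instance (paths are hypothetical), if finetuning wrote several
+    # checkpoints under "exp/model_dir", then
+    #   avg_checkpoints("exp/model_dir", "exp/avg", last_k=5)
+    # produces a single checkpoint whose weights are the element-wise mean
+    # of the last five checkpoints.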
+    saver.save(sess, join(output_model_dir, "model.ckpt"),
+               global_step=global_step)
+
+
+def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
+  """Compute the union of the current variables and checkpoint variables."""
+  assignment_map = {}
+  initialized_variable_names = {}
+
+  name_to_variable = collections.OrderedDict()
+  for var in tvars:
+    name = var.name
+    m = re.match("^(.*):\\d+$", name)
+    if m is not None:
+      name = m.group(1)
+    name_to_variable[name] = var
+
+  init_vars = tf.train.list_variables(init_checkpoint)
+
+  assignment_map = collections.OrderedDict()
+  for x in init_vars:
+    (name, var) = (x[0], x[1])
+    # tf.logging.info('original name: %s', name)
+    if name not in name_to_variable:
+      continue
+    # assignment_map[name] = name
+    assignment_map[name] = name_to_variable[name]
+    initialized_variable_names[name] = 1
+    initialized_variable_names[name + ":0"] = 1
+
+  return (assignment_map, initialized_variable_names)
+
+
+class AdamWeightDecayOptimizer(tf.train.Optimizer):
+  """A basic Adam optimizer that includes "correct" L2 weight decay."""
+
+  def __init__(self,
+               learning_rate,
+               weight_decay_rate=0.0,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-6,
+               exclude_from_weight_decay=None,
+               include_in_weight_decay=["r_s_bias", "r_r_bias", "r_w_bias"],
+               name="AdamWeightDecayOptimizer"):
+    """Constructs an AdamWeightDecayOptimizer."""
+    super(AdamWeightDecayOptimizer, self).__init__(False, name)
+
+    self.learning_rate = learning_rate
+    self.weight_decay_rate = weight_decay_rate
+    self.beta_1 = beta_1
+    self.beta_2 = beta_2
+    self.epsilon = epsilon
+    self.exclude_from_weight_decay = exclude_from_weight_decay
+    self.include_in_weight_decay = include_in_weight_decay
+
+  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
+    """See base class."""
+    assignments = []
+    for (grad, param) in grads_and_vars:
+      if grad is None or param is None:
+        continue
+
+      param_name = self._get_variable_name(param.name)
+
+      m = tf.get_variable(
+          name=param_name + "/adam_m",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+      v = tf.get_variable(
+          name=param_name + "/adam_v",
+          shape=param.shape.as_list(),
+          dtype=tf.float32,
+          trainable=False,
+          initializer=tf.zeros_initializer())
+
+      # Standard Adam update.
+      next_m = (
+          tf.multiply(self.beta_1, m) + tf.multiply(1.0 - self.beta_1, grad))
+      next_v = (
+          tf.multiply(self.beta_2, v) + tf.multiply(1.0 - self.beta_2,
+                                                    tf.square(grad)))
+
+      update = next_m / (tf.sqrt(next_v) + self.epsilon)
+
+      # Just adding the square of the weights to the loss function is *not*
+      # the correct way of using L2 regularization/weight decay with Adam,
+      # since that will interact with the m and v parameters in strange ways.
+      #
+      # Instead we want to decay the weights in a manner that doesn't interact
+      # with the m/v parameters. This is equivalent to adding the square
+      # of the weights to the loss with plain (non-momentum) SGD.
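+      #
+      # Concretely, the update applied below is
+      #   update <- m / (sqrt(v) + epsilon) + weight_decay_rate * param
+      #   param  <- param - learning_rate * update
+      # i.e. the decoupled (AdamW-style) form of weight decay.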
+ if self._do_use_weight_decay(param_name): + update += self.weight_decay_rate * param + + update_with_lr = self.learning_rate * update + + next_param = param - update_with_lr + + assignments.extend( + [param.assign(next_param), + m.assign(next_m), + v.assign(next_v)]) + + return tf.group(*assignments, name=name) + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if not self.weight_decay_rate: + return False + for r in self.include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self.exclude_from_weight_decay: + for r in self.exclude_from_weight_decay: + if re.search(r, param_name) is not None: + tf.logging.info('Adam WD excludes {}'.format(param_name)) + return False + return True + + def _get_variable_name(self, param_name): + """Get the variable name from the tensor name.""" + m = re.match("^(.*):\\d+$", param_name) + if m is not None: + param_name = m.group(1) + return param_name + + +if __name__ == "__main__": + flags.DEFINE_string("clean_input_ckpt", "", "input ckpt for cleaning") + flags.DEFINE_string("clean_output_model_dir", "", "output dir for cleaned ckpt") + + FLAGS = flags.FLAGS + + tf.app.run(clean_ckpt) diff --git a/baselines/models/xlnet/modeling.py b/baselines/models/xlnet/modeling.py new file mode 100644 index 0000000..a7d719c --- /dev/null +++ b/baselines/models/xlnet/modeling.py @@ -0,0 +1,783 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import tensorflow as tf + + +def gelu(x): + """Gaussian Error Linear Unit. + + This is a smoother version of the RELU. + Original paper: https://arxiv.org/abs/1606.08415 + Args: + x: float Tensor to perform activation. + + Returns: + `x` with the GELU activation applied. 
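+
+  For instance, under the tanh approximation implemented below:
+    gelu(tf.constant([-1.0, 0.0, 1.0]))  # approx. [-0.159, 0.0, 0.841]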
+ """ + cdf = 0.5 * (1.0 + tf.tanh( + (np.sqrt(2 / np.pi) * (x + 0.044715 * tf.pow(x, 3))))) + return x * cdf + + +def embedding_lookup(x, n_token, d_embed, initializer, use_tpu=True, + scope='embedding', reuse=None, dtype=tf.float32): + """TPU and GPU embedding_lookup function.""" + with tf.variable_scope(scope, reuse=reuse): + lookup_table = tf.get_variable('lookup_table', [n_token, d_embed], + dtype=dtype, initializer=initializer) + if use_tpu: + one_hot_idx = tf.one_hot(x, n_token, dtype=dtype) + if one_hot_idx.shape.ndims == 2: + return tf.einsum('in,nd->id', one_hot_idx, lookup_table), lookup_table + else: + return tf.einsum('ibn,nd->ibd', one_hot_idx, lookup_table), lookup_table + else: + return tf.nn.embedding_lookup(lookup_table, x), lookup_table + + +def positional_embedding(pos_seq, inv_freq, bsz=None): + sinusoid_inp = tf.einsum('i,d->id', pos_seq, inv_freq) + pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1) + pos_emb = pos_emb[:, None, :] + + if bsz is not None: + pos_emb = tf.tile(pos_emb, [1, bsz, 1]) + + return pos_emb + + +def positionwise_ffn(inp, d_model, d_inner, dropout, kernel_initializer, + activation_type='relu', scope='ff', is_training=True, + reuse=None): + """Position-wise Feed-forward Network.""" + if activation_type == 'relu': + activation = tf.nn.relu + elif activation_type == 'gelu': + activation = gelu + else: + raise ValueError('Unsupported activation type {}'.format(activation_type)) + + output = inp + with tf.variable_scope(scope, reuse=reuse): + output = tf.layers.dense(output, d_inner, activation=activation, + kernel_initializer=kernel_initializer, + name='layer_1') + output = tf.layers.dropout(output, dropout, training=is_training, + name='drop_1') + output = tf.layers.dense(output, d_model, + kernel_initializer=kernel_initializer, + name='layer_2') + output = tf.layers.dropout(output, dropout, training=is_training, + name='drop_2') + output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1, + scope='LayerNorm') + return output + + +def head_projection(h, d_model, n_head, d_head, kernel_initializer, name): + """Project hidden states to a specific head with a 4D-shape.""" + proj_weight = tf.get_variable('{}/kernel'.format(name), + [d_model, n_head, d_head], dtype=h.dtype, + initializer=kernel_initializer) + head = tf.einsum('ibh,hnd->ibnd', h, proj_weight) + + return head + + +def post_attention(h, attn_vec, d_model, n_head, d_head, dropout, is_training, + kernel_initializer, residual=True): + """Post-attention processing.""" + # post-attention projection (back to `d_model`) + proj_o = tf.get_variable('o/kernel', [d_model, n_head, d_head], + dtype=h.dtype, initializer=kernel_initializer) + attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, proj_o) + + attn_out = tf.layers.dropout(attn_out, dropout, training=is_training) + if residual: + output = tf.contrib.layers.layer_norm(attn_out + h, begin_norm_axis=-1, + scope='LayerNorm') + else: + output = tf.contrib.layers.layer_norm(attn_out, begin_norm_axis=-1, + scope='LayerNorm') + + return output + + +def abs_attn_core(q_head, k_head, v_head, attn_mask, dropatt, is_training, + scale): + """Core absolute positional attention operations.""" + + attn_score = tf.einsum('ibnd,jbnd->ijbn', q_head, k_head) + attn_score *= scale + if attn_mask is not None: + attn_score = attn_score - 1e30 * attn_mask + + # attention probability + attn_prob = tf.nn.softmax(attn_score, 1) + attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training) + + # attention output + attn_vec = 
tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head) + + return attn_vec + + +def rel_attn_core(q_head, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, + r_w_bias, r_r_bias, r_s_bias, attn_mask, dropatt, is_training, + scale): + """Core relative positional attention operations.""" + + # content based attention score + ac = tf.einsum('ibnd,jbnd->ijbn', q_head + r_w_bias, k_head_h) + + # position based attention score + bd = tf.einsum('ibnd,jbnd->ijbn', q_head + r_r_bias, k_head_r) + bd = rel_shift(bd, klen=tf.shape(ac)[1]) + + # segment based attention score + if seg_mat is None: + ef = 0 + else: + ef = tf.einsum('ibnd,snd->ibns', q_head + r_s_bias, seg_embed) + ef = tf.einsum('ijbs,ibns->ijbn', seg_mat, ef) + + # merge attention scores and perform masking + attn_score = (ac + bd + ef) * scale + if attn_mask is not None: + # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask + attn_score = attn_score - 1e30 * attn_mask + + # attention probability + attn_prob = tf.nn.softmax(attn_score, 1) + attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training) + + # attention output + attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, v_head_h) + + return attn_vec + + +def rel_shift(x, klen=-1): + """perform relative shift to form the relative attention score.""" + x_size = tf.shape(x) + + x = tf.reshape(x, [x_size[1], x_size[0], x_size[2], x_size[3]]) + x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1]) + x = tf.reshape(x, [x_size[0], x_size[1] - 1, x_size[2], x_size[3]]) + x = tf.slice(x, [0, 0, 0, 0], [-1, klen, -1, -1]) + + return x + + +def _create_mask(qlen, mlen, dtype=tf.float32, same_length=False): + """create causal attention mask.""" + attn_mask = tf.ones([qlen, qlen], dtype=dtype) + mask_u = tf.matrix_band_part(attn_mask, 0, -1) + mask_dia = tf.matrix_band_part(attn_mask, 0, 0) + attn_mask_pad = tf.zeros([qlen, mlen], dtype=dtype) + ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1) + if same_length: + mask_l = tf.matrix_band_part(attn_mask, -1, 0) + ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1) + + return ret + + +def _cache_mem(curr_out, prev_mem, mem_len, reuse_len=None): + """cache hidden states into memory.""" + if mem_len is None or mem_len == 0: + return None + else: + if reuse_len is not None and reuse_len > 0: + curr_out = curr_out[:reuse_len] + + if prev_mem is None: + new_mem = curr_out[-mem_len:] + else: + new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:] + + return tf.stop_gradient(new_mem) + + +def relative_positional_encoding(qlen, klen, d_model, clamp_len, attn_type, + bi_data, bsz=None, dtype=None): + """create relative positional encoding.""" + freq_seq = tf.range(0, d_model, 2.0) + if dtype is not None and dtype != tf.float32: + freq_seq = tf.cast(freq_seq, dtype=dtype) + inv_freq = 1 / (10000 ** (freq_seq / d_model)) + + if attn_type == 'bi': + # beg, end = klen - 1, -qlen + beg, end = klen, -qlen + elif attn_type == 'uni': + # beg, end = klen - 1, -1 + beg, end = klen, -1 + else: + raise ValueError('Unknown `attn_type` {}.'.format(attn_type)) + + if bi_data: + fwd_pos_seq = tf.range(beg, end, -1.0) + bwd_pos_seq = tf.range(-beg, -end, 1.0) + + if dtype is not None and dtype != tf.float32: + fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) + bwd_pos_seq = tf.cast(bwd_pos_seq, dtype=dtype) + + if clamp_len > 0: + fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -clamp_len, clamp_len) + bwd_pos_seq = tf.clip_by_value(bwd_pos_seq, -clamp_len, clamp_len) + + if bsz is not None: + # With bi_data, the batch size should be divisible 
by 2. + assert bsz%2 == 0 + fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz//2) + bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq, bsz//2) + else: + fwd_pos_emb = positional_embedding(fwd_pos_seq, inv_freq) + bwd_pos_emb = positional_embedding(bwd_pos_seq, inv_freq) + + pos_emb = tf.concat([fwd_pos_emb, bwd_pos_emb], axis=1) + else: + fwd_pos_seq = tf.range(beg, end, -1.0) + if dtype is not None and dtype != tf.float32: + fwd_pos_seq = tf.cast(fwd_pos_seq, dtype=dtype) + if clamp_len > 0: + fwd_pos_seq = tf.clip_by_value(fwd_pos_seq, -clamp_len, clamp_len) + pos_emb = positional_embedding(fwd_pos_seq, inv_freq, bsz) + + return pos_emb + + +def multihead_attn(q, k, v, attn_mask, d_model, n_head, d_head, dropout, + dropatt, is_training, kernel_initializer, residual=True, + scope='abs_attn', reuse=None): + """Standard multi-head attention with absolute positional embedding.""" + + scale = 1 / (d_head ** 0.5) + with tf.variable_scope(scope, reuse=reuse): + # attention heads + q_head = head_projection( + q, d_model, n_head, d_head, kernel_initializer, 'q') + k_head = head_projection( + k, d_model, n_head, d_head, kernel_initializer, 'k') + v_head = head_projection( + v, d_model, n_head, d_head, kernel_initializer, 'v') + + # attention vector + attn_vec = abs_attn_core(q_head, k_head, v_head, attn_mask, dropatt, + is_training, scale) + + # post processing + output = post_attention(v, attn_vec, d_model, n_head, d_head, dropout, + is_training, kernel_initializer, residual) + + return output + + + +def rel_multihead_attn(h, r, r_w_bias, r_r_bias, seg_mat, r_s_bias, seg_embed, + attn_mask, mems, d_model, n_head, d_head, dropout, + dropatt, is_training, kernel_initializer, + scope='rel_attn', reuse=None): + """Multi-head attention with relative positional encoding.""" + + scale = 1 / (d_head ** 0.5) + with tf.variable_scope(scope, reuse=reuse): + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], 0) + else: + cat = h + + # content heads + q_head_h = head_projection( + h, d_model, n_head, d_head, kernel_initializer, 'q') + k_head_h = head_projection( + cat, d_model, n_head, d_head, kernel_initializer, 'k') + v_head_h = head_projection( + cat, d_model, n_head, d_head, kernel_initializer, 'v') + + # positional heads + k_head_r = head_projection( + r, d_model, n_head, d_head, kernel_initializer, 'r') + + # core attention ops + attn_vec = rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, + r_r_bias, r_s_bias, attn_mask, dropatt, is_training, scale) + + # post processing + output = post_attention(h, attn_vec, d_model, n_head, d_head, dropout, + is_training, kernel_initializer) + + return output + + +def two_stream_rel_attn(h, g, r, mems, r_w_bias, r_r_bias, seg_mat, r_s_bias, + seg_embed, attn_mask_h, attn_mask_g, target_mapping, + d_model, n_head, d_head, dropout, dropatt, is_training, + kernel_initializer, scope='rel_attn'): + """Two-stream attention with relative positional encoding.""" + + scale = 1 / (d_head ** 0.5) + with tf.variable_scope(scope, reuse=False): + + # content based attention score + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], 0) + else: + cat = h + + # content-based key head + k_head_h = head_projection( + cat, d_model, n_head, d_head, kernel_initializer, 'k') + + # content-based value head + v_head_h = head_projection( + cat, d_model, n_head, d_head, kernel_initializer, 'v') + + # position-based key head + k_head_r = head_projection( + r, d_model, n_head, d_head, 
kernel_initializer, 'r') + + ##### h-stream + # content-stream query head + q_head_h = head_projection( + h, d_model, n_head, d_head, kernel_initializer, 'q') + + # core attention ops + attn_vec_h = rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, + r_r_bias, r_s_bias, attn_mask_h, dropatt, is_training, scale) + + # post processing + output_h = post_attention(h, attn_vec_h, d_model, n_head, d_head, dropout, + is_training, kernel_initializer) + + with tf.variable_scope(scope, reuse=True): + ##### g-stream + # query-stream query head + q_head_g = head_projection( + g, d_model, n_head, d_head, kernel_initializer, 'q') + + # core attention ops + if target_mapping is not None: + q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + attn_vec_g = rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, + r_r_bias, r_s_bias, attn_mask_g, dropatt, is_training, scale) + attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + else: + attn_vec_g = rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_embed, seg_mat, r_w_bias, + r_r_bias, r_s_bias, attn_mask_g, dropatt, is_training, scale) + + # post processing + output_g = post_attention(g, attn_vec_g, d_model, n_head, d_head, dropout, + is_training, kernel_initializer) + + return output_h, output_g + + +def transformer_xl(inp_k, n_token, n_layer, d_model, n_head, + d_head, d_inner, dropout, dropatt, attn_type, + bi_data, initializer, is_training, mem_len=None, + inp_q=None, mems=None, + same_length=False, clamp_len=-1, untie_r=False, + use_tpu=True, input_mask=None, + perm_mask=None, seg_id=None, reuse_len=None, + ff_activation='relu', target_mapping=None, + use_bfloat16=False, scope='transformer', **kwargs): + """ + Defines a Transformer-XL computation graph with additional + support for XLNet. + + Args: + + inp_k: int32 Tensor in shape [len, bsz], the input token IDs. + seg_id: int32 Tensor in shape [len, bsz], the input segment IDs. + input_mask: float32 Tensor in shape [len, bsz], the input mask. + 0 for real tokens and 1 for padding. + mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory + from previous batches. The length of the list equals n_layer. + If None, no memory is used. + perm_mask: float32 Tensor in shape [len, len, bsz]. + If perm_mask[i, j, k] = 0, i attend to j in batch k; + if perm_mask[i, j, k] = 1, i does not attend to j in batch k. + If None, each position attends to all the others. + target_mapping: float32 Tensor in shape [num_predict, len, bsz]. + If target_mapping[i, j, k] = 1, the i-th predict in batch k is + on the j-th token. + Only used during pretraining for partial prediction. + Set to None during finetuning. + inp_q: float32 Tensor in shape [len, bsz]. + 1 for tokens with losses and 0 for tokens without losses. + Only used during pretraining for two-stream attention. + Set to None during finetuning. + + n_layer: int, the number of layers. + d_model: int, the hidden size. + n_head: int, the number of attention heads. + d_head: int, the dimension size of each attention head. + d_inner: int, the hidden size in feed-forward layers. + ff_activation: str, "relu" or "gelu". + untie_r: bool, whether to untie the biases in attention. + n_token: int, the vocab size. + + is_training: bool, whether in training mode. + use_tpu: bool, whether TPUs are used. + use_bfloat16: bool, use bfloat16 instead of float32. + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. 
+ init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + summary_type: str, "last", "first", "mean", or "attn". The method + to pool the input to get a vector representation. + initializer: A tf initializer. + scope: scope name for the computation graph. + """ + tf.logging.info('memory input {}'.format(mems)) + tf_float = tf.bfloat16 if use_bfloat16 else tf.float32 + tf.logging.info('Use float type {}'.format(tf_float)) + + new_mems = [] + with tf.variable_scope(scope): + if untie_r: + r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head], + dtype=tf_float, initializer=initializer) + r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head], + dtype=tf_float, initializer=initializer) + else: + r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head], + dtype=tf_float, initializer=initializer) + r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head], + dtype=tf_float, initializer=initializer) + + bsz = tf.shape(inp_k)[1] + qlen = tf.shape(inp_k)[0] + mlen = tf.shape(mems[0])[0] if mems is not None else 0 + klen = mlen + qlen + + ##### Attention mask + # causal attention mask + if attn_type == 'uni': + attn_mask = _create_mask(qlen, mlen, tf_float, same_length) + attn_mask = attn_mask[:, :, None, None] + elif attn_type == 'bi': + attn_mask = None + else: + raise ValueError('Unsupported attention type: {}'.format(attn_type)) + + # data mask: input mask & perm mask + if input_mask is not None and perm_mask is not None: + data_mask = input_mask[None] + perm_mask + elif input_mask is not None and perm_mask is None: + data_mask = input_mask[None] + elif input_mask is None and perm_mask is not None: + data_mask = perm_mask + else: + data_mask = None + + if data_mask is not None: + # all mems can be attended to + mems_mask = tf.zeros([tf.shape(data_mask)[0], mlen, bsz], + dtype=tf_float) + data_mask = tf.concat([mems_mask, data_mask], 1) + if attn_mask is None: + attn_mask = data_mask[:, :, :, None] + else: + attn_mask += data_mask[:, :, :, None] + + if attn_mask is not None: + attn_mask = tf.cast(attn_mask > 0, dtype=tf_float) + + if attn_mask is not None: + non_tgt_mask = -tf.eye(qlen, dtype=tf_float) + non_tgt_mask = tf.concat([tf.zeros([qlen, mlen], dtype=tf_float), + non_tgt_mask], axis=-1) + non_tgt_mask = tf.cast((attn_mask + non_tgt_mask[:, :, None, None]) > 0, + dtype=tf_float) + else: + non_tgt_mask = None + + ##### Word embedding + word_emb_k, lookup_table = embedding_lookup( + x=inp_k, + n_token=n_token, + d_embed=d_model, + initializer=initializer, + use_tpu=use_tpu, + dtype=tf_float, + scope='word_embedding') + + if inp_q is not None: + with tf.variable_scope('mask_emb'): + mask_emb = tf.get_variable('mask_emb', [1, 1, d_model], dtype=tf_float) + if target_mapping is not None: + word_emb_q = tf.tile(mask_emb, 
[tf.shape(target_mapping)[0], bsz, 1]) + else: + inp_q_ext = inp_q[:, :, None] + word_emb_q = inp_q_ext * mask_emb + (1 - inp_q_ext) * word_emb_k + output_h = tf.layers.dropout(word_emb_k, dropout, training=is_training) + if inp_q is not None: + output_g = tf.layers.dropout(word_emb_q, dropout, training=is_training) + + ##### Segment embedding + if seg_id is not None: + if untie_r: + r_s_bias = tf.get_variable('r_s_bias', [n_layer, n_head, d_head], + dtype=tf_float, initializer=initializer) + else: + # default case (tie) + r_s_bias = tf.get_variable('r_s_bias', [n_head, d_head], + dtype=tf_float, initializer=initializer) + + seg_embed = tf.get_variable('seg_embed', [n_layer, 2, n_head, d_head], + dtype=tf_float, initializer=initializer) + + # Convert `seg_id` to one-hot `seg_mat` + mem_pad = tf.zeros([mlen, bsz], dtype=tf.int32) + cat_ids = tf.concat([mem_pad, seg_id], 0) + + # `1` indicates not in the same segment [qlen x klen x bsz] + seg_mat = tf.cast( + tf.logical_not(tf.equal(seg_id[:, None], cat_ids[None, :])), + tf.int32) + seg_mat = tf.one_hot(seg_mat, 2, dtype=tf_float) + else: + seg_mat = None + + ##### Positional encoding + pos_emb = relative_positional_encoding( + qlen, klen, d_model, clamp_len, attn_type, bi_data, + bsz=bsz, dtype=tf_float) + pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training) + + ##### Attention layers + if mems is None: + mems = [None] * n_layer + + for i in range(n_layer): + # cache new mems + new_mems.append(_cache_mem(output_h, mems[i], mem_len, reuse_len)) + + # segment bias + if seg_id is None: + r_s_bias_i = None + seg_embed_i = None + else: + r_s_bias_i = r_s_bias if not untie_r else r_s_bias[i] + seg_embed_i = seg_embed[i] + + with tf.variable_scope('layer_{}'.format(i)): + if inp_q is not None: + output_h, output_g = two_stream_rel_attn( + h=output_h, + g=output_g, + r=pos_emb, + r_w_bias=r_w_bias if not untie_r else r_w_bias[i], + r_r_bias=r_r_bias if not untie_r else r_r_bias[i], + seg_mat=seg_mat, + r_s_bias=r_s_bias_i, + seg_embed=seg_embed_i, + attn_mask_h=non_tgt_mask, + attn_mask_g=attn_mask, + mems=mems[i], + target_mapping=target_mapping, + d_model=d_model, + n_head=n_head, + d_head=d_head, + dropout=dropout, + dropatt=dropatt, + is_training=is_training, + kernel_initializer=initializer) + reuse = True + else: + reuse = False + + output_h = rel_multihead_attn( + h=output_h, + r=pos_emb, + r_w_bias=r_w_bias if not untie_r else r_w_bias[i], + r_r_bias=r_r_bias if not untie_r else r_r_bias[i], + seg_mat=seg_mat, + r_s_bias=r_s_bias_i, + seg_embed=seg_embed_i, + attn_mask=non_tgt_mask, + mems=mems[i], + d_model=d_model, + n_head=n_head, + d_head=d_head, + dropout=dropout, + dropatt=dropatt, + is_training=is_training, + kernel_initializer=initializer, + reuse=reuse) + + if inp_q is not None: + output_g = positionwise_ffn( + inp=output_g, + d_model=d_model, + d_inner=d_inner, + dropout=dropout, + kernel_initializer=initializer, + activation_type=ff_activation, + is_training=is_training) + + output_h = positionwise_ffn( + inp=output_h, + d_model=d_model, + d_inner=d_inner, + dropout=dropout, + kernel_initializer=initializer, + activation_type=ff_activation, + is_training=is_training, + reuse=reuse) + + if inp_q is not None: + output = tf.layers.dropout(output_g, dropout, training=is_training) + else: + output = tf.layers.dropout(output_h, dropout, training=is_training) + + return output, new_mems, lookup_table + + +def lm_loss(hidden, target, n_token, d_model, initializer, lookup_table=None, + tie_weight=False, bi_data=True, 
use_tpu=False):
+  """Language modeling loss (softmax cross entropy over the vocabulary)."""
+
+  with tf.variable_scope('lm_loss'):
+    if tie_weight:
+      assert lookup_table is not None, \
+          'lookup_table cannot be None for tie_weight'
+      softmax_w = lookup_table
+    else:
+      softmax_w = tf.get_variable('weight', [n_token, d_model],
+                                  dtype=hidden.dtype, initializer=initializer)
+
+    softmax_b = tf.get_variable('bias', [n_token], dtype=hidden.dtype,
+                                initializer=tf.zeros_initializer())
+
+    logits = tf.einsum('ibd,nd->ibn', hidden, softmax_w) + softmax_b
+
+    if use_tpu:
+      one_hot_target = tf.one_hot(target, n_token, dtype=logits.dtype)
+      loss = -tf.reduce_sum(tf.nn.log_softmax(logits) * one_hot_target, -1)
+    else:
+      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target,
+                                                            logits=logits)
+
+    return loss
+
+
+def summarize_sequence(summary_type, hidden, d_model, n_head, d_head, dropout,
+                       dropatt, input_mask, is_training, initializer,
+                       scope=None, reuse=None, use_proj=True):
+  """
+  Different classification tasks may or may not share the same parameters
+  to summarize the sequence features.
+
+  If shared, one can keep the `scope` to the default value `None`.
+  Otherwise, one should specify a different `scope` for each task.
+  """
+
+  # NOTE: the misspelled default scope name below is kept as-is so that
+  # variables still match the released XLNet checkpoints.
+  with tf.variable_scope(scope, 'sequnece_summary', reuse=reuse):
+    if summary_type == 'last':
+      summary = hidden[-1]
+    elif summary_type == 'first':
+      summary = hidden[0]
+    elif summary_type == 'mean':
+      summary = tf.reduce_mean(hidden, axis=0)
+    elif summary_type == 'attn':
+      bsz = tf.shape(hidden)[1]
+
+      summary_bias = tf.get_variable('summary_bias', [d_model],
+                                     dtype=hidden.dtype,
+                                     initializer=initializer)
+      summary_bias = tf.tile(summary_bias[None, None], [1, bsz, 1])
+
+      if input_mask is not None:
+        input_mask = input_mask[None, :, :, None]
+
+      summary = multihead_attn(summary_bias, hidden, hidden, input_mask,
+                               d_model, n_head, d_head, dropout, dropatt,
+                               is_training, initializer, residual=False)
+      summary = summary[0]
+    else:
+      raise ValueError('Unsupported summary type {}'.format(summary_type))
+
+    # use another projection as in BERT
+    if use_proj:
+      summary = tf.layers.dense(
+          summary,
+          d_model,
+          activation=tf.tanh,
+          kernel_initializer=initializer,
+          name='summary')
+
+    # dropout
+    summary = tf.layers.dropout(
+        summary, dropout, training=is_training,
+        name='dropout')
+
+  return summary
+
+
+def classification_loss(hidden, labels, n_class, initializer, scope, reuse=None,
+                        return_logits=False):
+  """
+  Different classification tasks should use different scope names to ensure
+  different dense layers (parameters) are used to produce the logits.
+
+  An exception will be in transfer learning, where one hopes to transfer
+  the classification weights.
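+
+  For example, with task_name="tnews" the scope resolves to
+  'classification_tnews', so TNEWS and, say, IFLYTEK each get their own
+  logit layer on top of the shared encoder.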
+ """ + + with tf.variable_scope(scope, reuse=reuse): + logits = tf.layers.dense( + hidden, + n_class, + kernel_initializer=initializer, + name='logit') + + one_hot_target = tf.one_hot(labels, n_class, dtype=hidden.dtype) + loss = -tf.reduce_sum(tf.nn.log_softmax(logits) * one_hot_target, -1) + + if return_logits: + return loss, logits + + return loss + + +def regression_loss(hidden, labels, initializer, scope, reuse=None, + return_logits=False): + with tf.variable_scope(scope, reuse=reuse): + logits = tf.layers.dense( + hidden, + 1, + kernel_initializer=initializer, + name='logit') + + logits = tf.squeeze(logits, axis=-1) + loss = tf.square(logits - labels) + + if return_logits: + return loss, logits + + return loss + diff --git a/baselines/models/xlnet/prepro_utils.py b/baselines/models/xlnet/prepro_utils.py new file mode 100644 index 0000000..1d8ac83 --- /dev/null +++ b/baselines/models/xlnet/prepro_utils.py @@ -0,0 +1,138 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unicodedata +import six +from functools import partial + + +SPIECE_UNDERLINE = '▁' + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def print_(*args): + new_args = [] + for arg in args: + if isinstance(arg, list): + s = [printable_text(i) for i in arg] + s = ' '.join(s) + new_args.append(s) + else: + new_args.append(printable_text(arg)) + print(*new_args) + + +def preprocess_text(inputs, lower=False, remove_space=True, keep_accents=False): + if remove_space: + outputs = ' '.join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if six.PY2 and isinstance(outputs, str): + outputs = outputs.decode('utf-8') + + if not keep_accents: + outputs = unicodedata.normalize('NFKD', outputs) + outputs = ''.join([c for c in outputs if not unicodedata.combining(c)]) + if lower: + outputs = outputs.lower() + + return outputs + + +def encode_pieces(sp_model, text, return_unicode=True, sample=False): + # return_unicode is used only for py2 + + # note(zhiliny): in some systems, sentencepiece only accepts str for py2 + if six.PY2 and isinstance(text, unicode): + text = text.encode('utf-8') + + if not sample: + pieces = sp_model.EncodeAsPieces(text) + else: + pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit(): + cur_pieces = sp_model.EncodeAsPieces( + piece[:-1].replace(SPIECE_UNDERLINE, '')) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + # note(zhiliny): convert back to unicode for py2 + if six.PY2 
and return_unicode: + ret_pieces = [] + for piece in new_pieces: + if isinstance(piece, str): + piece = piece.decode('utf-8') + ret_pieces.append(piece) + new_pieces = ret_pieces + + return new_pieces + + +def encode_ids(sp_model, text, sample=False): + pieces = encode_pieces(sp_model, text, return_unicode=False, sample=sample) + ids = [sp_model.PieceToId(piece) for piece in pieces] + return ids + + +if __name__ == '__main__': + import sentencepiece as spm + + sp = spm.SentencePieceProcessor() + sp.load('sp10m.uncased.v3.model') + + print_(u'I was born in 2000, and this is falsé.') + print_(u'ORIGINAL', sp.EncodeAsPieces(u'I was born in 2000, and this is falsé.')) + print_(u'OURS', encode_pieces(sp, u'I was born in 2000, and this is falsé.')) + print(encode_ids(sp, u'I was born in 2000, and this is falsé.')) + print_('') + prepro_func = partial(preprocess_text, lower=True) + print_(prepro_func('I was born in 2000, and this is falsé.')) + print_('ORIGINAL', sp.EncodeAsPieces(prepro_func('I was born in 2000, and this is falsé.'))) + print_('OURS', encode_pieces(sp, prepro_func('I was born in 2000, and this is falsé.'))) + print(encode_ids(sp, prepro_func('I was born in 2000, and this is falsé.'))) + print_('') + print_('I was born in 2000, and this is falsé.') + print_('ORIGINAL', sp.EncodeAsPieces('I was born in 2000, and this is falsé.')) + print_('OURS', encode_pieces(sp, 'I was born in 2000, and this is falsé.')) + print(encode_ids(sp, 'I was born in 2000, and this is falsé.')) + print_('') + print_('I was born in 92000, and this is falsé.') + print_('ORIGINAL', sp.EncodeAsPieces('I was born in 92000, and this is falsé.')) + print_('OURS', encode_pieces(sp, 'I was born in 92000, and this is falsé.')) + print(encode_ids(sp, 'I was born in 92000, and this is falsé.')) + diff --git a/baselines/models/xlnet/run_classifier.py b/baselines/models/xlnet/run_classifier.py new file mode 100644 index 0000000..8661d5d --- /dev/null +++ b/baselines/models/xlnet/run_classifier.py @@ -0,0 +1,1360 @@ +# -*- coding: utf-8 -*- +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-10 13:46:50 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from os.path import join +from absl import flags +import os +import sys +import csv +import collections +import numpy as np +import time +import math +import json +import random +from copy import copy +from collections import defaultdict as dd + +import absl.logging as _logging # pylint: disable=unused-import +import tensorflow as tf + +import sentencepiece as spm + +from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +import model_utils +import function_builder +from classifier_utils import PaddingInputExample +from classifier_utils import convert_single_example +from classifier_utils import convert_example_list_for_inews +from prepro_utils import preprocess_text, encode_ids + + +# Model +flags.DEFINE_string("model_config_path", default=None, + help="Model config path.") +flags.DEFINE_float("dropout", default=0.1, + help="Dropout rate.") +flags.DEFINE_float("dropatt", default=0.1, + help="Attention dropout rate.") +flags.DEFINE_integer("clamp_len", default=-1, + help="Clamp length") +flags.DEFINE_string("summary_type", default="last", + help="Method used to summarize a sequence into a compact vector.") +flags.DEFINE_bool("use_summ_proj", default=True, + help="Whether to use projection for summarizing sequences.") +flags.DEFINE_bool("use_bfloat16", 
False,
+                  help="Whether to use bfloat16.")
+
+# Parameter initialization
+flags.DEFINE_enum("init", default="normal",
+                  enum_values=["normal", "uniform"],
+                  help="Initialization method.")
+flags.DEFINE_float("init_std", default=0.02,
+                   help="Initialization std when init is normal.")
+flags.DEFINE_float("init_range", default=0.1,
+                   help="Initialization std when init is uniform.")
+
+# I/O paths
+flags.DEFINE_bool("overwrite_data", default=False,
+                  help="If False, will use cached data if available.")
+flags.DEFINE_string("init_checkpoint", default=None,
+                    help="Checkpoint path for initializing the model. "
+                    "Could be a pretrained model or a finetuned model.")
+flags.DEFINE_string("output_dir", default="",
+                    help="Output dir for TF records.")
+flags.DEFINE_string("spiece_model_file", default="",
+                    help="Sentence Piece model path.")
+flags.DEFINE_string("model_dir", default="",
+                    help="Directory for saving the finetuned model.")
+flags.DEFINE_string("data_dir", default="",
+                    help="Directory for input data.")
+
+# TPUs and machines
+flags.DEFINE_bool("use_tpu", default=False, help="Whether to use TPU.")
+flags.DEFINE_integer("num_hosts", default=1, help="How many TPU hosts.")
+flags.DEFINE_integer("num_core_per_host", default=8,
+                     help="8 for TPU v2 and v3-8, 16 for larger TPU v3 pod. In the context "
+                     "of GPU training, it refers to the number of GPUs used.")
+flags.DEFINE_string("tpu_job_name", default=None, help="TPU worker job name.")
+flags.DEFINE_string("tpu", default=None, help="TPU name.")
+flags.DEFINE_string("tpu_zone", default=None, help="TPU zone.")
+flags.DEFINE_string("gcp_project", default=None, help="GCP project name.")
+flags.DEFINE_string("master", default=None,
+                    help="Address of the TensorFlow master.")
+flags.DEFINE_integer("iterations", default=1000,
+                     help="Number of iterations per TPU training loop.")
+
+# training
+flags.DEFINE_bool("do_train", default=False, help="Whether to do training.")
+flags.DEFINE_integer("train_steps", default=1000,
+                     help="Number of training steps.")
+flags.DEFINE_integer("num_train_epochs", default=0,
+                     help="Number of training epochs.")
+flags.DEFINE_integer("warmup_steps", default=0, help="Number of warmup steps.")
+flags.DEFINE_float("learning_rate", default=1e-5, help="Initial learning rate.")
+flags.DEFINE_float("lr_layer_decay_rate", 1.0,
+                   "Top layer: lr[L] = FLAGS.learning_rate. "
+                   "Low layer: lr[l-1] = lr[l] * lr_layer_decay_rate.")
+flags.DEFINE_float("min_lr_ratio", default=0.0,
+                   help="Min lr ratio for cos decay.")
+flags.DEFINE_float("clip", default=1.0, help="Gradient clipping.")
+flags.DEFINE_integer("max_save", default=0,
+                     help="Max number of checkpoints to save. Use 0 to save all.")
+flags.DEFINE_integer("save_steps", default=None,
+                     help="Save the model every `save_steps` steps. "
+                     "If None, no checkpoints are saved.")
+flags.DEFINE_integer("train_batch_size", default=8,
+                     help="Batch size for training.")
+flags.DEFINE_float("weight_decay", default=0.00, help="Weight decay rate.")
+flags.DEFINE_float("adam_epsilon", default=1e-8, help="Adam epsilon.")
+flags.DEFINE_string("decay_method", default="poly", help="poly or cos")
+
+# evaluation
+flags.DEFINE_bool("do_eval", default=False, help="Whether to do eval.")
+flags.DEFINE_bool("do_predict", default=False, help="Whether to do prediction.")
+flags.DEFINE_float("predict_threshold", default=0,
+                   help="Threshold for binary prediction.")
+flags.DEFINE_string("eval_split", default="dev", help="Could be dev or test.")
+flags.DEFINE_integer("eval_batch_size", default=128,
+                     help="Batch size for evaluation.")
+flags.DEFINE_integer("predict_batch_size", default=128,
+                     help="Batch size for prediction.")
+flags.DEFINE_string("predict_dir", default=None,
+                    help="Dir for saving prediction files.")
+flags.DEFINE_bool("eval_all_ckpt", default=False,
+                  help="Eval all ckpts. If False, only evaluate the last one.")
+flags.DEFINE_string("predict_ckpt", default=None,
+                    help="Ckpt path for do_predict. If None, use the last one.")
+
+# task specific
+flags.DEFINE_string("task_name", default=None, help="Task name.")
+flags.DEFINE_integer("max_seq_length", default=128, help="Max sequence length.")
+flags.DEFINE_integer("shuffle_buffer", default=2048,
+                     help="Buffer size used for shuffle.")
+flags.DEFINE_integer("num_passes", default=1,
+                     help="Num passes for processing training data. "
+                     "This is used to batch data without loss for TPUs.")
+flags.DEFINE_bool("uncased", default=False,
+                  help="Use uncased.")
+flags.DEFINE_string("cls_scope", default=None,
+                    help="Classifier layer scope.")
+flags.DEFINE_bool("is_regression", default=False,
+                  help="Whether it's a regression task.")
+
+FLAGS = flags.FLAGS
+
+
+class InputExample(object):
+  """A single training/test example for simple sequence classification."""
+
+  def __init__(self, guid, text_a, text_b=None, label=None):
+    """Constructs an InputExample.
+    Args:
+      guid: Unique id for the example.
+      text_a: string. The untokenized text of the first sequence. For single
+        sequence tasks, only this sequence must be specified.
+      text_b: (Optional) string. The untokenized text of the second sequence.
+        Must be specified only for sequence pair tasks.
+      label: (Optional) string. The label of the example. This should be
+        specified for train and dev examples, but not for test examples.
+    """
+    self.guid = guid
+    self.text_a = text_a
+    self.text_b = text_b
+    self.label = label
+
+
+class DataProcessor(object):
+  """Base class for data converters for sequence classification data sets."""
+
+  def get_train_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the train set."""
+    raise NotImplementedError()
+
+  def get_dev_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the dev set."""
+    raise NotImplementedError()
+
+  def get_test_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for prediction."""
+    raise NotImplementedError()
+
+  def get_labels(self):
+    """Gets the list of labels for this data set."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _read_tsv(cls, input_file, quotechar=None):
+    """Reads a tab separated value file."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+      lines = []
+      for line in reader:
+        if len(line) == 0:
+          continue
+        lines.append(line)
+      return lines
+
+  @classmethod
+  def _read_txt(cls, input_file):
+    """Reads a text file whose fields are separated by '_!_'."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = f.readlines()
+      lines = []
+      for line in reader:
+        lines.append(line.strip().split("_!_"))
+      return lines
+
+
+class InewsProcessor(DataProcessor):
+  """Processor for the iNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = ["0", "1", "2"]
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[2]
+      text_b = line[3]
+      if set_type == "test":
+        label = "0"
+      else:
+        label = line[0]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+def file_based_convert_examples_to_features_for_inews(
+        examples, label_list, max_seq_length, tokenizer, output_file, num_passes=1):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+  num_example = 0
+  if num_passes > 1:
+    examples *= num_passes
+
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 1000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature_list = convert_example_list_for_inews(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+    num_example += len(feature_list)
+
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    def create_float_feature(values):
+      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    for feature in feature_list:
+      features["input_ids"] = create_int_feature(feature.input_ids)
+      # input_mask must be written as floats: the input pipeline in this file
+      # parses it with tf.FixedLenFeature([seq_length], tf.float32).
+      features["input_mask"] = create_float_feature(feature.input_mask)
+      features["segment_ids"] = create_int_feature(feature.segment_ids)
+      features["label_ids"] = create_int_feature([feature.label_id])
+      features["is_real_example"] = create_int_feature(
+          [int(feature.is_real_example)])
+
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+  tf.logging.info("Total number of features written: %s", num_example)
+  writer.close()
+
+
+class TnewsProcessor(DataProcessor):
+  """Processor for the TNEWS data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    # Labels are "100".."116", with "105" and "111" unused.
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[3]
+      text_b = None
+      # The label column is read for every split, including test.
+      label = line[1]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class THUCNewsProcessor(DataProcessor):
+  """Processor for the THUCNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(14):
+      labels.append(str(i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      # Skip the header and lines that do not have all four fields
+      # (text_a is read from line[3] below).
+      if i == 0 or len(line) < 4:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[3]
+      text_b = None
+      label = line[0]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class iFLYTEKDataProcessor(DataProcessor):
+  """Processor for the iFLYTEK app-description data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(119):
+      labels.append(str(i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[1]
+      text_b = None
+ label = line[0] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + +class LCQMCProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_devtest_examples(self, data_dir, set_type="dev"): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), set_type) + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = line[2] + text_a = line[0] + text_b = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + +class BQProcessor(DataProcessor): + """Processor for the internal data set. sentence pair classification""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.txt")), "train") + # dev_0827.tsv + + def get_devtest_examples(self, data_dir, set_type="dev"): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.txt")), set_type) + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.txt")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + # return ["-1","0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + print("length of lines:", len(lines)) + for (i, line) in enumerate(lines): + # print('#i:',i,line) + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + try: + label = line[2] + text_a = line[0] + text_b = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + except Exception: + print('###error.i:', i, line) + return examples + + +class GLUEProcessor(DataProcessor): + def __init__(self): + self.train_file = "train.tsv" + self.dev_file = "dev.tsv" + self.test_file = "test.tsv" + self.label_column = None + self.text_a_column = None + self.text_b_column = None + self.contains_header = True + self.test_text_a_column = None + self.test_text_b_column = None + self.test_contains_header = True + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, self.train_file)), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, self.dev_file)), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + if self.test_text_a_column is None: + self.test_text_a_column = self.text_a_column + if 
self.test_text_b_column is None: + self.test_text_b_column = self.text_b_column + + return self._create_examples( + self._read_tsv(os.path.join(data_dir, self.test_file)), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 and self.contains_header and set_type != "test": + continue + if i == 0 and self.test_contains_header and set_type == "test": + continue + guid = "%s-%s" % (set_type, i) + + a_column = (self.text_a_column if set_type != "test" else + self.test_text_a_column) + b_column = (self.text_b_column if set_type != "test" else + self.test_text_b_column) + + # there are some incomplete lines in QNLI + if len(line) <= a_column: + tf.logging.warning('Incomplete line, ignored.') + continue + text_a = line[a_column] + + if b_column is not None: + if len(line) <= b_column: + tf.logging.warning('Incomplete line, ignored.') + continue + text_b = line[b_column] + else: + text_b = None + + if set_type == "test": + label = self.get_labels()[0] + else: + if len(line) <= self.label_column: + tf.logging.warning('Incomplete line, ignored.') + continue + label = line[self.label_column] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class Yelp5Processor(DataProcessor): + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "train.csv")) + + def get_dev_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "test.csv")) + + def get_labels(self): + """See base class.""" + return ["1", "2", "3", "4", "5"] + + def _create_examples(self, input_file): + """Creates examples for the training and dev sets.""" + examples = [] + with tf.gfile.Open(input_file) as f: + reader = csv.reader(f) + for i, line in enumerate(reader): + + label = line[0] + text_a = line[1].replace('""', '"').replace('\\"', '"') + examples.append( + InputExample(guid=str(i), text_a=text_a, text_b=None, label=label)) + return examples + + +class ImdbProcessor(DataProcessor): + def get_labels(self): + return ["neg", "pos"] + + def get_train_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "train")) + + def get_dev_examples(self, data_dir): + return self._create_examples(os.path.join(data_dir, "test")) + + def _create_examples(self, data_dir): + examples = [] + for label in ["neg", "pos"]: + cur_dir = os.path.join(data_dir, label) + for filename in tf.gfile.ListDirectory(cur_dir): + if not filename.endswith("txt"): + continue + + path = os.path.join(cur_dir, filename) + with tf.gfile.Open(path) as f: + text = f.read().strip().replace("
", " ") + examples.append(InputExample( + guid="unused_id", text_a=text, text_b=None, label=label)) + return examples + + +class MnliMatchedProcessor(GLUEProcessor): + def __init__(self): + super(MnliMatchedProcessor, self).__init__() + self.dev_file = "dev_matched.tsv" + self.test_file = "test_matched.tsv" + self.label_column = -1 + self.text_a_column = 8 + self.text_b_column = 9 + + def get_labels(self): + return ["contradiction", "entailment", "neutral"] + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "train.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = line[0] + text_b = line[1] + label = line[2] + if label == "contradictory": + label = "contradiction" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_devtest_examples(self, data_dir, set_type="dev"): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, set_type+".tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = line[0] + if language != self.language: + continue + text_a = line[6] + text_b = line[7] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class CSCProcessor(DataProcessor): + def get_labels(self): + return ["0", "1"] + + def get_train_examples(self, data_dir): + set_type = "train" + input_file = os.path.join(data_dir, set_type + ".tsv") + tf.logging.info("using file %s" % input_file) + lines = self._read_tsv(input_file) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + + text_a = line[1] + label = line[0] + + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + def get_devtest_examples(self, data_dir, set_type="dev"): + input_file = os.path.join(data_dir, set_type + ".tsv") + tf.logging.info("using file %s" % input_file) + lines = self._read_tsv(input_file) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + + text_a = line[1] + label = line[0] + + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliMatchedProcessor): + def __init__(self): + super(MnliMismatchedProcessor, self).__init__() + self.dev_file = "dev_mismatched.tsv" + self.test_file = "test_mismatched.tsv" + + +class StsbProcessor(GLUEProcessor): + def __init__(self): + super(StsbProcessor, self).__init__() + self.label_column = 9 + self.text_a_column = 7 + self.text_b_column = 8 + + def get_labels(self): + return [0.0] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 and self.contains_header and set_type != "test": + continue + if i == 0 and self.test_contains_header and set_type == "test": + continue + guid = "%s-%s" % (set_type, i) + + a_column = (self.text_a_column if set_type != "test" else + self.test_text_a_column) + b_column = (self.text_b_column if set_type != "test" else + 
self.test_text_b_column) + + # there are some incomplete lines in QNLI + if len(line) <= a_column: + tf.logging.warning('Incomplete line, ignored.') + continue + text_a = line[a_column] + + if b_column is not None: + if len(line) <= b_column: + tf.logging.warning('Incomplete line, ignored.') + continue + text_b = line[b_column] + else: + text_b = None + + if set_type == "test": + label = self.get_labels()[0] + else: + if len(line) <= self.label_column: + tf.logging.warning('Incomplete line, ignored.') + continue + label = float(line[self.label_column]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + + return examples + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenize_fn, output_file, + num_passes=1): + """Convert a set of `InputExample`s to a TFRecord file.""" + print(len(examples)) + sys.stdout.flush() + # do not create duplicated records + if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data: + tf.logging.info("Do not overwrite tfrecord {} exists.".format(output_file)) + return + + tf.logging.info("Create new tfrecord {}.".format(output_file)) + + writer = tf.python_io.TFRecordWriter(output_file) + + if num_passes > 1: + examples *= num_passes + + print(len(examples)) + sys.stdout.flush() + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example {} of {}".format(ex_index, + len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenize_fn) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + def create_float_feature(values): + f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_float_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + if label_list is not None: + features["label_ids"] = create_int_feature([feature.label_id]) + else: + features["label_ids"] = create_float_feature([float(feature.label_id)]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.float32), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + if FLAGS.is_regression: + name_to_features["label_ids"] = tf.FixedLenFeature([], tf.float32) + + tf.logging.info("Input tfrecord file {}".format(input_file)) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. 
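+      # Note that `input_mask` is stored as float32 above (unlike the int
+      # mask used in the BERT scripts), so it is unaffected by this cast;
+      # only input_ids, segment_ids, label_ids and is_real_example are int64.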
+      for name in list(example.keys()):
+        t = example[name]
+        if t.dtype == tf.int64:
+          t = tf.cast(t, tf.int32)
+        example[name] = t
+
+      return example
+
+  def input_fn(params, input_context=None):
+    """The actual input function."""
+    if FLAGS.use_tpu:
+      batch_size = params["batch_size"]
+    elif is_training:
+      batch_size = FLAGS.train_batch_size
+    elif FLAGS.do_eval:
+      batch_size = FLAGS.eval_batch_size
+    else:
+      batch_size = FLAGS.predict_batch_size
+
+    d = tf.data.TFRecordDataset(input_file)
+    # Shard the dataset to different devices
+    if input_context is not None:
+      tf.logging.info("Input pipeline id %d out of %d",
+                      input_context.input_pipeline_id,
+                      input_context.num_replicas_in_sync)
+      d = d.shard(input_context.num_input_pipelines,
+                  input_context.input_pipeline_id)
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = d.shuffle(buffer_size=FLAGS.shuffle_buffer)
+      d = d.repeat()
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+def get_model_fn(n_class):
+  def model_fn(features, labels, mode, params):
+    #### Training or Evaluation
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    # Get loss from inputs
+    if FLAGS.is_regression:
+      (total_loss, per_example_loss, logits
+       ) = function_builder.get_regression_loss(FLAGS, features, is_training)
+    else:
+      (total_loss, per_example_loss, logits
+       ) = function_builder.get_classification_loss(
+          FLAGS, features, n_class, is_training)
+
+    # Check model parameters
+    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
+    tf.logging.info('#params: {}'.format(num_params))
+
+    # Load the pretrained model weights
+    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
+
+    # Evaluation mode
+    if mode == tf.estimator.ModeKeys.EVAL:
+      assert FLAGS.num_hosts == 1
+
+      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        eval_input_dict = {
+            'labels': label_ids,
+            'predictions': predictions,
+            'weights': is_real_example
+        }
+        accuracy = tf.metrics.accuracy(**eval_input_dict)
+
+        loss = tf.metrics.mean(values=per_example_loss,
+                               weights=is_real_example)
+        return {
+            'eval_accuracy': accuracy,
+            'eval_loss': loss}
+
+      def regression_metric_fn(
+              per_example_loss, label_ids, logits, is_real_example):
+        loss = tf.metrics.mean(values=per_example_loss,
+                               weights=is_real_example)
+        pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
+            logits, label_ids, weights=is_real_example)
+        return {'eval_loss': loss, 'eval_pearsonr': pearsonr}
+
+      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
+
+      # Constructing evaluation TPUEstimatorSpec with new cache.
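+      # `is_real_example` is 1.0 for genuine examples and 0.0 for the
+      # PaddingInputExamples appended in main() to fill the last eval batch,
+      # so padded rows contribute nothing to eval_accuracy or eval_loss.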
+      label_ids = tf.reshape(features['label_ids'], [-1])
+
+      if FLAGS.is_regression:
+        metric_fn = regression_metric_fn
+      metric_args = [per_example_loss, label_ids, logits, is_real_example]
+
+      if FLAGS.use_tpu:
+        eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
+            mode=mode,
+            loss=total_loss,
+            eval_metrics=(metric_fn, metric_args),
+            scaffold_fn=scaffold_fn)
+      else:
+        eval_spec = tf.estimator.EstimatorSpec(
+            mode=mode,
+            loss=total_loss,
+            eval_metric_ops=metric_fn(*metric_args))
+
+      return eval_spec
+
+    elif mode == tf.estimator.ModeKeys.PREDICT:
+      label_ids = tf.reshape(features["label_ids"], [-1])
+
+      predictions = {
+          "logits": logits,
+          "labels": label_ids,
+          "is_real": features["is_real_example"]
+      }
+
+      if FLAGS.use_tpu:
+        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+      else:
+        output_spec = tf.estimator.EstimatorSpec(
+            mode=mode, predictions=predictions)
+      return output_spec
+
+    # Configuring the optimizer
+    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
+
+    monitor_dict = {}
+    monitor_dict["lr"] = learning_rate
+
+    # Constructing training TPUEstimatorSpec with new cache.
+    if FLAGS.use_tpu:
+      # Creating host calls
+      if not FLAGS.is_regression:
+        label_ids = tf.reshape(features['label_ids'], [-1])
+        predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype)
+        is_correct = tf.equal(predictions, label_ids)
+        accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
+
+        monitor_dict["accuracy"] = accuracy
+
+        host_call = function_builder.construct_scalar_host_call(
+            monitor_dict=monitor_dict,
+            model_dir=FLAGS.model_dir,
+            prefix="train/",
+            reduce_fn=tf.reduce_mean)
+      else:
+        host_call = None
+
+      train_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op, host_call=host_call,
+          scaffold_fn=scaffold_fn)
+    else:
+      train_spec = tf.estimator.EstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op)
+
+    return train_spec
+
+  return model_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  # Validate flags
+  if FLAGS.save_steps is not None:
+    FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)
+
+  if FLAGS.do_predict:
+    predict_dir = FLAGS.predict_dir
+    if not tf.gfile.Exists(predict_dir):
+      tf.gfile.MakeDirs(predict_dir)
+
+  processors = {
+      "mnli_matched": MnliMatchedProcessor,
+      "mnli_mismatched": MnliMismatchedProcessor,
+      "sts-b": StsbProcessor,
+      "imdb": ImdbProcessor,
+      "yelp5": Yelp5Processor,
+      "xnli": XnliProcessor,
+      "csc": CSCProcessor,
+      "tnews": TnewsProcessor,
+      "inews": InewsProcessor,
+      "lcqmc_pair": LCQMCProcessor,
+      "lcqmc": LCQMCProcessor,
+      "bq": BQProcessor,
+      "thucnews": THUCNewsProcessor,
+      "iflydata": iFLYTEKDataProcessor
+  }
+
+  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train`, `do_eval` or `do_predict` "
+        "must be True.")
+
+  if not tf.gfile.Exists(FLAGS.output_dir):
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+  label_list = processor.get_labels() if not FLAGS.is_regression else None
+
+  sp = spm.SentencePieceProcessor()
+  sp.Load(FLAGS.spiece_model_file)
+
+  def tokenize_fn(text):
+    text = preprocess_text(text, lower=FLAGS.uncased)
+    return encode_ids(sp, text)
+
+  run_config = model_utils.configure_tpu(FLAGS)
+
+  model_fn =
get_model_fn(len(label_list) if label_list is not None else None) + + spm_basename = os.path.basename(FLAGS.spiece_model_file) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. + if FLAGS.use_tpu: + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size, + eval_batch_size=FLAGS.eval_batch_size) + else: + estimator = tf.estimator.Estimator( + model_fn=model_fn, + config=run_config) + + if FLAGS.do_train: + train_file_base = "{}.len-{}.train.tf_record".format( + spm_basename, FLAGS.max_seq_length) + train_file = os.path.join(FLAGS.output_dir, train_file_base) + tf.logging.info("Use tfrecord file {}".format(train_file)) + + train_examples = processor.get_train_examples(FLAGS.data_dir) + np.random.shuffle(train_examples) + tf.logging.info("Num of train samples: {}".format(len(train_examples))) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + train_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + train_file, FLAGS.num_passes) + else: + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + train_file, FLAGS.num_passes) + + # here we use epoch number to calculate total train_steps + train_steps = int(len(train_examples) * FLAGS.num_train_epochs / FLAGS.train_batch_size) + FLAGS.warmup_steps = int(0.1 * train_steps) + + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + + estimator.train(input_fn=train_input_fn, max_steps=train_steps) + + if FLAGS.do_eval or FLAGS.do_predict: + eval_examples = processor.get_devtest_examples(FLAGS.data_dir, FLAGS.eval_split) + tf.logging.info("Num of eval samples: {}".format(len(eval_examples))) + + if FLAGS.do_eval: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + # + # Modified in XL: We also adopt the same mechanism for GPUs. 
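+      # For example (illustrative numbers only): with eval_batch_size=128 and
+      # 1,000 dev examples, 24 PaddingInputExamples are appended so that
+      # 1,024 = 8 * 128 examples form exactly 8 full batches; the 24 padded
+      # rows carry is_real_example=0 and receive zero weight in the metrics.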
+ while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file_base = "{}.len-{}.{}.eval.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) + eval_file = os.path.join(FLAGS.output_dir, eval_file_base) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=True) + + # Filter out all checkpoints in the directory + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.model_dir) + + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = join(FLAGS.model_dir, ckpt_name) + global_step = int(cur_filename.split("-")[-1]) + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files.append([global_step, cur_filename]) + steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + + # Decide whether to evaluate all ckpts + if not FLAGS.eval_all_ckpt: + steps_and_files = steps_and_files[-1:] + + eval_results = [] + output_eval_file = os.path.join(FLAGS.data_dir, "dev_results_bert.txt") + print("output_eval_file:", output_eval_file) + tf.logging.info("output_eval_file:" + output_eval_file) + with tf.gfile.GFile(output_eval_file, "w") as writer: + for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): + ret = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=filename) + + ret["step"] = global_step + ret["path"] = filename + + eval_results.append(ret) + + tf.logging.info("=" * 80) + log_str = "Eval result | " + for key, val in sorted(ret.items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + writer.write("%s = %s\n" % (key, val)) + tf.logging.info(log_str) + + key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy" + eval_results.sort(key=lambda x: x[key_name], reverse=True) + + tf.logging.info("=" * 80) + log_str = "Best result | " + for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + tf.logging.info(log_str) +### evalation testset +##################################################################################### + eval_examples = processor.get_test_examples(FLAGS.data_dir) + tf.logging.info("Num of eval samples: {}".format(len(eval_examples))) + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file_base = "{}.len-{}.{}.test.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) + eval_file = os.path.join(FLAGS.output_dir, eval_file_base) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + 
seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=True) + + # Filter out all checkpoints in the directory + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.model_dir) + + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = join(FLAGS.model_dir, ckpt_name) + global_step = int(cur_filename.split("-")[-1]) + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files.append([global_step, cur_filename]) + steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + + # Decide whether to evaluate all ckpts + if not FLAGS.eval_all_ckpt: + steps_and_files = steps_and_files[-1:] + + eval_results = [] + output_eval_file = os.path.join(FLAGS.data_dir, "test_results_bert.txt") + print("output_eval_file:", output_eval_file) + tf.logging.info("output_eval_file:" + output_eval_file) + with tf.gfile.GFile(output_eval_file, "w") as writer: + for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): + ret = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=filename) + + ret["step"] = global_step + ret["path"] = filename + + eval_results.append(ret) + + tf.logging.info("=" * 80) + log_str = "Eval result | " + for key, val in sorted(ret.items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + writer.write("%s = %s\n" % (key, val)) + tf.logging.info(log_str) + + key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy" + eval_results.sort(key=lambda x: x[key_name], reverse=True) + + tf.logging.info("=" * 80) + log_str = "Best result | " + for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + tf.logging.info(log_str) + + if FLAGS.do_predict: + eval_examples = processor.get_test_examples(FLAGS.data_dir) + eval_file_base = "{}.len-{}.{}.predict.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) + eval_file = os.path.join(FLAGS.output_dir, eval_file_base) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + + pred_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False) + + predict_results = [] + with tf.gfile.Open(os.path.join(predict_dir, "{}.tsv".format( + task_name)), "w") as fout: + fout.write("index\tprediction\n") + + for pred_cnt, result in enumerate(estimator.predict( + input_fn=pred_input_fn, + yield_single_examples=True, + checkpoint_path=FLAGS.predict_ckpt)): + if pred_cnt % 1000 == 0: + tf.logging.info("Predicting submission for example: {}".format( + pred_cnt)) + + logits = [float(x) for x in result["logits"].flat] + predict_results.append(logits) + + if len(logits) == 1: + label_out = logits[0] + elif len(logits) == 2: + if logits[1] - logits[0] > FLAGS.predict_threshold: + label_out = label_list[1] + else: + label_out = label_list[0] + elif len(logits) > 2: + max_index = np.argmax(np.array(logits, dtype=np.float32)) + label_out = label_list[max_index] + else: + raise NotImplementedError + + fout.write("{}\t{}\n".format(pred_cnt, label_out)) + + predict_json_path = os.path.join(predict_dir, "{}.logits.json".format( + task_name)) + + with tf.gfile.Open(predict_json_path, "w") as fp: + json.dump(predict_results, fp, 
indent=4) + + +if __name__ == "__main__": + tf.app.run() diff --git a/baselines/models/xlnet/run_classifier_bq.sh b/baselines/models/xlnet/run_classifier_bq.sh new file mode 100644 index 0000000..bb4eb66 --- /dev/null +++ b/baselines/models/xlnet/run_classifier_bq.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:58:56 + +TASK_NAME="bq" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/bq.zip + unzip bq.zip + rm bq.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_classifier_iflydata.sh b/baselines/models/xlnet/run_classifier_iflydata.sh new file mode 100644 index 0000000..edf73d3 --- /dev/null +++ b/baselines/models/xlnet/run_classifier_iflydata.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:59:06 + +TASK_NAME="iflydata" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! 
-d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/iflytek.zip + unzip iflytek.zip + rm iflytek.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_classifier_inews.sh b/baselines/models/xlnet/run_classifier_inews.sh new file mode 100644 index 0000000..06f4b7a --- /dev/null +++ b/baselines/models/xlnet/run_classifier_inews.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:59:11 + +TASK_NAME="inews" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/inews.zip + unzip inews.zip + rm inews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
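+# Train, then evaluate on the dev split with the last saved checkpoint
+# (eval_all_ckpt=False). With save_steps=1000 a checkpoint is written every
+# 1,000 steps; TF records and checkpoints both go to ${TASK_NAME}_output/.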
+python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_classifier_lcqmc.sh b/baselines/models/xlnet/run_classifier_lcqmc.sh new file mode 100644 index 0000000..a3e0072 --- /dev/null +++ b/baselines/models/xlnet/run_classifier_lcqmc.sh @@ -0,0 +1,74 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:59:17 + +TASK_NAME="lcqmc" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +echo "Please try again if the data is not downloaded successfully." +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/train.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/dev.txt +wget -c https://raw.githubusercontent.com/pengming617/text_matching/master/data/test.txt +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
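+# Note: run_classifier.py derives the schedule from the epoch count:
+# train_steps = len(train_examples) * num_train_epochs / train_batch_size,
+# with warmup_steps set to 10% of that, so --num_train_epochs=3 and
+# --train_batch_size=32 below determine the number of steps for this data set.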
+python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_classifier_thucnews.py b/baselines/models/xlnet/run_classifier_thucnews.py new file mode 100644 index 0000000..5d328b1 --- /dev/null +++ b/baselines/models/xlnet/run_classifier_thucnews.py @@ -0,0 +1,1380 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from os.path import join +# from absl import flags +import os +import sys +import csv +import collections +import numpy as np +import time +import math +import json +import random +from copy import copy +from collections import defaultdict as dd + +# import absl.logging as _logging # pylint: disable=unused-import +import tensorflow as tf + +import sentencepiece as spm + +from data_utils import SEP_ID, VOCAB_SIZE, CLS_ID +import model_utils +import function_builder +from classifier_utils import PaddingInputExample +from classifier_utils import convert_single_example +from prepro_utils import preprocess_text, encode_ids + +FLAGS = tf.flags.FLAGS + +# Model +tf.flags.DEFINE_string("model_config_path",None, + "Model config path.") +tf.flags.DEFINE_float("dropout",0.1, + "Dropout rate.") +tf.flags.DEFINE_float("dropatt",0.1, + "Attention dropout rate.") +tf.flags.DEFINE_integer("clamp_len", -1, + "Clamp length") +tf.flags.DEFINE_string("summary_type", "last", + "Method used to summarize a sequence into a compact vector.") +tf.flags.DEFINE_bool("use_summ_proj", True, + "Whether to use projection for summarizing sequences.") +tf.flags.DEFINE_bool("use_bfloat16", False, + "Whether to use bfloat16.") + +# Parameter initialization +# tf.flags.DEFINE_enum("init","normal", +# enum_values=["normal", "uniform"], +# help="Initialization method.") +tf.flags.DEFINE_string("init", "normal", + "Initialization method ,either normal or uniform. ") +tf.flags.DEFINE_float("init_std", 0.02, + "Initialization std when init is normal.") +tf.flags.DEFINE_float("init_range",0.1, + "Initialization std when init is uniform.") + +# I/O paths +tf.flags.DEFINE_bool("overwrite_data", False, + "If False, will use cached data if available.") +tf.flags.DEFINE_string("init_checkpoint", None, + "checkpoint path for initializing the model. " + "Could be a pretrained model or a finetuned model.") +tf.flags.DEFINE_string("output_dir", "", + "Output dir for TF records.") +tf.flags.DEFINE_string("spiece_model_file", "", + "Sentence Piece model path.") +tf.flags.DEFINE_string("model_dir","", + "Directory for saving the finetuned model.") +tf.flags.DEFINE_string("data_dir", "", + "Directory for input data.") + +# TPUs and machines +tf.flags.DEFINE_bool("use_tpu",False, "whether to use TPU.") +tf.flags.DEFINE_integer("num_hosts", 1, "How many TPU hosts.") +tf.flags.DEFINE_integer("num_core_per_host", 8, + "8 for TPU v2 and v3-8, 16 for larger TPU v3 pod. 
In the context " + "of GPU training, it refers to the number of GPUs used.") +tf.flags.DEFINE_string("tpu_job_name",None, "TPU worker job name.") +tf.flags.DEFINE_string("tpu", None, "TPU name.") +tf.flags.DEFINE_string("tpu_zone", None,"TPU zone.") +tf.flags.DEFINE_string("gcp_project", None, "gcp project.") +tf.flags.DEFINE_string("master", None, "master") +tf.flags.DEFINE_integer("iterations", 1000, + "number of iterations per TPU training loop.") + +# training +tf.flags.DEFINE_bool("do_train", False, "whether to do training") +tf.flags.DEFINE_integer("train_steps", 1000, + "Number of training steps") +tf.flags.DEFINE_integer("num_train_epochs", 0, + "Number of training steps") +tf.flags.DEFINE_integer("warmup_steps", 0, "number of warmup steps") +tf.flags.DEFINE_float("learning_rate", 1e-5, "initial learning rate") +tf.flags.DEFINE_float("lr_layer_decay_rate", 1.0, + "Top layer: lr[L] = FLAGS.learning_rate." + "Low layer: lr[l-1] = lr[l] * lr_layer_decay_rate.") +tf.flags.DEFINE_float("min_lr_ratio", 0.0, + "min lr ratio for cos decay.") +tf.flags.DEFINE_float("clip", 1.0,"Gradient clipping") +tf.flags.DEFINE_integer("max_save", 0, + "Max number of checkpoints to save. Use 0 to save all.") +tf.flags.DEFINE_integer("save_steps", None, + "Save the model for every save_steps. " + "If None, not to save any model.") +tf.flags.DEFINE_integer("train_batch_size", 8, + "Batch size for training") +tf.flags.DEFINE_float("weight_decay", 0.00, "Weight decay rate") +tf.flags.DEFINE_float("adam_epsilon", 1e-8, "Adam epsilon") +tf.flags.DEFINE_string("decay_method", "poly", "poly or cos") + +# evaluation +tf.flags.DEFINE_bool("do_eval", False, "whether to do eval") +tf.flags.DEFINE_bool("do_predict", False, "whether to do prediction") +tf.flags.DEFINE_float("predict_threshold", 0, + "Threshold for binary prediction.") +tf.flags.DEFINE_string("eval_split", "dev", "could be dev or test") +tf.flags.DEFINE_integer("eval_batch_size", 128, + "batch size for evaluation") +tf.flags.DEFINE_integer("predict_batch_size", 128, + "batch size for prediction.") +tf.flags.DEFINE_string("predict_dir", None, + "Dir for saving prediction files.") +tf.flags.DEFINE_bool("eval_all_ckpt", False, + "Eval all ckpts. If False, only evaluate the last one.") +tf.flags.DEFINE_string("predict_ckpt", None, + "Ckpt path for do_predict. If None, use the last one.") + +# task specific +tf.flags.DEFINE_string("task_name", None,"Task name") +tf.flags.DEFINE_integer("max_seq_length", 128, "Max sequence length") +tf.flags.DEFINE_integer("shuffle_buffer", 2048, + "Buffer size used for shuffle.") +tf.flags.DEFINE_integer("num_passes", 1, + "Num passes for processing training data. " + "This is use to batch data without loss for TPUs.") +tf.flags.DEFINE_bool("uncased", False, + "Use uncased.") +tf.flags.DEFINE_string("cls_scope", None, + "Classifier layer scope.") +tf.flags.DEFINE_bool("is_regression", False, + "Whether it's a regression task.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. 
This should be
+        specified for train and dev examples, but not for test examples.
+    """
+    self.guid = guid
+    self.text_a = text_a
+    self.text_b = text_b
+    self.label = label
+
+
+class DataProcessor(object):
+  """Base class for data converters for sequence classification data sets."""
+
+  def get_train_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the train set."""
+    raise NotImplementedError()
+
+  def get_dev_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for the dev set."""
+    raise NotImplementedError()
+
+  def get_test_examples(self, data_dir):
+    """Gets a collection of `InputExample`s for prediction."""
+    raise NotImplementedError()
+
+  def get_labels(self):
+    """Gets the list of labels for this data set."""
+    raise NotImplementedError()
+
+  @classmethod
+  def _read_tsv(cls, input_file, quotechar=None):
+    """Reads a tab separated value file."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+      lines = []
+      for line in reader:
+        if len(line) == 0:
+          continue
+        lines.append(line)
+      return lines
+
+  @classmethod
+  def _read_txt(cls, input_file):
+    """Reads a text file whose fields are separated by '_!_'."""
+    with tf.gfile.Open(input_file, "r") as f:
+      reader = f.readlines()
+      lines = []
+      for line in reader:
+        lines.append(line.strip().split("_!_"))
+      return lines
+
+
+class InewsProcessor(DataProcessor):
+  """Processor for the iNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = ["0", "1", "2"]
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      # The `tokenization` module is not imported in this XLNet script, so
+      # the raw strings are used directly (the fields are already unicode).
+      text_a = line[2]
+      text_b = line[3]
+      if set_type == "test":
+        label = "0"
+      else:
+        label = line[0]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+def convert_single_example_for_inews(ex_index, tokens_a, tokens_b, label_map,
+                                     max_seq_length, tokenizer, example):
+  if tokens_b:
+    # Modifies `tokens_a` and `tokens_b` in place so that the total
+    # length is less than the specified length.
+    # Account for [CLS], [SEP], [SEP] with "- 3"
+    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+  else:
+    # Account for [CLS] and [SEP] with "- 2"
+    if len(tokens_a) > max_seq_length - 2:
+      tokens_a = tokens_a[0:(max_seq_length - 2)]
+
+  # The convention in BERT is:
+  # (a) For sequence pairs:
+  #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+  #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
+  # (b) For single sequences:
+  #  tokens:   [CLS] the dog is hairy . [SEP]
+  #  type_ids: 0     0   0   0  0     0 0
+  #
+  # Where "type_ids" are used to indicate whether this is the first
+  # sequence or the second sequence.
+  # The embedding vectors for `type=0` and `type=1` were learned during
+  # pre-training and are added to the wordpiece embedding vector
+  # (and position vector). This is not *strictly* necessary since
+  # the [SEP] token unambiguously separates the sequences, but it makes
+  # it easier for the model to learn the concept of sequences.
+  #
+  # For classification tasks, the first vector (corresponding to [CLS]) is
+  # used as the "sentence vector". Note that this only makes sense because
+  # the entire model is fine-tuned.
+  tokens = []
+  segment_ids = []
+  tokens.append("[CLS]")
+  segment_ids.append(0)
+  for token in tokens_a:
+    tokens.append(token)
+    segment_ids.append(0)
+  tokens.append("[SEP]")
+  segment_ids.append(0)
+
+  if tokens_b:
+    for token in tokens_b:
+      tokens.append(token)
+      segment_ids.append(1)
+    tokens.append("[SEP]")
+    segment_ids.append(1)
+
+  input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+  # The mask has 1 for real tokens and 0 for padding tokens. Only real
+  # tokens are attended to.
+  input_mask = [1] * len(input_ids)
+
+  # Zero-pad up to the sequence length.
+  while len(input_ids) < max_seq_length:
+    input_ids.append(0)
+    input_mask.append(0)
+    segment_ids.append(0)
+
+  assert len(input_ids) == max_seq_length
+  assert len(input_mask) == max_seq_length
+  assert len(segment_ids) == max_seq_length
+
+  label_id = label_map[example.label]
+  if ex_index < 5:
+    tf.logging.info("*** Example ***")
+    tf.logging.info("guid: %s" % (example.guid))
+    tf.logging.info("tokens: %s" % " ".join(
+        [tokenization.printable_text(x) for x in tokens]))
+    tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+    tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
+    tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+    tf.logging.info("label: %s (id = %d)" % (example.label, label_id))
+
+  feature = InputFeatures(
+      input_ids=input_ids,
+      input_mask=input_mask,
+      segment_ids=segment_ids,
+      label_id=label_id,
+      is_real_example=True)
+
+  return feature
+
+
+def convert_example_list_for_inews(ex_index, example, label_list,
+                                   max_seq_length, tokenizer):
+  """Converts a single `InputExample` into a list of `InputFeatures`.
+
+  A long second segment is split into several windows, so one example may
+  yield more than one feature. Note that `tokenizer` must be a BERT-style
+  object exposing `tokenize` and `convert_tokens_to_ids`.
+  """
+
+  if isinstance(example, PaddingInputExample):
+    return [InputFeatures(
+        input_ids=[0] * max_seq_length,
+        input_mask=[0] * max_seq_length,
+        segment_ids=[0] * max_seq_length,
+        label_id=0,
+        is_real_example=False)]
+
+  label_map = {}
+  for (i, label) in enumerate(label_list):
+    label_map[label] = i
+
+  tokens_a = tokenizer.tokenize(example.text_a)
+  tokens_b = None
+  if example.text_b:
+    tokens_b = tokenizer.tokenize(example.text_b)
+  must_len = len(tokens_a) + 3
+  extra_len = max_seq_length - must_len
+  feature_list = []
+  if example.text_b and extra_len > 0:
+    extra_num = int((len(tokens_b) - 1) / extra_len) + 1
+    for num in range(extra_num):
+      max_len = min((num + 1) * extra_len, len(tokens_b))
+      tokens_b_sub = tokens_b[num * extra_len: max_len]
+      feature = convert_single_example_for_inews(
+          ex_index, tokens_a, tokens_b_sub, label_map, max_seq_length,
+          tokenizer, example)
+      feature_list.append(feature)
+  else:
+    feature = convert_single_example_for_inews(
+        ex_index, tokens_a, tokens_b, label_map, max_seq_length,
+        tokenizer, example)
+    feature_list.append(feature)
+  return feature_list
+
+
+def file_based_convert_examples_to_features_for_inews(
+    examples, label_list, max_seq_length, tokenizer, output_file,
+    num_passes=1):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+  # Repeat the data num_passes times, mirroring
+  # file_based_convert_examples_to_features below; this also makes the
+  # call sites that pass FLAGS.num_passes valid.
+  if num_passes > 1:
+    examples *= num_passes
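+  # Each call to convert_example_list_for_inews below may return several
+  # features, because a long text_b is windowed. Worked example (the
+  # numbers are illustrative only): with max_seq_length=128 and
+  # len(tokens_a)=40, must_len is 43 and extra_len is 85, so a 300-token
+  # text_b yields int(299 / 85) + 1 = 4 features covering tokens_b[0:85],
+  # [85:170], [170:255] and [255:300], each paired with the same tokens_a
+  # and label.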
+  num_example = 0
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 1000 == 0:
+      tf.logging.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+    feature_list = convert_example_list_for_inews(ex_index, example, label_list,
+                                                  max_seq_length, tokenizer)
+    num_example += len(feature_list)
+
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    # Write one record per feature, so a windowed example contributes
+    # several records.
+    for feature in feature_list:
+      features["input_ids"] = create_int_feature(feature.input_ids)
+      features["input_mask"] = create_int_feature(feature.input_mask)
+      features["segment_ids"] = create_int_feature(feature.segment_ids)
+      features["label_ids"] = create_int_feature([feature.label_id])
+      features["is_real_example"] = create_int_feature(
+          [int(feature.is_real_example)])
+
+      tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+      writer.write(tf_example.SerializeToString())
+  tf.logging.info("feature num: %s", num_example)
+  writer.close()
+
+
+class TnewsProcessor(DataProcessor):
+  """Processor for the TNEWS (Toutiao news) data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    # Label codes are "100".."116", with "105" and "111" unused.
+    labels = []
+    for i in range(17):
+      if i == 5 or i == 11:
+        continue
+      labels.append(str(100 + i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[3]
+      text_b = None
+      if set_type == "test":
+        label = "0"
+      else:
+        label = line[1]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class THUCNewsProcessor(DataProcessor):
+  """Processor for the THUCNews data set."""
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    labels = []
+    for i in range(14):
+      labels.append(str(i))
+    return labels
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      # Skip the header row and lines too short to hold the text field
+      # (the text is read from index 3, so at least 4 fields are needed).
+      if i == 0 or len(line) < 4:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      text_a = line[3]
+      text_b = None
+      label = line[0]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
+
+
+class LCQMCProcessor(DataProcessor):
+  """Processor for the LCQMC sentence-pair classification data set."""
+
+  def __init__(self):
+    self.language = "zh"
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+
+  def get_devtest_examples(self, data_dir, set_type="dev"):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "dev.txt")), set_type)
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    tf.logging.info("length of lines: %d", len(lines))
+    for (i, line) in enumerate(lines):
+      if i == 0:
+        continue
+      guid = "%s-%s" % (set_type, i)
+      try:
+        label = tokenization.convert_to_unicode(line[2])
+        text_a = tokenization.convert_to_unicode(line[0])
+        text_b = tokenization.convert_to_unicode(line[1])
+        examples.append(
+            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+      except Exception:
+        tf.logging.warning("Skipping malformed line %d: %s", i, line)
+    return examples
+
+
+class GLUEProcessor(DataProcessor):
+  def __init__(self):
+    self.train_file = "train.tsv"
+    self.dev_file = "dev.tsv"
+    self.test_file = "test.tsv"
+    self.label_column = None
+    self.text_a_column = None
+    self.text_b_column = None
+    self.contains_header = True
+    self.test_text_a_column = None
+    self.test_text_b_column = None
+    self.test_contains_header = True
+
+  def get_train_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, self.train_file)), "train")
+
+  def get_dev_examples(self, data_dir):
+    """See base class."""
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, self.dev_file)), "dev")
+
+  def get_test_examples(self, data_dir):
+    """See base class."""
+    if self.test_text_a_column is None:
+      self.test_text_a_column = self.text_a_column
+    if self.test_text_b_column is None:
+      self.test_text_b_column = self.text_b_column
+
+    return self._create_examples(
+        self._read_tsv(os.path.join(data_dir, self.test_file)), "test")
+
+  def get_labels(self):
+    """See base class."""
+    return ["0", "1"]
+
+  def _create_examples(self, lines, set_type):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    for (i, line) in enumerate(lines):
+      if i == 0 and self.contains_header and set_type != "test":
+        continue
+      if i == 0 and self.test_contains_header and set_type == "test":
+        continue
+      guid = "%s-%s" % (set_type, i)
+
+      a_column = (self.text_a_column if set_type != "test" else
+                  self.test_text_a_column)
+      b_column = (self.text_b_column if set_type != "test" else
+                  self.test_text_b_column)
+
+      # there are some incomplete lines in QNLI
+      if len(line) <= a_column:
+        tf.logging.warning('Incomplete line, ignored.')
+        continue
+      text_a = line[a_column]
+
+      if b_column is not None:
+        if len(line) <= b_column:
+          tf.logging.warning('Incomplete line, ignored.')
+          continue
+        text_b = line[b_column]
+      else:
+        text_b = None
+
+      if set_type == "test":
+        label = self.get_labels()[0]
+      else:
+        if len(line) <= self.label_column:
+          tf.logging.warning('Incomplete line, ignored.')
+          continue
+        label = line[self.label_column]
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+    return examples
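+
+
+# Illustrative sketch (not wired into the task registry in main() below):
+# a new tab-separated task only needs a GLUEProcessor subclass that sets
+# the column indices. The task and file layout here are hypothetical.
+class PawsLikeProcessor(GLUEProcessor):
+  def __init__(self):
+    super(PawsLikeProcessor, self).__init__()
+    self.label_column = 0   # gold label in the first column
+    self.text_a_column = 1  # first sentence
+    self.text_b_column = 2  # second sentence
+
+  def get_labels(self):
+    return ["0", "1"]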
+
+
+class Yelp5Processor(DataProcessor):
+  def get_train_examples(self, data_dir):
+    return self._create_examples(os.path.join(data_dir, "train.csv"))
+
+  def get_dev_examples(self, data_dir):
+    return self._create_examples(os.path.join(data_dir, "test.csv"))
+
+  def get_labels(self):
+    """See base class."""
+    return ["1", "2", "3", "4", "5"]
+
+  def _create_examples(self, input_file):
+    """Creates examples for the training and dev sets."""
+    examples = []
+    with tf.gfile.Open(input_file) as f:
+      reader = csv.reader(f)
+      for i, line in enumerate(reader):
+        label = line[0]
+        text_a = line[1].replace('""', '"').replace('\\"', '"')
+        examples.append(
+            InputExample(guid=str(i), text_a=text_a, text_b=None, label=label))
+    return examples
+
+
+class ImdbProcessor(DataProcessor):
+  def get_labels(self):
+    return ["neg", "pos"]
+
+  def get_train_examples(self, data_dir):
+    return self._create_examples(os.path.join(data_dir, "train"))
+
+  def get_dev_examples(self, data_dir):
+    return self._create_examples(os.path.join(data_dir, "test"))
+
+  def _create_examples(self, data_dir):
+    examples = []
+    for label in ["neg", "pos"]:
+      cur_dir = os.path.join(data_dir, label)
+      for filename in tf.gfile.ListDirectory(cur_dir):
+        if not filename.endswith("txt"):
+          continue
+
+        path = os.path.join(cur_dir, filename)
+        with tf.gfile.Open(path) as f:
+          text = f.read().strip().replace("\n", " ")
+        examples.append(InputExample(
+            guid="unused_id", text_a=text, text_b=None, label=label))
+    return examples
", " ") + examples.append(InputExample( + guid="unused_id", text_a=text, text_b=None, label=label)) + return examples + + +class MnliMatchedProcessor(GLUEProcessor): + def __init__(self): + super(MnliMatchedProcessor, self).__init__() + self.dev_file = "dev_matched.tsv" + self.test_file = "test_matched.tsv" + self.label_column = -1 + self.text_a_column = 8 + self.text_b_column = 9 + + def get_labels(self): + return ["contradiction", "entailment", "neutral"] + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "train.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = line[0] + text_b = line[1] + label = line[2] + if label == "contradictory": + label = "contradiction" + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_devtest_examples(self, data_dir, set_type="dev"): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, set_type+".tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = line[0] + if language != self.language: + continue + text_a = line[6] + text_b = line[7] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class CSCProcessor(DataProcessor): + def get_labels(self): + return ["0", "1"] + + def get_train_examples(self, data_dir): + set_type = "train" + input_file = os.path.join(data_dir, set_type + ".tsv") + tf.logging.info("using file %s" % input_file) + lines = self._read_tsv(input_file) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + + text_a = line[1] + label = line[0] + + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + def get_devtest_examples(self, data_dir, set_type="dev"): + input_file = os.path.join(data_dir, set_type + ".tsv") + tf.logging.info("using file %s" % input_file) + lines = self._read_tsv(input_file) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + + text_a = line[1] + label = line[0] + + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class MnliMismatchedProcessor(MnliMatchedProcessor): + def __init__(self): + super(MnliMismatchedProcessor, self).__init__() + self.dev_file = "dev_mismatched.tsv" + self.test_file = "test_mismatched.tsv" + + +class StsbProcessor(GLUEProcessor): + def __init__(self): + super(StsbProcessor, self).__init__() + self.label_column = 9 + self.text_a_column = 7 + self.text_b_column = 8 + + def get_labels(self): + return [0.0] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0 and self.contains_header and set_type != "test": + continue + if i == 0 and self.test_contains_header and set_type == "test": + continue + guid = "%s-%s" % (set_type, i) + + a_column = (self.text_a_column if set_type != "test" else + self.test_text_a_column) + b_column = (self.text_b_column if set_type != "test" else + 
+      # there are some incomplete lines in QNLI
+      if len(line) <= a_column:
+        tf.logging.warning('Incomplete line, ignored.')
+        continue
+      text_a = line[a_column]
+
+      if b_column is not None:
+        if len(line) <= b_column:
+          tf.logging.warning('Incomplete line, ignored.')
+          continue
+        text_b = line[b_column]
+      else:
+        text_b = None
+
+      if set_type == "test":
+        label = self.get_labels()[0]
+      else:
+        if len(line) <= self.label_column:
+          tf.logging.warning('Incomplete line, ignored.')
+          continue
+        label = float(line[self.label_column])
+      examples.append(
+          InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+
+    return examples
+
+
+def file_based_convert_examples_to_features(
+    examples, label_list, max_seq_length, tokenize_fn, output_file,
+    num_passes=1):
+  """Convert a set of `InputExample`s to a TFRecord file."""
+  tf.logging.info("Number of examples: %d", len(examples))
+  # do not create duplicated records
+  if tf.gfile.Exists(output_file) and not FLAGS.overwrite_data:
+    tf.logging.info("Tfrecord {} exists; will not overwrite.".format(output_file))
+    return
+
+  tf.logging.info("Create new tfrecord {}.".format(output_file))
+
+  writer = tf.python_io.TFRecordWriter(output_file)
+
+  if num_passes > 1:
+    examples *= num_passes
+
+  tf.logging.info("Number of examples after num_passes: %d", len(examples))
+  for (ex_index, example) in enumerate(examples):
+    if ex_index % 10000 == 0:
+      tf.logging.info("Writing example {} of {}".format(ex_index,
+                                                        len(examples)))
+
+    feature = convert_single_example(ex_index, example, label_list,
+                                     max_seq_length, tokenize_fn)
+
+    def create_int_feature(values):
+      f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))
+      return f
+
+    def create_float_feature(values):
+      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_float_feature(feature.input_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+    if label_list is not None:
+      features["label_ids"] = create_int_feature([feature.label_id])
+    else:
+      features["label_ids"] = create_float_feature([float(feature.label_id)])
+    features["is_real_example"] = create_int_feature(
+        [int(feature.is_real_example)])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    writer.write(tf_example.SerializeToString())
+  writer.close()
+
+
+def file_based_input_fn_builder(input_file, seq_length, is_training,
+                                drop_remainder):
+  """Creates an `input_fn` closure to be passed to TPUEstimator."""
+
+  name_to_features = {
+      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "input_mask": tf.FixedLenFeature([seq_length], tf.float32),
+      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
+      "label_ids": tf.FixedLenFeature([], tf.int64),
+      "is_real_example": tf.FixedLenFeature([], tf.int64),
+  }
+  if FLAGS.is_regression:
+    name_to_features["label_ids"] = tf.FixedLenFeature([], tf.float32)
+
+  tf.logging.info("Input tfrecord file {}".format(input_file))
+
+  def _decode_record(record, name_to_features):
+    """Decodes a record to a TensorFlow example."""
+    example = tf.parse_single_example(record, name_to_features)
+
+    # tf.Example only supports tf.int64, but the TPU only supports tf.int32.
+    # So cast all int64 to int32.
+    for name in list(example.keys()):
+      t = example[name]
+      if t.dtype == tf.int64:
+        t = tf.cast(t, tf.int32)
+      example[name] = t
+
+    return example
+
+  def input_fn(params, input_context=None):
+    """The actual input function."""
+    if FLAGS.use_tpu:
+      batch_size = params["batch_size"]
+    elif is_training:
+      batch_size = FLAGS.train_batch_size
+    elif FLAGS.do_eval:
+      batch_size = FLAGS.eval_batch_size
+    else:
+      batch_size = FLAGS.predict_batch_size
+
+    d = tf.data.TFRecordDataset(input_file)
+    # Shard the dataset across different devices
+    if input_context is not None:
+      tf.logging.info("Input pipeline id %d out of %d",
+                      input_context.input_pipeline_id,
+                      input_context.num_replicas_in_sync)
+      d = d.shard(input_context.num_input_pipelines,
+                  input_context.input_pipeline_id)
+
+    # For training, we want a lot of parallel reading and shuffling.
+    # For eval, we want no shuffling and parallel reading doesn't matter.
+    if is_training:
+      d = d.shuffle(buffer_size=FLAGS.shuffle_buffer)
+      d = d.repeat()
+
+    d = d.apply(
+        tf.contrib.data.map_and_batch(
+            lambda record: _decode_record(record, name_to_features),
+            batch_size=batch_size,
+            drop_remainder=drop_remainder))
+
+    return d
+
+  return input_fn
+
+
+def get_model_fn(n_class):
+  def model_fn(features, labels, mode, params):
+    #### Training or Evaluation
+    is_training = (mode == tf.estimator.ModeKeys.TRAIN)
+
+    # Get loss from inputs
+    if FLAGS.is_regression:
+      (total_loss, per_example_loss, logits
+       ) = function_builder.get_regression_loss(FLAGS, features, is_training)
+    else:
+      (total_loss, per_example_loss, logits
+       ) = function_builder.get_classification_loss(
+           FLAGS, features, n_class, is_training)
+
+    # Check model parameters
+    num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()])
+    tf.logging.info('#params: {}'.format(num_params))
+
+    # load pretrained models
+    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
+
+    # Evaluation mode
+    if mode == tf.estimator.ModeKeys.EVAL:
+      assert FLAGS.num_hosts == 1
+
+      def metric_fn(per_example_loss, label_ids, logits, is_real_example):
+        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
+        eval_input_dict = {
+            'labels': label_ids,
+            'predictions': predictions,
+            'weights': is_real_example
+        }
+        accuracy = tf.metrics.accuracy(**eval_input_dict)
+
+        loss = tf.metrics.mean(values=per_example_loss,
+                               weights=is_real_example)
+        return {
+            'eval_accuracy': accuracy,
+            'eval_loss': loss}
+
+      def regression_metric_fn(
+          per_example_loss, label_ids, logits, is_real_example):
+        loss = tf.metrics.mean(values=per_example_loss,
+                               weights=is_real_example)
+        pearsonr = tf.contrib.metrics.streaming_pearson_correlation(
+            logits, label_ids, weights=is_real_example)
+        return {'eval_loss': loss, 'eval_pearsonr': pearsonr}
+
+      is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32)
+
+      # Constructing evaluation TPUEstimatorSpec with new cache.
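+      # Note on the metric functions above: padding features carry
+      # is_real_example == 0 and therefore weight 0, so they do not move
+      # eval_accuracy, eval_loss or eval_pearsonr; only real examples are
+      # averaged (an observation about the code above, not a change to it).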
+      label_ids = tf.reshape(features['label_ids'], [-1])
+
+      if FLAGS.is_regression:
+        metric_fn = regression_metric_fn
+      metric_args = [per_example_loss, label_ids, logits, is_real_example]
+
+      if FLAGS.use_tpu:
+        eval_spec = tf.contrib.tpu.TPUEstimatorSpec(
+            mode=mode,
+            loss=total_loss,
+            eval_metrics=(metric_fn, metric_args),
+            scaffold_fn=scaffold_fn)
+      else:
+        eval_spec = tf.estimator.EstimatorSpec(
+            mode=mode,
+            loss=total_loss,
+            eval_metric_ops=metric_fn(*metric_args))
+
+      return eval_spec
+
+    elif mode == tf.estimator.ModeKeys.PREDICT:
+      label_ids = tf.reshape(features["label_ids"], [-1])
+
+      predictions = {
+          "logits": logits,
+          "labels": label_ids,
+          "is_real": features["is_real_example"]
+      }
+
+      if FLAGS.use_tpu:
+        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
+            mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
+      else:
+        output_spec = tf.estimator.EstimatorSpec(
+            mode=mode, predictions=predictions)
+      return output_spec
+
+    # Configuring the optimizer
+    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
+
+    monitor_dict = {}
+    monitor_dict["lr"] = learning_rate
+
+    # Constructing training TPUEstimatorSpec with new cache.
+    if FLAGS.use_tpu:
+      # Creating host calls
+      if not FLAGS.is_regression:
+        label_ids = tf.reshape(features['label_ids'], [-1])
+        predictions = tf.argmax(logits, axis=-1, output_type=label_ids.dtype)
+        is_correct = tf.equal(predictions, label_ids)
+        accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
+
+        monitor_dict["accuracy"] = accuracy
+
+        host_call = function_builder.construct_scalar_host_call(
+            monitor_dict=monitor_dict,
+            model_dir=FLAGS.model_dir,
+            prefix="train/",
+            reduce_fn=tf.reduce_mean)
+      else:
+        host_call = None
+
+      train_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op, host_call=host_call,
+          scaffold_fn=scaffold_fn)
+    else:
+      train_spec = tf.estimator.EstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op)
+
+    return train_spec
+
+  return model_fn
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  # Validate flags
+  if FLAGS.save_steps is not None:
+    FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)
+
+  if FLAGS.do_predict:
+    predict_dir = FLAGS.predict_dir
+    if not tf.gfile.Exists(predict_dir):
+      tf.gfile.MakeDirs(predict_dir)
+
+  processors = {
+      "mnli_matched": MnliMatchedProcessor,
+      "mnli_mismatched": MnliMismatchedProcessor,
+      'sts-b': StsbProcessor,
+      'imdb': ImdbProcessor,
+      "yelp5": Yelp5Processor,
+      "xnli": XnliProcessor,
+      "csc": CSCProcessor,
+      "tnews": TnewsProcessor,
+      "inews": InewsProcessor,
+      "lcqmc_pair": LCQMCProcessor,
+      "thucnews": THUCNewsProcessor,
+  }
+
+  if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train`, `do_eval` or `do_predict` "
+        "must be True.")
+
+  if not tf.gfile.Exists(FLAGS.output_dir):
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  task_name = FLAGS.task_name.lower()
+
+  if task_name not in processors:
+    raise ValueError("Task not found: %s" % (task_name))
+
+  processor = processors[task_name]()
+  label_list = processor.get_labels() if not FLAGS.is_regression else None
+
+  sp = spm.SentencePieceProcessor()
+  sp.Load(FLAGS.spiece_model_file)
+
+  def tokenize_fn(text):
+    text = preprocess_text(text, lower=FLAGS.uncased)
+    return encode_ids(sp, text)
+
+  run_config = model_utils.configure_tpu(FLAGS)
+
+  model_fn = get_model_fn(len(label_list) if label_list is not None else None)
+
+  spm_basename = os.path.basename(FLAGS.spiece_model_file)
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+  if FLAGS.use_tpu:
+    estimator = tf.contrib.tpu.TPUEstimator(
+        use_tpu=FLAGS.use_tpu,
+        model_fn=model_fn,
+        config=run_config,
+        train_batch_size=FLAGS.train_batch_size,
+        predict_batch_size=FLAGS.predict_batch_size,
+        eval_batch_size=FLAGS.eval_batch_size)
+  else:
+    estimator = tf.estimator.Estimator(
+        model_fn=model_fn,
+        config=run_config)
+
+  if FLAGS.do_train:
+    import datetime
+    # Timestamp the record file name; avoid spaces and colons in paths.
+    dt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    train_file_base = "{}.{}.len-{}.train.tf_record".format(
+        dt, spm_basename, FLAGS.max_seq_length)
+    train_file = os.path.join(FLAGS.output_dir, train_file_base)
+    tf.logging.info("Use tfrecord file {}".format(train_file))
+
+    train_examples = processor.get_train_examples(FLAGS.data_dir)
+    np.random.shuffle(train_examples)
+    tf.logging.info("Num of train samples: {}".format(len(train_examples)))
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
+          train_file, FLAGS.num_passes)
+    else:
+      file_based_convert_examples_to_features(
+          train_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
+          train_file, FLAGS.num_passes)
+
+    # here we use the epoch number to calculate the total train_steps
+    train_steps = int(len(train_examples) * FLAGS.num_train_epochs
+                      / FLAGS.train_batch_size)
+    FLAGS.warmup_steps = int(0.1 * train_steps)
+
+    train_input_fn = file_based_input_fn_builder(
+        input_file=train_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=True,
+        drop_remainder=True)
+
+    estimator.train(input_fn=train_input_fn, max_steps=train_steps)
+
+  if FLAGS.do_eval or FLAGS.do_predict:
+    eval_examples = processor.get_devtest_examples(FLAGS.data_dir,
+                                                   FLAGS.eval_split)
+    tf.logging.info("Num of eval samples: {}".format(len(eval_examples)))
+
+  if FLAGS.do_eval:
+    # TPU requires a fixed batch size for all batches, therefore the number
+    # of examples must be a multiple of the batch size, or else examples
+    # will get dropped. So we pad with fake examples which are ignored
+    # later on. These do NOT count towards the metric (all tf.metrics
+    # support a per-instance weight, and these get a weight of 0.0).
+    #
+    # Modified in XL: We also adopt the same mechanism for GPUs.
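+    # Illustrative arithmetic for the padding described above: with 1,000
+    # dev examples and eval_batch_size=128, 24 PaddingInputExamples are
+    # appended to reach 1,024 = 8 * 128, so eval_steps below becomes 8.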
+ + ## dev dataset + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file_base = "{}.len-{}.{}.eval.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) + eval_file = os.path.join(FLAGS.output_dir, eval_file_base) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=True) + + # Filter out all checkpoints in the directory + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.model_dir) + + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = join(FLAGS.model_dir, ckpt_name) + global_step = int(cur_filename.split("-")[-1]) + tf.logging.info("Add {} to eval list.".format(cur_filename)) + steps_and_files.append([global_step, cur_filename]) + steps_and_files = sorted(steps_and_files, key=lambda x: x[0]) + + # Decide whether to evaluate all ckpts + if not FLAGS.eval_all_ckpt: + steps_and_files = steps_and_files[-1:] + + eval_results = [] + for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]): + ret = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + checkpoint_path=filename) + + ret["step"] = global_step + ret["path"] = filename + + eval_results.append(ret) + + tf.logging.info("=" * 80) + log_str = "Eval result | " + for key, val in sorted(ret.items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + tf.logging.info(log_str) + # for x in ret: + # tf.logging.info(x) + + key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy" + eval_results.sort(key=lambda x: x[key_name], reverse=True) + + tf.logging.info("=" * 80) + log_str = "Best result | " + for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]): + log_str += "{} {} | ".format(key, val) + tf.logging.info(log_str) + + + ## test dataset + eval_examples = processor.get_test_examples(FLAGS.data_dir) + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file_base = "{}.len-{}.{}.test.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.eval_split) + eval_file = os.path.join(FLAGS.output_dir, eval_file_base) + if task_name == "inews": + file_based_convert_examples_to_features_for_inews( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + else: + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn, + eval_file) + + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=True) + + # Filter out all checkpoints in the directory + steps_and_files = [] + filenames = tf.gfile.ListDirectory(FLAGS.model_dir) + + for filename in filenames: + if filename.endswith(".index"): + ckpt_name = filename[:-6] + cur_filename = join(FLAGS.model_dir, ckpt_name) + global_step = 
int(cur_filename.split("-")[-1])
+        tf.logging.info("Add {} to eval list.".format(cur_filename))
+        steps_and_files.append([global_step, cur_filename])
+    steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
+
+    # Decide whether to evaluate all ckpts
+    if not FLAGS.eval_all_ckpt:
+      steps_and_files = steps_and_files[-1:]
+
+    eval_results = []
+    for global_step, filename in sorted(steps_and_files, key=lambda x: x[0]):
+      ret = estimator.evaluate(
+          input_fn=eval_input_fn,
+          steps=eval_steps,
+          checkpoint_path=filename)
+
+      ret["step"] = global_step
+      ret["path"] = filename
+
+      eval_results.append(ret)
+
+      tf.logging.info("=" * 80)
+      log_str = "Test Eval result | "
+      for key, val in sorted(ret.items(), key=lambda x: x[0]):
+        log_str += "{} {} | ".format(key, val)
+      tf.logging.info(log_str)
+
+    key_name = "eval_pearsonr" if FLAGS.is_regression else "eval_accuracy"
+    eval_results.sort(key=lambda x: x[key_name], reverse=True)
+
+    tf.logging.info("=" * 80)
+    log_str = "Test Best result | "
+    for key, val in sorted(eval_results[0].items(), key=lambda x: x[0]):
+      log_str += "{} {} | ".format(key, val)
+    tf.logging.info(log_str)
+
+  if FLAGS.do_predict:
+    eval_file_base = "{}.len-{}.{}.predict.tf_record".format(
+        spm_basename, FLAGS.max_seq_length, FLAGS.eval_split)
+    eval_file = os.path.join(FLAGS.output_dir, eval_file_base)
+    if task_name == "inews":
+      file_based_convert_examples_to_features_for_inews(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
+          eval_file)
+    else:
+      file_based_convert_examples_to_features(
+          eval_examples, label_list, FLAGS.max_seq_length, tokenize_fn,
+          eval_file)
+
+    pred_input_fn = file_based_input_fn_builder(
+        input_file=eval_file,
+        seq_length=FLAGS.max_seq_length,
+        is_training=False,
+        drop_remainder=False)
+
+    predict_results = []
+    with tf.gfile.Open(os.path.join(predict_dir, "{}.tsv".format(
+        task_name)), "w") as fout:
+      fout.write("index\tprediction\tis_real\n")
+
+      for pred_cnt, result in enumerate(estimator.predict(
+          input_fn=pred_input_fn,
+          yield_single_examples=True,
+          checkpoint_path=FLAGS.predict_ckpt)):
+        if pred_cnt % 1000 == 0:
+          tf.logging.info("Predicting submission for example: {}".format(
+              pred_cnt))
+
+        logits = [float(x) for x in result["logits"].flat]
+        predict_results.append(logits)
+
+        if len(logits) == 1:
+          label_out = logits[0]
+        elif len(logits) == 2:
+          if logits[1] - logits[0] > FLAGS.predict_threshold:
+            label_out = label_list[1]
+          else:
+            label_out = label_list[0]
+        elif len(logits) > 2:
+          max_index = np.argmax(np.array(logits, dtype=np.float32))
+          label_out = label_list[max_index]
+        else:
+          raise NotImplementedError
+
+        fout.write("{}\t{}\t{}\n".format(pred_cnt, label_out,
+                                         result["is_real"]))
+
+    import datetime
+    dt_n = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
+    predict_json_path = os.path.join(predict_dir, "{}.{}.logits.json".format(
+        dt_n, task_name))
+
+    with tf.gfile.Open(predict_json_path, "w") as fp:
+      json.dump(predict_results, fp, indent=4)
+
+
+if __name__ == "__main__":
+  tf.app.run()
diff --git a/baselines/models/xlnet/run_classifier_thucnews.sh b/baselines/models/xlnet/run_classifier_thucnews.sh
new file mode 100644
index 0000000..381303f
--- /dev/null
+++ b/baselines/models/xlnet/run_classifier_thucnews.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+# @Author: bo.shi
+# @Date:   2019-11-04 09:56:36
+# @Last Modified by:   bo.shi
+# @Last Modified time: 2019-11-11 09:59:23
+
+TASK_NAME="thucnews" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! -f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/thucnews.zip + unzip thucnews.zip + rm thucnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier_thucnews.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --do_predict=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=16 \ + --eval_batch_size=16 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False \ + --predict_dir=$CURRENT_DIR/${TASK_NAME}_output/predict diff --git a/baselines/models/xlnet/run_classifier_tnews.sh b/baselines/models/xlnet/run_classifier_tnews.sh new file mode 100644 index 0000000..ad984cc --- /dev/null +++ b/baselines/models/xlnet/run_classifier_tnews.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:59:28 + +TASK_NAME="tnews" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.txt" ] || [ ! -f "dev.txt" ] || [ ! 
-f "test.txt" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/tnews.zip + unzip tnews.zip + rm tnews.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." +python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_classifier_xnli.sh b/baselines/models/xlnet/run_classifier_xnli.sh new file mode 100644 index 0000000..c4c5e6f --- /dev/null +++ b/baselines/models/xlnet/run_classifier_xnli.sh @@ -0,0 +1,78 @@ +#!/usr/bin/env bash +# @Author: bo.shi +# @Date: 2019-11-04 09:56:36 +# @Last Modified by: bo.shi +# @Last Modified time: 2019-11-11 09:59:33 + +TASK_NAME="xnli" +MODEL_NAME="chinese_xlnet_mid_L-24_H-768_A-12" +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +export CUDA_VISIBLE_DEVICES="0" +export PRETRAINED_MODELS_DIR=$CURRENT_DIR/prev_trained_model +export XLNET_DIR=$PRETRAINED_MODELS_DIR/$MODEL_NAME +export GLUE_DATA_DIR=$CURRENT_DIR/../../glue/chineseGLUEdatasets + +# install related packages +pip install sentencepiece --user + +# download and unzip dataset +if [ ! -d $GLUE_DATA_DIR ]; then + mkdir -p $GLUE_DATA_DIR + echo "makedir $GLUE_DATA_DIR" +fi +cd $GLUE_DATA_DIR +if [ ! -d $TASK_NAME ]; then + mkdir $TASK_NAME + echo "makedir $GLUE_DATA_DIR/$TASK_NAME" +fi +cd $TASK_NAME +if [ ! -f "train.tsv" ] || [ ! -f "dev.tsv" ] || [ ! -f "test.tsv" ]; then + rm * + wget https://storage.googleapis.com/chineseglue/tasks/xnli.zip + unzip xnli.zip + rm xnli.zip +else + echo "data exists" +fi +echo "Finish download dataset." + +# download model +if [ ! -d $XLNET_DIR ]; then + mkdir -p $XLNET_DIR + echo "makedir $XLNET_DIR" +fi +cd $XLNET_DIR +if [ ! -f "xlnet_config.json" ] || [ ! -f "spiece.model" ] || [ ! -f "xlnet_model.ckpt.index" ] || [ ! -f "xlnet_model.ckpt.meta" ] || [ ! -f "xlnet_model.ckpt.data-00000-of-00001" ]; then + rm * + wget -c https://storage.googleapis.com/chineseglue/pretrain_models/chinese_xlnet_mid_L-24_H-768_A-12.zip + unzip chinese_xlnet_mid_L-24_H-768_A-12.zip + rm chinese_xlnet_mid_L-24_H-768_A-12.zip +else + echo "model exists" +fi +echo "Finish download model." + +# run task +cd $CURRENT_DIR +echo "Start running..." 
+python run_classifier.py \ + --spiece_model_file=${XLNET_DIR}/spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$GLUE_DATA_DIR/$TASK_NAME \ + --output_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --model_dir=$CURRENT_DIR/${TASK_NAME}_output/ \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=1 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=False diff --git a/baselines/models/xlnet/run_cmrc_drcd.py b/baselines/models/xlnet/run_cmrc_drcd.py new file mode 100644 index 0000000..ee59eea --- /dev/null +++ b/baselines/models/xlnet/run_cmrc_drcd.py @@ -0,0 +1,1293 @@ +# coding=utf-8 +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl import flags +import absl.logging as _logging # pylint: disable=unused-import + +import collections +import os +import time +import math +import json +import six +import random +import gc + +import numpy as np + +if six.PY2: + import cPickle as pickle +else: + import pickle + +import tensorflow as tf +import sentencepiece as spm +from prepro_utils import preprocess_text, encode_ids, encode_pieces, printable_text +import function_builder +import model_utils +import squad_utils +from data_utils import SEP_ID, CLS_ID, VOCAB_SIZE + +SPIECE_UNDERLINE = u'▁' + +SEG_ID_P = 0 +SEG_ID_Q = 1 +SEG_ID_CLS = 2 +SEG_ID_PAD = 3 + +# Preprocessing +flags.DEFINE_bool("do_prepro", default=False, + help="Perform preprocessing only.") +flags.DEFINE_integer("num_proc", default=1, + help="Number of preprocessing processes.") +flags.DEFINE_integer("proc_id", default=0, + help="Process id for preprocessing.") + +# Model +flags.DEFINE_string("model_config_path", default=None, + help="Model config path.") +flags.DEFINE_float("dropout", default=0.1, + help="Dropout rate.") +flags.DEFINE_float("dropatt", default=0.1, + help="Attention dropout rate.") +flags.DEFINE_integer("clamp_len", default=-1, + help="Clamp length.") +flags.DEFINE_string("summary_type", default="last", + help="Method used to summarize a sequence into a vector.") +flags.DEFINE_bool("use_bfloat16", default=False, + help="Whether to use bfloat16.") + +# Parameter initialization +flags.DEFINE_enum("init", default="normal", + enum_values=["normal", "uniform"], + help="Initialization method.") +flags.DEFINE_float("init_std", default=0.02, + help="Initialization std when init is normal.") +flags.DEFINE_float("init_range", default=0.1, + help="Initialization std when init is uniform.") + +# I/O paths +flags.DEFINE_bool("overwrite_data", default=False, + help="If False, will use cached data if available.") +flags.DEFINE_string("init_checkpoint", default=None, + help="checkpoint path for initializing the model. " + "Could be a pretrained model or a finetuned model.") +flags.DEFINE_bool("init_global_vars", default=False, + help="If true, init all global vars. 
If false, init " + "trainable vars only.") +flags.DEFINE_string("output_dir", default="", + help="Output dir for TF records.") +flags.DEFINE_string("predict_dir", default="", + help="Dir for predictions.") +flags.DEFINE_string("spiece_model_file", default="", + help="Sentence Piece model path.") +flags.DEFINE_string("model_dir", default="", + help="Directory for saving the finetuned model.") +flags.DEFINE_string("train_file", default="", + help="Path of train file.") +flags.DEFINE_string("predict_file", default="", + help="Path of prediction file.") + +# Data preprocessing config +flags.DEFINE_integer("max_seq_length", + default=512, help="Max sequence length") +flags.DEFINE_integer("max_query_length", + default=64, help="Max query length") +flags.DEFINE_integer("doc_stride", + default=128, help="Doc stride") +flags.DEFINE_integer("max_answer_length", + default=64, help="Max answer length") +flags.DEFINE_bool("uncased", default=False, help="Use uncased data.") + +# TPUs and machines +flags.DEFINE_bool("use_tpu", default=False, help="whether to use TPU.") +flags.DEFINE_integer("num_hosts", default=1, help="How many TPU hosts.") +flags.DEFINE_integer("num_core_per_host", default=8, + help="8 for TPU v2 and v3-8, 16 for larger TPU v3 pod. In the context " + "of GPU training, it refers to the number of GPUs used.") +flags.DEFINE_string("tpu_job_name", default=None, help="TPU worker job name.") +flags.DEFINE_string("tpu", default=None, help="TPU name.") +flags.DEFINE_string("tpu_zone", default=None, help="TPU zone.") +flags.DEFINE_string("gcp_project", default=None, help="gcp project.") +flags.DEFINE_string("master", default=None, help="master") +flags.DEFINE_integer("iterations", default=1000, + help="number of iterations per TPU training loop.") + +# Training +flags.DEFINE_bool("do_train", default=True, help="whether to do training") +flags.DEFINE_integer("train_batch_size", default=48, + help="batch size for training") +flags.DEFINE_integer("train_steps", default=8000, + help="Number of training steps") +flags.DEFINE_integer("warmup_steps", default=0, help="number of warmup steps") +flags.DEFINE_integer("save_steps", default=None, + help="Save the model for every save_steps. " + "If None, not to save any model.") +flags.DEFINE_integer("max_save", default=5, + help="Max number of checkpoints to save. " + "Use 0 to save all.") +flags.DEFINE_integer("shuffle_buffer", default=2048, + help="Buffer size used for shuffle.") + +# Optimization +flags.DEFINE_float("learning_rate", default=3e-5, help="initial learning rate") +flags.DEFINE_float("min_lr_ratio", default=0.0, + help="min lr ratio for cos decay.") +flags.DEFINE_float("clip", default=1.0, help="Gradient clipping") +flags.DEFINE_float("weight_decay", default=0.00, help="Weight decay rate") +flags.DEFINE_float("adam_epsilon", default=1e-6, help="Adam epsilon") +flags.DEFINE_string("decay_method", default="poly", help="poly or cos") +flags.DEFINE_float("lr_layer_decay_rate", default=0.75, + help="Top layer: lr[L] = FLAGS.learning_rate." 
+ "Lower layers: lr[l-1] = lr[l] * lr_layer_decay_rate.") + +# Eval / Prediction +flags.DEFINE_bool("do_predict", default=False, help="whether to do predict") +flags.DEFINE_integer("predict_batch_size", default=32, + help="batch size for prediction") +flags.DEFINE_integer("n_best_size", default=5, + help="n best size for predictions") +flags.DEFINE_integer("start_n_top", default=5, help="Beam size for span start.") +flags.DEFINE_integer("end_n_top", default=5, help="Beam size for span end.") +flags.DEFINE_string("target_eval_key", default="best_f1", + help="Use has_ans_f1 for Model I.") + + +FLAGS = flags.FLAGS + + +class SquadExample(object): + """A single training/test example for simple sequence classification. + + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + paragraph_text, + orig_answer_text=None, + start_position=None, + is_impossible=False): + self.qas_id = qas_id + self.question_text = question_text + self.paragraph_text = paragraph_text + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (printable_text(self.qas_id)) + s += ", question_text: %s" % ( + printable_text(self.question_text)) + s += ", paragraph_text: [%s]" % (" ".join(self.paragraph_text)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.start_position: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tok_start_to_orig_index, + tok_end_to_orig_index, + token_is_max_context, + input_ids, + input_mask, + p_mask, + segment_ids, + paragraph_len, + cls_index, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tok_start_to_orig_index = tok_start_to_orig_index + self.tok_end_to_orig_index = tok_end_to_orig_index + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.p_mask = p_mask + self.segment_ids = segment_ids + self.paragraph_len = paragraph_len + self.cls_index = cls_index + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + +def read_squad_examples(input_file, is_training): + """Read a SQuAD json file into a list of SquadExample.""" + with tf.gfile.Open(input_file, "r") as reader: + input_data = json.load(reader)["data"] + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + orig_answer_text = None + is_impossible = False + + if is_training: + if "is_impossible" in qa: + is_impossible = qa["is_impossible"] + else: + is_impossible = False + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + start_position = answer["answer_start"] + else: + start_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + 
paragraph_text=paragraph_text, + orig_answer_text=orig_answer_text, + start_position=start_position, + is_impossible=is_impossible) + examples.append(example) + + return examples + + +def _convert_index(index, pos, M=None, is_start=True): + if pos >= len(index): + pos = len(index) - 1 + if index[pos] is not None: + return index[pos] + N = len(index) + rear = pos + while rear < N - 1 and index[rear] is None: + rear += 1 + front = pos + while front > 0 and index[front] is None: + front -= 1 + assert index[front] is not None or index[rear] is not None + if index[front] is None: + if index[rear] >= 1: + if is_start: + return 0 + else: + return index[rear] - 1 + return index[rear] + if index[rear] is None: + if M is not None and index[front] < M - 1: + if is_start: + return index[front] + 1 + else: + return M - 1 + return index[front] + if is_start: + if index[rear] > index[front] + 1: + return index[front] + 1 + else: + return index[rear] + else: + if index[rear] > index[front] + 1: + return index[rear] - 1 + else: + return index[front] + + +def convert_examples_to_features(examples, sp_model, max_seq_length, + doc_stride, max_query_length, is_training, + output_fn): + """Loads a data file into a list of `InputBatch`s.""" + + cnt_pos, cnt_neg = 0, 0 + unique_id = 1000000000 + max_N, max_M = 1024, 1024 + f = np.zeros((max_N, max_M), dtype=np.float32) + + for (example_index, example) in enumerate(examples): + + if example_index % 100 == 0: + tf.logging.info('Converting {}/{} pos {} neg {}'.format( + example_index, len(examples), cnt_pos, cnt_neg)) + + query_tokens = encode_ids( + sp_model, + preprocess_text(example.question_text, lower=FLAGS.uncased)) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + paragraph_text = example.paragraph_text + para_tokens = encode_pieces( + sp_model, + preprocess_text(example.paragraph_text, lower=FLAGS.uncased)) + + chartok_to_tok_index = [] + tok_start_to_chartok_index = [] + tok_end_to_chartok_index = [] + char_cnt = 0 + for i, token in enumerate(para_tokens): + chartok_to_tok_index.extend([i] * len(token)) + tok_start_to_chartok_index.append(char_cnt) + char_cnt += len(token) + tok_end_to_chartok_index.append(char_cnt - 1) + + tok_cat_text = ''.join(para_tokens).replace(SPIECE_UNDERLINE, ' ') + N, M = len(paragraph_text), len(tok_cat_text) + + if N > max_N or M > max_M: + max_N = max(N, max_N) + max_M = max(M, max_M) + f = np.zeros((max_N, max_M), dtype=np.float32) + gc.collect() + + g = {} + + def _lcs_match(max_dist): + f.fill(0) + g.clear() + + ### longest common sub sequence + # f[i, j] = max(f[i - 1, j], f[i, j - 1], f[i - 1, j - 1] + match(i, j)) + for i in range(N): + + # note(zhiliny): + # unlike standard LCS, this is specifically optimized for the setting + # because the mismatch between sentence pieces and original text will + # be small + for j in range(i - max_dist, i + max_dist): + if j >= M or j < 0: continue + + if i > 0: + g[(i, j)] = 0 + f[i, j] = f[i - 1, j] + + if j > 0 and f[i, j - 1] > f[i, j]: + g[(i, j)] = 1 + f[i, j] = f[i, j - 1] + + f_prev = f[i - 1, j - 1] if i > 0 and j > 0 else 0 + if (preprocess_text(paragraph_text[i], lower=FLAGS.uncased, + remove_space=False) + == tok_cat_text[j] + and f_prev + 1 > f[i, j]): + g[(i, j)] = 2 + f[i, j] = f_prev + 1 + + max_dist = abs(N - M) + 5 + for _ in range(2): + _lcs_match(max_dist) + if f[N - 1, M - 1] > 0.8 * N: break + max_dist *= 2 + + orig_to_chartok_index = [None] * N + chartok_to_orig_index = [None] * M + i, j = N - 1, M - 1 + while i >= 
0 and j >= 0:
+      if (i, j) not in g: break
+      if g[(i, j)] == 2:
+        orig_to_chartok_index[i] = j
+        chartok_to_orig_index[j] = i
+        i, j = i - 1, j - 1
+      elif g[(i, j)] == 1:
+        j = j - 1
+      else:
+        i = i - 1
+
+    if (all(v is None for v in orig_to_chartok_index) or
+        f[N - 1, M - 1] < 0.8 * N):
+      tf.logging.warning('MISMATCH DETECTED!')
+      continue
+
+    tok_start_to_orig_index = []
+    tok_end_to_orig_index = []
+    for i in range(len(para_tokens)):
+      start_chartok_pos = tok_start_to_chartok_index[i]
+      end_chartok_pos = tok_end_to_chartok_index[i]
+      start_orig_pos = _convert_index(chartok_to_orig_index, start_chartok_pos,
+                                      N, is_start=True)
+      end_orig_pos = _convert_index(chartok_to_orig_index, end_chartok_pos,
+                                    N, is_start=False)
+
+      tok_start_to_orig_index.append(start_orig_pos)
+      tok_end_to_orig_index.append(end_orig_pos)
+
+    if not is_training:
+      tok_start_position = tok_end_position = None
+
+    if is_training and example.is_impossible:
+      tok_start_position = -1
+      tok_end_position = -1
+
+    if is_training and not example.is_impossible:
+      start_position = example.start_position
+      end_position = start_position + len(example.orig_answer_text) - 1
+
+      start_chartok_pos = _convert_index(orig_to_chartok_index, start_position,
+                                         is_start=True)
+      tok_start_position = chartok_to_tok_index[start_chartok_pos]
+
+      end_chartok_pos = _convert_index(orig_to_chartok_index, end_position,
+                                       is_start=False)
+      tok_end_position = chartok_to_tok_index[end_chartok_pos]
+      assert tok_start_position <= tok_end_position
+
+    def _piece_to_id(x):
+      if six.PY2 and isinstance(x, unicode):
+        x = x.encode('utf-8')
+      return sp_model.PieceToId(x)
+
+    all_doc_tokens = list(map(_piece_to_id, para_tokens))
+
+    # The -3 accounts for [CLS], [SEP] and [SEP]
+    max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+    # We can have documents that are longer than the maximum sequence length.
+    # To deal with this we do a sliding window approach, where we take chunks
+    # of up to our max length with a stride of `doc_stride`.
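+    # Worked example for the loop below (illustrative numbers only): with
+    # 1,000 doc tokens, max_tokens_for_doc=400 and doc_stride=128, the
+    # spans are (0, 400), (128, 400), (256, 400), (384, 400), (512, 400)
+    # and (640, 360) -- start offsets advance by min(length, doc_stride)=128
+    # until a span reaches the end of the document.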
+ _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_is_max_context = {} + segment_ids = [] + p_mask = [] + + cur_tok_start_to_orig_index = [] + cur_tok_end_to_orig_index = [] + + for i in range(doc_span.length): + split_token_index = doc_span.start + i + + cur_tok_start_to_orig_index.append( + tok_start_to_orig_index[split_token_index]) + cur_tok_end_to_orig_index.append( + tok_end_to_orig_index[split_token_index]) + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + segment_ids.append(SEG_ID_P) + p_mask.append(0) + + paragraph_len = len(tokens) + + tokens.append(SEP_ID) + segment_ids.append(SEG_ID_P) + p_mask.append(1) + + # note(zhiliny): we put P before Q + # because during pretraining, B is always shorter than A + for token in query_tokens: + tokens.append(token) + segment_ids.append(SEG_ID_Q) + p_mask.append(1) + tokens.append(SEP_ID) + segment_ids.append(SEG_ID_Q) + p_mask.append(1) + + cls_index = len(segment_ids) + tokens.append(CLS_ID) + segment_ids.append(SEG_ID_CLS) + p_mask.append(0) + + input_ids = tokens + + # The mask has 0 for real tokens and 1 for padding tokens. Only real + # tokens are attended to. + input_mask = [0] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(1) + segment_ids.append(SEG_ID_PAD) + p_mask.append(1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + assert len(p_mask) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + # continue + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + # note(zhiliny): we put P before Q, so doc_offset should be zero. 
+          # doc_offset = len(query_tokens) + 2
+          doc_offset = 0
+          start_position = tok_start_position - doc_start + doc_offset
+          end_position = tok_end_position - doc_start + doc_offset
+
+      if is_training and span_is_impossible:
+        start_position = cls_index
+        end_position = cls_index
+
+      if example_index < 20:
+        tf.logging.info("*** Example ***")
+        tf.logging.info("unique_id: %s" % (unique_id))
+        tf.logging.info("example_index: %s" % (example_index))
+        tf.logging.info("doc_span_index: %s" % (doc_span_index))
+        tf.logging.info("tok_start_to_orig_index: %s" % " ".join(
+            [str(x) for x in cur_tok_start_to_orig_index]))
+        tf.logging.info("tok_end_to_orig_index: %s" % " ".join(
+            [str(x) for x in cur_tok_end_to_orig_index]))
+        tf.logging.info("token_is_max_context: %s" % " ".join([
+            "%d:%s" % (x, y) for (x, y) in six.iteritems(token_is_max_context)
+        ]))
+        tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+        tf.logging.info(
+            "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+        tf.logging.info(
+            "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+
+        if is_training and span_is_impossible:
+          tf.logging.info("impossible example span")
+
+        if is_training and not span_is_impossible:
+          pieces = [sp_model.IdToPiece(token) for token in
+                    tokens[start_position: (end_position + 1)]]
+          answer_text = sp_model.DecodePieces(pieces)
+          tf.logging.info("start_position: %d" % (start_position))
+          tf.logging.info("end_position: %d" % (end_position))
+          tf.logging.info(
+              "answer: %s" % (printable_text(answer_text)))
+
+      # note(zhiliny): With multiprocessing, the example_index is actually
+      # the index within the current process; therefore we use
+      # example_index=None to avoid it being used in the future.
+      # The current code does not use example_index of training data.
+      if is_training:
+        feat_example_index = None
+      else:
+        feat_example_index = example_index
+
+      feature = InputFeatures(
+          unique_id=unique_id,
+          example_index=feat_example_index,
+          doc_span_index=doc_span_index,
+          tok_start_to_orig_index=cur_tok_start_to_orig_index,
+          tok_end_to_orig_index=cur_tok_end_to_orig_index,
+          token_is_max_context=token_is_max_context,
+          input_ids=input_ids,
+          input_mask=input_mask,
+          p_mask=p_mask,
+          segment_ids=segment_ids,
+          paragraph_len=paragraph_len,
+          cls_index=cls_index,
+          start_position=start_position,
+          end_position=end_position,
+          is_impossible=span_is_impossible)
+
+      # Run callback
+      output_fn(feature)
+
+      unique_id += 1
+      if span_is_impossible:
+        cnt_neg += 1
+      else:
+        cnt_pos += 1
+
+  tf.logging.info("Total number of instances: {} = pos {} neg {}".format(
+      cnt_pos + cnt_neg, cnt_pos, cnt_neg))
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+  """Check if this is the 'max context' doc span for the token."""
+
+  # Because of the sliding window approach taken to scoring documents, a single
+  # token can appear in multiple documents. E.g.
+  #  Doc: the man went to the store and bought a gallon of milk
+  #  Span A: the man went to the
+  #  Span B: to the store and bought
+  #  Span C: and bought a gallon of
+  #  ...
+  #
+  # Now the word 'bought' will have two scores from spans B and C. We only
+  # want to consider the score with "maximum context", which we define as
+  # the *minimum* of its left and right context (the *sum* of left and
+  # right context will always be the same, of course).
+  #
+  # In the example the maximum context for 'bought' would be span C since
+  # it has 1 left context and 3 right context, while span B has 4 left context
+  # and 0 right context.
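+  # A small worked check of the scoring rule implemented below, using the
+  # spans in the example above (score = min(left, right) + 0.01 * length):
+  # for 'bought', span B scores min(4, 0) + 0.01 * 5 = 0.05 and span C
+  # scores min(1, 3) + 0.01 * 5 = 1.05, so span C is chosen as max context.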
+  best_score = None
+  best_span_index = None
+  for (span_index, doc_span) in enumerate(doc_spans):
+    end = doc_span.start + doc_span.length - 1
+    if position < doc_span.start:
+      continue
+    if position > end:
+      continue
+    num_left_context = position - doc_span.start
+    num_right_context = end - position
+    score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+    if best_score is None or score > best_score:
+      best_score = score
+      best_span_index = span_index
+
+  return cur_span_index == best_span_index
+
+
+class FeatureWriter(object):
+  """Writes InputFeatures to a TF example file."""
+
+  def __init__(self, filename, is_training):
+    self.filename = filename
+    self.is_training = is_training
+    self.num_features = 0
+    self._writer = tf.python_io.TFRecordWriter(filename)
+
+  def process_feature(self, feature):
+    """Write an InputFeature to the TFRecordWriter as a tf.train.Example."""
+    self.num_features += 1
+
+    def create_int_feature(values):
+      feature = tf.train.Feature(
+          int64_list=tf.train.Int64List(value=list(values)))
+      return feature
+
+    def create_float_feature(values):
+      f = tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))
+      return f
+
+    features = collections.OrderedDict()
+    features["unique_ids"] = create_int_feature([feature.unique_id])
+    features["input_ids"] = create_int_feature(feature.input_ids)
+    features["input_mask"] = create_float_feature(feature.input_mask)
+    features["p_mask"] = create_float_feature(feature.p_mask)
+    features["segment_ids"] = create_int_feature(feature.segment_ids)
+
+    features["cls_index"] = create_int_feature([feature.cls_index])
+
+    if self.is_training:
+      features["start_positions"] = create_int_feature([feature.start_position])
+      features["end_positions"] = create_int_feature([feature.end_position])
+      impossible = 0
+      if feature.is_impossible:
+        impossible = 1
+      features["is_impossible"] = create_float_feature([impossible])
+
+    tf_example = tf.train.Example(features=tf.train.Features(feature=features))
+    self._writer.write(tf_example.SerializeToString())
+
+  def close(self):
+    self._writer.close()
+
+
+RawResult = collections.namedtuple("RawResult",
+    ["unique_id", "start_top_log_probs", "start_top_index",
+     "end_top_log_probs", "end_top_index"])
+
+_PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+    "PrelimPrediction",
+    ["feature_index", "start_index", "end_index",
+     "start_log_prob", "end_log_prob"])
+
+_NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+    "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, output_prediction_file,
+                      output_nbest_file,
+                      orig_data):
+  """Write final predictions to the json file and log-odds of null if needed."""
+  tf.logging.info("Writing predictions to: %s" % (output_prediction_file))
+  # tf.logging.info("Writing nbest to: %s" % (output_nbest_file))
+
+  example_index_to_features = collections.defaultdict(list)
+  for feature in all_features:
+    example_index_to_features[feature.example_index].append(feature)
+
+  unique_id_to_result = {}
+  for result in all_results:
+    unique_id_to_result[result.unique_id] = result
+
+  all_predictions = collections.OrderedDict()
+  all_nbest_json = collections.OrderedDict()
+  scores_diff_json = collections.OrderedDict()
+
+  for (example_index, example) in enumerate(all_examples):
+    features = example_index_to_features[example_index]
+
+    prelim_predictions = []
+    # keep track of the minimum score of
null start+end of position 0 + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + for i in range(FLAGS.start_n_top): + for j in range(FLAGS.end_n_top): + start_log_prob = result.start_top_log_probs[i] + start_index = result.start_top_index[i] + + j_index = i * FLAGS.end_n_top + j + + end_log_prob = result.end_top_log_probs[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + tok_start_to_orig_index = feature.tok_start_to_orig_index + tok_end_to_orig_index = feature.tok_end_to_orig_index + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example.paragraph_text + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + assert best_non_null_entry is not None + + score_diff = 0 #score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text + + all_nbest_json[example.qas_id] = nbest_json + + with tf.gfile.GFile(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with tf.gfile.GFile(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + qid_to_has_ans = squad_utils.make_qid_to_has_ans(orig_data) + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = squad_utils.get_raw_scores(orig_data, all_predictions) + out_eval = {} + + squad_utils.find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, + scores_diff_json, qid_to_has_ans) + + return out_eval + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + +def input_fn_builder(input_glob, seq_length, is_training, drop_remainder, + num_hosts, num_threads=8): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "unique_ids": tf.FixedLenFeature([], tf.int64), + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.float32), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "cls_index": tf.FixedLenFeature([], tf.int64), + "p_mask": tf.FixedLenFeature([seq_length], tf.float32) + } + + if is_training: + name_to_features["start_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["end_positions"] = tf.FixedLenFeature([], tf.int64) + name_to_features["is_impossible"] = tf.FixedLenFeature([], tf.float32) + + tf.logging.info("Input tfrecord file glob {}".format(input_glob)) + global_input_paths = tf.gfile.Glob(input_glob) + tf.logging.info("Find {} input paths {}".format( + len(global_input_paths), global_input_paths)) + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow 
example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.cast(t, tf.int32) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + if FLAGS.use_tpu: + batch_size = params["batch_size"] + elif is_training: + batch_size = FLAGS.train_batch_size + else: + batch_size = FLAGS.predict_batch_size + + # Split tfrecords across hosts + if num_hosts > 1: + host_id = params["context"].current_host + num_files = len(global_input_paths) + if num_files >= num_hosts: + num_files_per_host = (num_files + num_hosts - 1) // num_hosts + my_start_file_id = host_id * num_files_per_host + my_end_file_id = min((host_id + 1) * num_files_per_host, num_files) + input_paths = global_input_paths[my_start_file_id: my_end_file_id] + tf.logging.info("Host {} handles {} files".format(host_id, + len(input_paths))) + else: + input_paths = global_input_paths + + if len(input_paths) == 1: + d = tf.data.TFRecordDataset(input_paths[0]) + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. + if is_training: + d = d.shuffle(buffer_size=FLAGS.shuffle_buffer) + d = d.repeat() + else: + d = tf.data.Dataset.from_tensor_slices(input_paths) + # file level shuffle + d = d.shuffle(len(input_paths)).repeat() + + # `cycle_length` is the number of parallel files that get read. + cycle_length = min(num_threads, len(input_paths)) + + d = d.apply( + tf.contrib.data.parallel_interleave( + tf.data.TFRecordDataset, + sloppy=is_training, + cycle_length=cycle_length)) + + if is_training: + # sample level shuffle + d = d.shuffle(buffer_size=FLAGS.shuffle_buffer) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + num_parallel_batches=num_threads, + drop_remainder=drop_remainder)) + d = d.prefetch(1024) + + return d + + return input_fn + + +def get_model_fn(): + def model_fn(features, labels, mode, params): + #### Training or Evaluation + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + #### Get loss from inputs + outputs = function_builder.get_qa_outputs(FLAGS, features, is_training) + + #### Check model parameters + num_params = sum([np.prod(v.shape) for v in tf.trainable_variables()]) + tf.logging.info('#params: {}'.format(num_params)) + + scaffold_fn = None + + #### Evaluation mode + if mode == tf.estimator.ModeKeys.PREDICT: + if FLAGS.init_checkpoint: + tf.logging.info("init_checkpoint not being used in predict mode.") + + predictions = { + "unique_ids": features["unique_ids"], + "start_top_index": outputs["start_top_index"], + "start_top_log_probs": outputs["start_top_log_probs"], + "end_top_index": outputs["end_top_index"], + "end_top_log_probs": outputs["end_top_log_probs"] + } + + if FLAGS.use_tpu: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, predictions=predictions, scaffold_fn=scaffold_fn) + else: + output_spec = tf.estimator.EstimatorSpec( + mode=mode, predictions=predictions) + return output_spec + + ### Compute loss + seq_length = tf.shape(features["input_ids"])[1] + def compute_loss(log_probs, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + + loss = - tf.reduce_sum(one_hot_positions * log_probs, axis=-1) + loss = tf.reduce_mean(loss) + 
      return loss
+
+    start_loss = compute_loss(
+        outputs["start_log_probs"], features["start_positions"])
+    end_loss = compute_loss(
+        outputs["end_log_probs"], features["end_positions"])
+
+    total_loss = (start_loss + end_loss) * 0.5
+
+    #### Configuring the optimizer
+    train_op, learning_rate, _ = model_utils.get_train_op(FLAGS, total_loss)
+
+    monitor_dict = {}
+    monitor_dict["lr"] = learning_rate
+
+    #### Load pretrained models
+    scaffold_fn = model_utils.init_from_checkpoint(FLAGS)
+
+    #### Constructing training TPUEstimatorSpec with new cache.
+    if FLAGS.use_tpu:
+      host_call = function_builder.construct_scalar_host_call(
+          monitor_dict=monitor_dict,
+          model_dir=FLAGS.model_dir,
+          prefix="train/",
+          reduce_fn=tf.reduce_mean)
+
+      train_spec = tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op, host_call=host_call,
+          scaffold_fn=scaffold_fn)
+    else:
+      train_spec = tf.estimator.EstimatorSpec(
+          mode=mode, loss=total_loss, train_op=train_op)
+
+    return train_spec
+
+  return model_fn
+
+
+def _get_spm_basename():
+  spm_basename = os.path.basename(FLAGS.spiece_model_file)
+  return spm_basename
+
+
+def preprocess():
+  sp_model = spm.SentencePieceProcessor()
+  sp_model.Load(FLAGS.spiece_model_file)
+  spm_basename = _get_spm_basename()
+
+  train_rec_file = os.path.join(
+      FLAGS.output_dir,
+      "{}.{}.slen-{}.qlen-{}.train.tf_record".format(
+          spm_basename, FLAGS.proc_id, FLAGS.max_seq_length,
+          FLAGS.max_query_length))
+
+  tf.logging.info("Read examples from {}".format(FLAGS.train_file))
+  train_examples = read_squad_examples(FLAGS.train_file, is_training=True)
+  train_examples = train_examples[FLAGS.proc_id::FLAGS.num_proc]
+
+  # Pre-shuffle the input to avoid having to make a very large shuffle
+  # buffer in the `input_fn`.
+  random.shuffle(train_examples)
+
+  tf.logging.info("Write to {}".format(train_rec_file))
+  train_writer = FeatureWriter(
+      filename=train_rec_file,
+      is_training=True)
+  convert_examples_to_features(
+      examples=train_examples,
+      sp_model=sp_model,
+      max_seq_length=FLAGS.max_seq_length,
+      doc_stride=FLAGS.doc_stride,
+      max_query_length=FLAGS.max_query_length,
+      is_training=True,
+      output_fn=train_writer.process_feature)
+  train_writer.close()
+
+
+def main(_):
+  tf.logging.set_verbosity(tf.logging.INFO)
+
+  if not tf.gfile.Exists(FLAGS.output_dir):
+    tf.gfile.MakeDirs(FLAGS.output_dir)
+
+  if FLAGS.do_prepro:
+    preprocess()
+    return
+
+  #### Validate flags
+  if FLAGS.save_steps is not None:
+    FLAGS.iterations = min(FLAGS.iterations, FLAGS.save_steps)
+
+  if not FLAGS.do_train and not FLAGS.do_predict:
+    raise ValueError(
+        "At least one of `do_train` and `do_predict` must be True.")
+
+  if FLAGS.do_predict and not tf.gfile.Exists(FLAGS.predict_dir):
+    tf.gfile.MakeDirs(FLAGS.predict_dir)
+
+  sp_model = spm.SentencePieceProcessor()
+  sp_model.Load(FLAGS.spiece_model_file)
+
+  ### TPU Configuration
+  run_config = model_utils.configure_tpu(FLAGS)
+
+  model_fn = get_model_fn()
+  spm_basename = _get_spm_basename()
+
+  # If TPU is not available, this will fall back to normal Estimator on CPU
+  # or GPU.
+ if FLAGS.use_tpu: + estimator = tf.contrib.tpu.TPUEstimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + else: + estimator = tf.estimator.Estimator( + model_fn=model_fn, + config=run_config) + + if FLAGS.do_train: + train_rec_glob = os.path.join( + FLAGS.output_dir, + "{}.*.slen-{}.qlen-{}.train.tf_record".format( + spm_basename, FLAGS.max_seq_length, + FLAGS.max_query_length)) + + train_input_fn = input_fn_builder( + input_glob=train_rec_glob, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True, + num_hosts=FLAGS.num_hosts) + + estimator.train(input_fn=train_input_fn, max_steps=FLAGS.train_steps) + + if FLAGS.do_predict: + for eval_set in ['dev','test','challenge']: + new_predict_file = FLAGS.predict_file + "_" + eval_set + ".json" + + eval_examples = read_squad_examples(new_predict_file, is_training=False) + + with tf.gfile.Open(new_predict_file) as f: + orig_data = json.load(f)["data"] + + eval_rec_file = os.path.join( + FLAGS.output_dir, + "{}.slen-{}.qlen-{}.{}.tf_record".format( + spm_basename, FLAGS.max_seq_length, FLAGS.max_query_length, eval_set)) + eval_feature_file = os.path.join( + FLAGS.output_dir, + "{}.slen-{}.qlen-{}.{}.features.pkl".format( + spm_basename, FLAGS.max_seq_length, FLAGS.max_query_length, eval_set)) + + if tf.gfile.Exists(eval_rec_file) and tf.gfile.Exists( + eval_feature_file) and not FLAGS.overwrite_data: + tf.logging.info("Loading eval features from {}".format(eval_feature_file)) + with tf.gfile.Open(eval_feature_file, 'rb') as fin: + eval_features = pickle.load(fin) + else: + eval_writer = FeatureWriter(filename=eval_rec_file, is_training=False) + eval_features = [] + + def append_feature(feature): + eval_features.append(feature) + eval_writer.process_feature(feature) + + convert_examples_to_features( + examples=eval_examples, + sp_model=sp_model, + max_seq_length=FLAGS.max_seq_length, + doc_stride=FLAGS.doc_stride, + max_query_length=FLAGS.max_query_length, + is_training=False, + output_fn=append_feature) + eval_writer.close() + + with tf.gfile.Open(eval_feature_file, 'wb') as fout: + pickle.dump(eval_features, fout) + + eval_input_fn = input_fn_builder( + input_glob=eval_rec_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=False, + num_hosts=1) + + cur_results = [] + for result in estimator.predict( + input_fn=eval_input_fn, + yield_single_examples=True): + + if len(cur_results) % 1000 == 0: + tf.logging.info("Processing example: %d" % (len(cur_results))) + + unique_id = int(result["unique_ids"]) + start_top_log_probs = ( + [float(x) for x in result["start_top_log_probs"].flat]) + start_top_index = [int(x) for x in result["start_top_index"].flat] + end_top_log_probs = ( + [float(x) for x in result["end_top_log_probs"].flat]) + end_top_index = [int(x) for x in result["end_top_index"].flat] + + + cur_results.append( + RawResult( + unique_id=unique_id, + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index)) + + output_prediction_file = os.path.join( + FLAGS.predict_dir, eval_set+"_predictions.json") + output_nbest_file = os.path.join( + FLAGS.predict_dir, eval_set+"_nbest_predictions.json") + + ret = write_predictions(eval_examples, eval_features, cur_results, + FLAGS.n_best_size, FLAGS.max_answer_length, + output_prediction_file, + output_nbest_file, + orig_data) + + # Log current result + 
tf.logging.info("=" * 80) + log_str = "Result | " + for key, val in ret.items(): + log_str += "{} {} | ".format(key, val) + tf.logging.info(log_str) + tf.logging.info("=" * 80) + + +if __name__ == "__main__": + tf.app.run() + diff --git a/baselines/models/xlnet/spiece.model b/baselines/models/xlnet/spiece.model new file mode 100644 index 0000000000000000000000000000000000000000..541f4a01488505f42266a597290130ce18bf0d57 GIT binary patch literal 691427 zcmZ6!cbrs3^F6F0sH>tVN5z~_Oh^uj2@IfO&X^ETL4p#DsIz%C%+4;Gb53l|vpX9w zV$L~WU~U)_X2gK+si|I{@9*8u{&7xMRo}jS`}XawuI`B%9W`>!hSd+8k3INz#9mPc zM2(oU&ae?t2S)8Z=j{JcQC_n0|K;nJJ@}ux$RAw1^gn)3)TlYD7Oxx5eMFQfEL-zG z1?9sT>>U*qwdX`FNd!;$Klrr&gHQZF_^e^fl7H>V9A^7R?e%QsFcDEq<1pJdYR@w< z@s8uS@Ny%zdt$KHyNe?D>zrupcymnzI|Xqlny|s>Fnu63g00?8Jh^p~;-Wg7QCzNg zQz~A4xh8_`A}{W}zCm$b3+8^)6v3gsKD>Qqm*PMZ0;76VC$}0G?-AG%#gd(aXVD)? z-R?=j4_@N^cq=`4u5eJ5i<0oi^eqwW#Ps3*0mY%d1l;(t;_aSfMV_^An zCd*XZ`PlQC9_YiDyA`|MM6AE?g-8J=-8$oq~h5 zz8ulEbNUhYk>amCMJOEoN+kVjPcbGQOHA!^G3sMYbt@WBdC#kn)TuZ(i--@y^`s)} z4aH7Y8_Jy5Bk4|45~lq6Mg(v7_|SIZn-R(+h>~UCU)TG zoKIETn}dX0#i2k7zFhHHq$NzAgsOiDSzA_7E1r7dbH#%_cz=_@P3d@N^ft{V$%6yN zE4I>oh?`|_TN_&6Haa=2Xn${5Ck=Oes5lfz!Xvv(dQ|~pj`$*yr&~RQ1rrsA1`=@g zXT)rzlc(V4hrZO}B>8a9DPKjflhljrPyRZBLxCEsHrUQd#N}0*ZZ&t~u114f{U~lz z?6h^@#c|ssw&8|6Jd~^03dG=)Cqj{Q*Aow4qhh-#7JWY%oYR8Se^czFrC`k-Rz%-Q z?86hc8tlu(-1Wp#{K=Tur^(^i0d$>ht9ol9?zlm*Q=5SikDBzJ5)^G$Y`1pe#+i;5 zGddH?E>LWDWaH`tgFUr)tJUD#ZoKlMVymSWdEXk`>O<10u3opQ9`DaI*x!ONvx(Un z`990bcw3*L14B+$9hSUjG}AM1R#sR`l9-C!6^boy0+#LfO(fm!>cKI0DzZ#3zpX}Hs4^u6im@tXAb6og6*uFApI-3AwyVb`PIM)G$W^07Lg*lJBc#ZJX; zPYa@=zEd5R`QR~%t+XLr`LJR;B?(U%98S!@M+2JfCT_-l&k^&5IU9j@hz^Z9!0PR= zb9$}##xQTS`!kb8NiN2Iq1a08!?;Vnk6hAfPQszf6x%IX@E`g^q(ZIKcsr#LD;ALQ ztqRB1Va7tmmbV_ctBHBlbx3{Oq^9Oz=FvMM+ICSR61x;TS*6%@>W`63?W`fIuLyn9 zNF~$k9&Ei-u~n3U%raH8{4LTyZ*MR(zR~gra7CNaP@u^w9KguWjG?a_)s~9v+-kY> zN#B#Q?M|75<8S#%Z`0?2KS6OQ7{ul@#ddKTj(c8lIBy6m#{8_!emahrN+=ylYa3Q} zD7N#`@O7`^P+uzUc-H8|*W$?!h~_*h(Yu422e6YpMP(vu7EXsqIJq7S#0Dh>@4;p=Xrlaz=rDyqYo7JKQrnv@gP+H~8@3w4(|;SQo=zmJ+NC9lZ$xu5G27zQNjU9UO%C9ctU~A_*kgi!&Llnj%rDA|JAC+L5z$F6H`=V`K$0BFx`LELGN1El;oP3l%x}1IP&}cI(@5*|>kS z?A0Ck>;z((xaD27ix{%lcf7kLoZp*R!>qI*EGlhum5 z58WNPgxxuaykiW`=|#qL#a5dKdo3{7pNpGU5lhMS3$jHhpPmz_MB5XJ?c`+aDp4FR zu0VH-Vk^E4Cx1X3c>+O~FKMTv)G5ny>br5Ct2CUGV%4VLuw80sb9gg$kAJl)nhS8i z>BNytn&iuH-9;)2^(UkBV#Q8eIWAsE%*Q)rGA_MJldZvAoN+fX+vN3_ve=|1`0zj+ zapb_9Rmyi<3iJ1fJ)VhFNM{^kj!|p} z;_==|#E~-RIN7Plo2;U+rvRZTiY;$9?!MHd7bf88YfXA<7S6gwvCDzT*LNy*8dC7u z{YF1J3lkqO`g~OnS#NNBJ!&6Q92zRepHC=u2IKLmhggaftYo9Ym;^%yd@?bam`5rv z$85Y;O2`^ph@NsyoH89-n^j%zo84!B3#q)ue5Jp9PI0I&1xarx4reFguuqLnVF~7Z zO>CYaUyhe8Bl5Rn{SFmz+9FfUd;cNj)i1`khwZ7?XwJm}s5R%|o68Mu z3gGf<6o;eJaPsXYJ-!Qfu2F2aX5j7(ibDa;n>G@&R2(fzhdlntVUs3Ykzho>br1QREbsB8k52Rz8uEsv1qwL)9tQk^xtK0 zbUnu2tytR4yGx0gt?f?_4HjbG6-Ja6gy#W+JG!ypVS}?1@WWHY(%DQy`DQ|~_NIsE zG019GE!&%c27^O^D!l)kriTWH;6AS=9J5nf^DdLh5)b;K+m8lU`qnqHf}YqxPe-XuBu!g@0y{De~;5;Hno0c_Y{0{#Vq*N%G>s zSBd3`HdR_25)Qs*^rzy5w^jc#Jp7gF+m+3D@CU_Kb1XjjMK!KK+fI|3Ux!ym?H$RH z4hflXUAiBsJQi<0zC27tp@2`uu4zXa+M6Q&n2(QE&2VfvMjWG92K~(jyPG%TQ-i|| zS=f4t>V*1A5OcQ4io>72LVP?`MOIP^MqfBAYO<=Dam+jwg@Yx?x>&J0l!&oc8|;b3 zoj0h7+yW<1jd!ZZs!oJ&iBaooz*TD%JAEDa{4t~B&BY;)oAiz#j(9?`oz;umqD^{t zCVn=UQ$swJVsx?txUiU5+IVj#KVJB~QbplJKju^!%qc@rlQAhOh0{$e&v+K5_G_{` zl!xVS8uk1RPLUN`)dP6&BgJ-hKKA-Tu@lXH_IqM!3GJI~dKIH~7!$vblN)1_TZZ#@ zs+v`rCVrPc{xWKT0Fw3@8F|!BQaxtwPb|4*=esfOcuXi;fx|)k<%cMChc+V%#8UPd z`12@Dw)~rsZ~`%{!-YX>pbdW*YG-E2B=*QNR5Qfy-O=+D+w?L0Fkf*vrV>RrY7TsL zEKdez-E2hp6^Osxm~${}Rh`)7is;$guvVz?tqFoO_6!fMX#yIPEEwc20Pi^SoyA| 
z&*|Weho31Z7)nx{qfb7|N-8Ua8W$oT85d{ga04@rkS`2_!o_DBtDu?2zI6tv4~%Qk zKh9A}r?5gg&h;b{19e5R={#eV)JY3zzItILJyd^gFfb5B{g%HBqL_pZARyDD+qNGT8C&7b-*{SjNeDW3`a= zD;CScr1hU4Cs&SBIj6UbEBbka?7L7)iyW_Xz}F-5Cn!DC*Ct<*#=CLM&rFlQ4Z@!_ zW3pa55nTNz@KI60`U|E&9-d9$vr7Ik2&DsROC35%xo#mMGWE)n2GM_bDAaEd{+_u4 zh?=Ux2sTcC#xD}`Z;S1ch>MgCL?%h)C01sBw+nX-vo8fim9VenRvo&Wk5~%E1xGKm zM_i^^{F&Qm*SDS!B|90g#<<){+g3w+sb+()k#@SXo{6YV*(RRC9~ZX&D$W8|+nT7n<`m{Z_~ z=_ZA4Um3k79yi}BFd0Wh^fdMiQ&4j_Y^V15-wPBvskuDnIB%iKyTw}(o|}%m?qmw& zdH)KRApsA@V`<$W<8KBRuND|=gYP6ODj)ISI^=nSZuw>|#)jW!<#b|+Q6i@q#FZUa z>>u1=+=O&4Xb-y6U`Z(z=qD1QSzy@cHf1ta)n!a!la@rt5qBx=K-YZ+Y19@MB~;*h zoRHl!&` z*jz}uLEe_u8-U4 zR~|qLzO(D*NX1Gc=S$;*AazMRNiJRm!2S{&DSK8M4ga;rtO3nOh3>T3uctBvErHGw z3@?@uYXS8bKDZ?xHbOLVoz6tcq}M<}RK7I`t5B{=u3ZN%p5>%f%gFVhQMIsT^63VU zs884`=thj)s3mZ~#nSVhhm{AoMm7+jbWE@uQOA<-!Q@&!Yt%lzEAJ3*vnDn8844GH1g?kBiM37XK5IR2WMs>Upg(NPk|~J^53u zsQe;-1HA7Vbfb$IuJp(>aJ6xHLW7R^I8*oz_}3B2XIQ$y0jO^$$pE3QA;!zR&Bnt* z6EW=*37%VH`IQM3za(Xt7o@&4Qm_hz%JKWoA`@#N;{6%%%1!{sf~Sk_M(Lu-obDzzAYRJGhl8g+)I zW(i5unY|0g`je^v`GCPjm?iOihQ?uJ>4{6;hfkFOp_=D$jTN<77s+LjTq(x_o=YApZ!o$x|L| zEDzL#%b%WbN;A1<^jJ|aEX1zqNrSNS<(ROsS0B1!RhS&rr;@ngC+j#5c6IThZ=6Al zV!^7dGT9(iGq9d3F-Q;07;4`kL|@{(iYNg1mmlc2!hPGg=-Frxc4oX-J!rtV#htv0 zNg`x+Tvul|QvbUA855|^a1r%KC4{ZI|pha<-q3v`PpC}xSlk>?^V8{^2J-q;*kXQ+Iu_*9sQmR zS^l#n4xN8`W~<><^IQoo_(p_dVj!7?SK@z&htc4<^!BbdEuU znsiI!B#MxA*c>NGMmv=qGJhx1L*pUL*DQtRb&LcZ5v3#VSR z(vnX!g+o(9F?SlreTo!3C2Z{`IsG%G{bdMTZFDfARmwa%GhBx6Rv9NIT-r$EariuW zet3(c5E^6dwgj2r8r)Stnw7x_4~vAk->5K-9{5FI)%Xfi zAnfn1lZU=j?odw!t|;ivu!Jed9!N&W;9B{JF!Y~Jnkc*XC=bVy920%tE5uhs@9oh) z0HWNLOE<~uq*+RNDZi-megs6<+&&<~eo_v02t)XA-uM}iUn@>)VP1yuYV|LcHDXI3 ztzTjZA%NKt;|~iv=r^R`Ic{^KTu9pM#|ZYfoEXogPar6F9s9dN8hEC_*9#%8M=)f- z4Jy@E3TvJ8e%Wqfm_Zp{=hQznmp?N%5E&(Dgm_*MHY%6@1sX3hZH~N7lCM~r#8^m8 ze*>bPddbB7OxHgufP*9+Exl+E4g|OZ{LCPxT|8Dc4jU6HjV`Sa@^8CA?BZ~TDc9}; zE~*fSDw6s8sw9RcI=k6L2$ml)-EZ7aS;3qFE{>-TH&$7(d`ud(^>;_OjoH$;zd}=g zx#Ivpyw*BcfRaRS)#y8JTrQmM$+QDiJQRE+TXq@5;KwCr%faBX3i)wTI6~AFHW_n_77h(qr*WGU z8^rF6J_<&P79NBSehg{6wJrU@gKNp9da>gG!qVeoVWxz{B)D-EXGJE7aGL+%1K2i zIS*!rOjovh!tfaaUq=_6p~VIwW916cZ2gc?tbAxJx4%N3JyTim0ms2-(%A}eWqq)~5wL7yKw?C%qzXQ0ebcmUYTnt+>A!=>gA|094{Gdzz= z8m}cd@u3BCG9jwr&)+Qd#zHV{EPCWCW4S$ra>oP}3#7NpU8MOv^lZkVo*&8xc(0G8p4y2!?78qPgLm@v@7x zC=M1Kafm1`FJETzJ~;4?lJ_rH=;p#BI${lZ5W4&@LH*t!w9I%mwC)NB^Q|Md$qv%2 zMrt}U&bd-qPFJIplV)qDHt@bOe!4*zW#Z`d(EHe<4GLug6VweJX&K-!!v1^7x z=y6a_sc9ylE(0dWj|40_ANw(cXuKW}6>>24$PJbaHX?{ve4Ymz2Lo}*QZ@@*KD9ps zs~k>>=gqOA;I@!<@`N1hV_Fbv6?RT;PNLj5S7k!`+i+P+7^=#|3G&Z8<*^fCmpSQ1 zK>odmo(`)$Y&g=H!WM!9Kj@A1m;mHq+d?(`Yj1*pw}O~7NuDJ}26#7e;tHi=zLsbi zpc41j0;J$|16w2IrG-kn*b3F(0yL`?MT9kt*O<5Q2pWqmphc$LE8R zvy^#~SaKmoM4tz@Wd)iVBxsNUM=NUOnq^v!zqdhdCC&GO%m2c1=3T-RwtaOZ!_6d^ zu#5PkzZi@6t8j@leYuw7Vn!<`ja)cd55N+$&!CeUNqDHKs+3?mTCaYsq2w~ zs)fM?@UjgioEOgs-MB+65p08brpJeKWTnEik8HzeXQ2;iH* z*K?~RKduD@a}p%?38h`A&5n2yG~Wk}of0Dx4dS2XG<^*r8x0?Gez;0P*NR{J)CkL% z20f5YdD*OD}Xh)yxs-Nv)Q(^9pZU|tryu+sEt8B_Sr!Ya9}NDfT{ z^hPuIX`hy^8Jw;poKrctcNt&`GGKeim_>erPU`lbKR+?5kE17^0?!Ki-F-; zfZhWpPLZ>-mF@4%8|sOZD+$p;yCb6IGt%gZf%MWqTC_hN{udggGe-sdXrx}!8~|c` zu%>rXE9B5zq~Kw}z6>X^U786Ypm#((%IswddjoFR?#)x-#RwXT-Nv(oU=CIAXlDtpvn)MDoCJdPfM^*v z4PK2Klod!W57foWm?pr3M(lHjd`=3Df!8nx7MC<5g++&)Xko$2Kj6|96$|N-glq}w zECow8^E>flD`faD_TW|@8Xem-3vI*nLi0SPuuqlvxG^j-hymP>wMnf(=uXfn!%l<# z`XVlO*R)yTwV|D310nlRMJldroQh64u3ZJF*M=*E#|=XNhvVoe9pLJ*)OD-mRU&pL zIKyb^)O?&&5j^HCgWUaWYnFEn`ms=HDwJVeDvZtFQ177p%OK6mF}~(?8#kqr{^5!Z z!W%D7Id}9b7rsx}JUx9T+!628C&`EXfb8>8n0gX0RiBS(l@JY!{OIzKQ8fD6?_pVf@? 
zL)cg(ClI1843OH|%LA3^CIFhP`TBn#fM;$%sITXg+}OIXS@5FLSnbREMmwz-N&l<4=|vt6QEoR#kFzlKUFC6mqFp%bS?|H*R7S7KGsr&; zE7Sjad2uy>3sL7KD;XamcK+EfE5t(3-;ylv6XIocG@!cJo>xsCR?x8RyyP{L$IOb! zU@0N{PH~yceO>7QTxLJ~hUIE3=eBOxn}C=CcZVlmA_o$10Lt-EukBz$Hb+yhlo^X2 z?{Js<`P)_sJhJ2N{eX8BVu$R{ER)L(qB{-sVhy`P%L(Q~!-h2el6xj2zNnQ^oxrfA z=Y6PBZd-&rOj8sylei&!Vcmi~)aE)Ix zx_t-}P8tojbg}F-a`gr|F{l|>U3e0l^Myj}*}37m?Muz?U^~BtH2zx@%`2hjjpf2h z@Y%1yLLXSidFlfKR5pN<==yIokJH@aG<8WgA=(BD3TxVXzf}S5eYOtBLk8IxFe&vD zqRLmnnaOvW6&lzUY5HZR@Hrh3vSW|Y2uZj8d(iBIJyo2nqJK~plssh9kIKR_+D&eh z?|xE<)f7XEzxT6cghv=GfZqPaGFIcH@mHmtW|(69reZMIrE89V|6#21M!AGE-jACp zI91^$XW`PLw(7R)6- zdoX{1=Q*?X(~Lu1Mbb+e8J*OvP(PDLhFdw%F0Jj6UkqZ2MCS&v2Y{>XKz|=N^j<## zDflO^zDm9#iD?(takz7~esJ(1nghoh7#+Boc!nvcb2z__*d!!r|B>9ze=LNGtDzl) ze6!pF=|5UCI*COv-H`BOl;uW1vrLr5?Gyf+RfNd33ZK{?#zH>}!YfVIvC8tJ>Bk%o z8lU2t)oa(wZ6^SrqJg-0XdrSV*~}C)zP~mG(T{_@&}!?HA508yf&q=)_Kg#@LA~g*uJd|NR$+vSq`T_lP6QpRLCU`))B3QxM*RE zg;Y&BwqpnxHOj^XCl6nsk`SYfl?Q3uF;FUv#^p_Lhfh8tWUX-lbJK;I(P@t1#oPSxRt`M>L;dXj z2?~SZh=P1~7P20V^Hy)g{}l?knrms1!wjM?!5metAY|vn0nOYijn0Ucg``8(OJplQX^_q?whTy_2P9Ox&(sv2e)K1E`*_gRRzhN; zzrI9nx<(-!Kslx9GV@wM{3Y9)<@f9NX2k%*W60_oENg8Q4S~8nh*MB>&YW4s#e%CT zLynjYxVNv4msP}gW$fDBgi5)44j@`)XfRUlA&qK7@fA18JncAbt_nM8A+3S0J&52d zSWrJY4_sQnc$SkPwHg?TWJoDlcnWS(;cY-h-DnW|CEr$g-XJ%P~Rt!kRM4u?0!8~eixVF*Do+uVxs(D^mW%=C!Gtm1bD%7ViM$ln^nwBh?XyJ z0nKj%))*SY`sQ0z7z0Eg0>LK?;yxTlowC*ABCzct`ohblI|%V_-n>Gtze720PlwylD$Rs?=yjhA8ZR)=?=(ipYlN(7NGp;v z&Wt;e!luY?mN9o}LEQPn`NQOU6*9sBv;YPT!aR$6o&Cd<%ZqWiTKv+36$3Qa=v<^+ zCvt!qd;2d2#8aUBDicVfe&gVxS1uvg+m$9tHZi*rv{mrT)0ZgpcP6;um2#!Qp&q!6 zh?fpRybaE0f_WWIW4@erpUUIBnWth`6XLOums^%8E6@hh{xmsyxw71p5=rss#9~>s z!qO4;DT6dSFLqpTMS|99v6R4I9Yz(%|rxi<6HHqBQEKk6Eu=%&`o*{KN)@d`*15XKu8-Z8(UCkVhX@NY@6w4yiWC z?+ogcQy#IR()*mWE_sO%4en;AN$q2n5w1Frv7V5B7Mv-op3_l}BL!V}^+p*@5-o?= z!gL*dpFyX$1OKGt8Vp7xQ1`5fkc~l41IcIrIqM0eVAQ}C5iO0U;W!wbm=@mOKM59U z6v!)*h^O=^!Lk~6OPf>~44t^t%Wgsx%iTeIIQ!R$;hNWlY5W$_?3gKWoZA}>GOAH# ztIUhgymaE4n8j1KZq)J*w?2eJ88HJF*d%XZ=0qAE937*@qA3_2X$m2#L~Z*pqldck z<-bPzqdKK39y0tmFPenqSc2w9TwS^lsKHIQL=9 z%>NqXtc5%Lb2C+PDK6c=B#mVwZh%*>l?$>pA5LBPk1r&|quf5&EgLIjafe+&u1cci zgO$y4VxB@53k8grtp@#34U7zN#uno?mvg%9H^^X<%`tLHfj%VwM}60kW)F;yu-G_Cig;cL2={PPdUvL6#w$Gs2ZoDmd3 z5Rwz%6ysf^gV|BC!^B_^KzoVN#a1A^MI2e8bg&EdB&7LVbpF&Fx!hO|44N8@#T`5p zJ~K--m%p}F#+PX>M(%{I_+tiXnS%*`a=FLdMorWVg9xI^8>eq7luH#U+_F|yYOyPq zu9FVZs7$bAGde$XeU%kR$dk@$WznuMp+e$n6ykoIp36QWWN(5MJyM+xhw_fQ$ zR*ljq+<~?yq6s}}EPqnH9M_%oLruJby!Dfjd6g3sY} zd`}x7KgQ6R6nj)zy9xxir0^KNze5FdZsC68^ez<`>S<+AvB^COF*^B^Q{-zxywRpr z65g*I9*A}KN*v))BMzCjGHlcQ^pUx$Q4_!C+fHW5<3?i08OT=~Db6XufjDZzDYk zHCYOl*P+NV@OJiJ&5jsi{_J|W{Uy!rhraP5qutJS+5WPoJKbfnlQilaEa_#q`{Q2& zTiJvOySDI5?IFdfbRu##l{wM~lK{kvf(Ky$i^5nT$#&ydLXmo-7Fy@o1_{ z97~$*0Hs@+r5nWSP{DNvAqTYHRJT3FX{eIjCO{9%SQMQ2o@U2^JdaJs8btTwj(RE~ zJ5+X|Y$UCJ0sVsEeC4Y5jXRhmkCGa4oZLC%E_!X3rHLNxm^W8sdE8p^ptBgb*~Cf(B`y ziyQp&Ke19v+ITj9&ZmHEkC3MhRx@IkrqHz*3^R}t#S~UNtDYw$@q}!Hu5Hq9EZj+P zzFPH}vVt%>bSJKRsvQ51Ej%X%(KQ^dEOxJXZxZg&##jHt@+Ym zbTBneY33?TqHmB{0WHLBNb~0x%WVePZQ&CCyr6QO&Q{(SJVS_A=JKna-UVdbmnwi; zV}9K484L_$a2fK!SC%~*9tgta@2>&z`fz(7M}Dg;Y?GPse1qsL973)o;B1JalULRpqgzbufvg|7MTinQk99fLob@FB^@C z>pGsVqMqx1*IcTKB(MBox%yjR;!Q6XzcGcqu?7aGf2uI_j<`5Ife=-KQYbzImR4v2 zJjbM8zP~iD9|y3%kVXSy`1IFz%hbOC@p&znDfuMXwD8M}BZAiq@(>jh*n$68zLHYv zRy7!e!d9R;N}4@xba9}nM;;hvKm?w z+mrO-AqKnfw_=De^(>zYc?1RM3(w1Yb2hKU9l@U zhyfSA(|}xEHh`c3yZUR{PDXVgn zyv7vv+qjruKEl&`5M2g$v$G}|7uN4MKrbLfb{9^4n)~3|RQ521gGmxD&9)*7*Mtiv zArW6WM+?N+!Um(EifT9CGhN+ST{nNhv<1>V83}xNG8;p7P0_-FRe1;_?|FEdibT8( 
z6zy^HuQURlNd{@;#l*nD@DGDn8#u{PGUj5HbUPv?-soU{vi$oJD*@5n?j(&8aK1`^ zWs@&e2p@B_p@_cBxHtjAMCBvoi~6u@ujYV+V0cMP#IUt&gA`q^;xyew@A-oewGM3V zk`Y%~){vi4##9`Pm8nd$e*!D1t^pZ0V{Zytja3*s6Dh1CN?$Od7&CH>K;;{jO2ggh^w?M;a_FqhJansm2P_HOm3lDhtXncqf@chysJ9 z?NV+m%$|(EebIGb@#!H2D2|~QUvI@W^)f1Mxj`Hf(IZpPAl4(?w0t#3OQz}CucSj0 z-`L4=%UoqR$?5V0X;u{Xzfj7}@*o-zq3^_qdQfbu|%Xq1}no#`?RpFuN*&=blEU@IURY zkUd5_Z8>t_Jt`T9hbIig{ej*9Q`lgQNifuum4wV#mM*6*GFE3}0Oo}s6QasE^l|$# z>5OpnViV}=q{7QWLgWo}Cdq~+p!M&bDkm=0oLEe7jf(kX0#oo*oWbyZ@e704`C)${ z7u{!E#NUMC;Pr&8ldn;>812M%OOGek>!S-nhtQf7?#Xar^w%;?!8d^KlS+s8EZ34X z*0OxMLLskRVT}8gK};h@7kPJ<4B9p(|CkH>O zl2{dBhc3q$#C081*W{DOl*=7dO}R{YTp8UCa~j4T}I?jz(s)Cgnnr<8Vk+vG#i{7X`CBM>c*hXb;m zP;7e(w?KQC0s;SE5=V^-BUFI5rd_6Q zGB#Fo1xmwtsn-#2VsL!7Cqd~z9IUodly=~pX+bJz)Gg3ogIyI)xG-VKVkYvfddhhI zRca7VXQTX)rd;l8aOvNW4ydi>!08AC0!%@6?4CJBFWjsU9TSdY<RliDR=z^) z>^R;Oc@Wpm(Z%wcL8mR&O)QY10#A4=4Um=;D#YhVR~pd<@z#u-`%RHOsB;T#oTe5V zqzm*xIike!Cdcy@<5@zs5`vqAxANn8jw$SvQBm@x2_vK-^ll`%)Pxb`Cv-v}mz3>I z!MQds;C7cI1zm0B2D!KbB>%3m9&A$O;3}2HbqYdRQd4W9rKoxv|Dp@3HTRkoGLIxb zFT^DoNO9X*W#1Z2fuN)D4J$1Lmke%1EzRp+VG5p!d6)`Xch)MT#yz6AFDAs2P(Kue zt#ubwI$Egw+w^AmL9+?NU)Utw^SJgGRM*&`;kz%MHSLB$rOB4e~e}o<0uj zQDH9A5nFk*K|}#&N4=<5i-kWCxvmd19*QL&hgXbo%b0=(`7^3`h_T8b2O{hoV?1u} zAbmYF6XL~&`toF1zZS*MTotUw3?PMzXoL*N!%8RQz9^N6$l_FFNjYggt92k)7$@C? z{4+u^Te-sz8&m;&xlVenR1xCOShG$Bjf3yfg_r&3j5Bkk9Qd@A(u##_$YtQC|It$T z*WjLN_;U)eLv?Y-oJYuZkA!e29Np7QLGQ(u7fd6Dlcisn=e|G5Noom?{J4`-2mG>uk1_Ny&|Eq-taR)-DFO445Gv0R$ z!hsO%m}T1sRueP}wE3Si$U74p@QwaZxiE+41@A>3jHu`A^0Yy&8*!mpywhq`I}ohx zke>+gKaPX**w&Agb}G8%GoyzPSSU!E|4}cN+VL{?6F~IZ>8oYjr-xj8XnrPg3Dfmv=@U?{Lh#r@=TMGA8P`s(nldqV9hMzk}f_qFH z7PbhMfFro*?~#HAgu4>n1^Yb6*lnd!Y7jU6@Zcm(g#3EZK9B#P84*B^qv=CG8jMFk znKC))CqTSNXyyxD225rO|0y_7z}|2sA)6Wpcx#MB!*c{k&iPp-{p~n!f$0Y!s(;TE z8TkuHl!{9>teN;N_b~;f;;fhfHarL~{Zufj`c=6OouyYuod*+&s0`Ba8@Qo{oG?*F z{jQ=IjCcb$@ehSV{p~#1$RWgo(Wn-9?@beMOQUEkKl~pE@O7eM`u4}(ToVr zhOeErqQ4v0-xMu(54-Sx)&Wx{$;5qt@feKx!N_v@=a|A2ydTUGsQr*dh_;x2o%}`; zPjrjY2LenLOVhRyo{emlC4L?d|d+Nfa53NS0Sj zii5aw)AT{y5NJY8>aX5eF&6 zRTvx=$j*Zm;%FOpL~`*6K(=Ew?mmrn24NOwbg(y7UM0=);mNfcZj265h`0?L%l{xm zO*xG)I&8K4WdgWp=RuHrn99T7JA3@-!xeJA9!Qguk5Guks+*lB7Zaj&mWIi-M}pLX zb<-=?GX)jFttSJ&6c`Ld*Yi+u$RL!?87a&?O0(mN8Akvd$%>EG6bOS@&?a}~k^2t#%6Ak-Jc{1kwT znvtVTzOjM_#;ZLD_lyXi7w17=B10iA8v|iHH<++ZhMjL2(cSUyeQR*2tCE|$!!A&+ zn-#}PoHxe->cl@q_L-)p@*hxvoo2hgw@HC4r+GKXV7i6O-E z{AKX{dXciAB|tN)e-VP^mdDEmW8r)E_hrkc9;+onMqjLA!AyidCe1ovaE7;_u1f&X z3@er`UoYohs&al<0FJs$>6I&1$wH%@c266PTV&Zyn%@~T6>!3S zKN2>Ivs zV_yqbZ{1SD6qGXaD(N*Q29ZEJjB#S*SElG_H(s_Z0AM}h>ZIys(0HJR3YHFoP->!` z^rwUatbU72#9M`yuV@x59y`UNb4sKGMG6qFMv>Y+QE z9}M!U7A`g~3{x&9b8Z3(2vIS7QvBiYtB@&pkkh|adQ2Fq2SZQ|zhjZ+#U`KH_Qx;Q zye{GmB}LG~SR7OM=fIm;euAWU5N?(W^5tcNPERo-49r=g*)eFr7XwXkw=6}<|N7i{ z>J}+;(Um;w2 z@CIkI2Rqx?D;o{EDVtfz&pj?LBiiNa2Q)8s12oUQXQe_MM_~6VpAoVnwRB3-gUSjy zI8K+x*C?d=C+;l1GZ=tBSFEZ14}lx%9-fxRIcv43A%u1*Ck?Ke&inJDWZF9AQgbY2 zJ!rhUlM#WC$@1a`h0u@j_vOli8v#S-*Kh{eh#e%q^a3V^3XG@KbAOqf@UZ1VxGe0) z9w6k4RQP0_v2e!bKp`#1pqo;S3wGK65tST=oxlm3v{;;K;og9&#pjrUx^e#j?eMrq zRX6~LG^s`hBZ}nH$5agKS0}z*4tw07FN#Lf=RaYEG&RzBRi42>Tpf)Aj($?Pu#l$k zT0*=UhA-Hzb2U8kDa)Q2^5%FiAzsZ*iIO$O;>vVOj6CB(c(2B-)2_W4i!tBJ)#1p< zL2sHwL~4=Hk>y+79i^PvtL4aOka&EkBZjt+iD7h3au=Z;c*HvaB%??;X$m!;74Jb zL7W6odD%~hYU1z*eI6mbWpsj-hWILoiPK_`YSHL0xryN7B?1UwUP&6&4w+u&O20v< zA32eE@-89UKQseVbL3N|zykzMJLprk018z+*>lxG^OPhl4Hg(&!ryC?e8 zUV$hl8OG~`x~jWKrlf0Tbt)r1gHB8yck-_ogj)dzm;8G?;rdu^-G?$XFQRVnbaqyz zJs2UtRUPx;Y!w*li=aN*tQ$3=gn@hEBPnSOst-g
*j%P?ROs0P7f3L6Wi3YKz=jr;(gsfjSOol>tao?B#)NTV% zG+j`x*`Z^F-YfSTbQ7E0ESQp2D3=ag;bv)0r3vGL0}7x27=*Vm9(i9`1#alS8#h&Y z2>5{|WZ`;VHdHHwr#E^@$s)w-`lFg;o3U^O>2__E>{`p1<%40q{8|TyFMq*Yxv$6mbagYTlAFUVgF^j6c|zaS=7UL1my^j{_WblM@>?#lqDUcgSK(;U#@IR91TA zai*{@K+ideG~NTzxw+x?AdG5h@bs<6#ZnSx#!oh@u)Dc|(Lr|;@*E$L(-JrHIH#~m zd@U+Ndn2etb`auO!EA(AY*m_0`MXK$aJ6EIT)a&)1(Kuu5#{oVL2P-c5Pf~Sa^21r z+EL8yv|<_x{cX!_1HFfZj%bj`un8}xP(8>0cr&Pd=&o$jh9#c?LuJB-f`LjV7!17k-7f&mfW`p*baTi8iJ=gyzO)MqsXodZgpKo8UhmcrefE^A(~6j((wIn`4@l$>pIeRa0jYBS^O~1e)lt;-gsf7+fZLkLzPOnw_|jfa zx!m!B3d7o;Dz%Ik4Gve*GsaFr<_*Q9a}hOc(My`wg^Ar>uc$C4U1xivoc5~1Kt&F9 zcV`pwmsQp(TS%iEw>ZT~tYGhtF=1#!=&B;-H5K;YfN%TjR!$2}={lu^kafUK-JD)p zv^OvXy<<^?ocM;yBCZ2^wM-%8?**Q@8;uSiiuNBS#`y)dXMcGUV*bcAZJJCc#v@?4 zXO>*|FolCga|PG98wmL>uwofEmN(K&_W{SerII+tg5pGOd|RP61|8S1EPV$MzUt7; zQ8GRjHY}(Nej>@@{0+uh4X4;%BK*( zbG^h7d6YDSv33tel?T%*B=s{ZZM2i7AkP?dJ3}#LFaBJ)P_5HbLMkDO3dc_NBV(aE z@hEce3o8es_E53BWYF2($z|d@gvf{mt9)!MxV52Yo7?^Y7COM(IWj;J9}0)e!F(K9 z_^`f}-);Su;G*tMEc#tJmHqQ~SW0{Y4v*X(T!>)`f9_@5ICmVjQw8vQWjF%tv-TS| zfvDmDJieUeDumUU8`}%V?JAFV!hIS9B9kwef{MGTjq*r`U}^%PyCm%g0Ts5@He2@NTG0E93akSivRn-jnH^vHW4-sP!C`Goxvg8 zzQ%EiK`ckPXD=YcTX^-U@*HVw<=B{T+!Q(#$6h$l0I#gygT<$|)dm~jSg~5(u@nen zmjUxQ8TvsV6dEZbCG6i?vRhLw9Y%*ir?cQk6@$AedYoEAh>~@M8NiZ$iwVF*0C)PY z`0{CuTy1{ub5rZ7yPWWom4twS&iUhPP;O&!r@s zi&m$~e!EoIjT@7_|Et2(>?0z?v4rg1Z549z|CFZf%il<|sj-o7K)gF+x$&uT$}h&M zL8z==d4!N3JzRE>#FKSK;SI6#e>GXedB+UVPKc+ifG^k+egnPxU`oh*@l7I#2OBzg^!>>nmOH-M?HH7CgE&Rt@9(iaCTn4O zXooGxlT1Oa7l+GV_5{fXx5jgQ+GUVetxLAaw|glU@%uc451qa@Al|@DZID?;JEM(q zIcXMxZ;7*j>UJEtk0~3(03&h#7kpM;D;09_$=^#WN4@;!4 zNn_o_$vYAk7(&Jf3n!{PtS+70?egQkmc5}X(A_R~@24A-_~A zOd4T<9V_i7fD1Y>?39D|x5D8!9r~n?4B}{(p2xQyV9$bGT~CGN5&EAJ0f62iM&sVQ zMJ_l9B-<;!SROIjNi2~J(z=?%^f^q#E{g#fcR_H}l5;Rp&=?3}!o$#m57{l?H8#@4 z#$ilB!A?iHoJktOfwC65d^S?kf#pS zV%?5jIsY&%)~Uc1#ZkH4hlqcKjayl3VhRh7;e|Rm@KlA6=bV~Hh|(NbTh|*4t2Hcv z1vV)KT_aXma zqIMYQ8R-?Maxzj-{xZaF{DCyMPAKXqSCV02nu6Um%pTZbmNJFkGkKkiY{Qh zv3yy>6wEWz7)ULa05uJkhm69r3?fJdw=Sn#tXwyzpJ%!IUjm2+Z-k2vhcsIh2kV%R zi!KG!2V+KJxSY((RRC*2MlZf?j=`9a&D{qEX&;h;h?{ej8;C1&Yg%NQL0;NIh|^05 z`Q5t<5O)?d9)zDfO#e7YZees+huO$J8UNqP#?i2cWU< zl{2qVULXppr&bxeRw2$jy}S~cGanGs;tfmWR+78Nf^`#LRSC*qlplhzN!6(@W@{q=g~21Xj>b)POS zm-k3hvIrhvy4b-#_fwMLxxSHWxBYH_3~B_mX>brXUl%H*g&)Su?+E$5lFMb?O`uVo z@RbrnlCLz-C08x7962#O>nS9J)WSJ({bD5-ua{RzA_Fv>60yWG;BXm+?Wf(WQrBJ~ zQ%SPE@g4La1_u>oOH~%Vox?_#K{%V?QhX;NtAS11x4tC&okNfEn=gqm6`gwXEm}?> zs-0%3QH01CEG&>Vqj$n1?FYB280>F&()1A_#GKqj`Sdo>>}1d|r*GrxW5(^80&@~r zpztpLO?PN^4&+_!a>AVogWEDVE1qQ#gEY+u^9a#Uk1mr25ID+L%SKOt= zxCvu!5UJLKqrE+z~D^l2#}eUOCv+`c?v>CT>H% z{D*Yt$CW^1zPs)>*iy@z4l4|fk5w>iLGo_6ohOa`tE^Z$V~>^n!U552*W4%xB=Ll0 zSaRRCTBR@r^J@4fYm~KO`6k(Gt-NX4T*>63tzA{3KQTuQz zQ;-4EFx*~Av_X7x`m=~JxFb6ffAY3L%z5MRVEjKqwm>J`Aw8&cu(U-!C(W)D(*eK7 zLBuK9xxrGfv`s{$=o*=}(c}veD}*%afWs)PqU3&qJjlXO_$;Bm!=@FFY?4DZ0mE2> zW8eSpv+8!HqZC}0h4NCe4PruYa2_#dGq|X01WZRuNkfuWlDO3n_K-pxb#ZT>M##5< z9rE|a;(h}zOlCi984-O2hr)dZ;inSww>)hySdl_+DAON-Fh1lBD`XK#Gzj_?x+O0p z+-xcAdm)oXd7deFtW%LE2W`>k1X^H+Qz?rLx;Zdsf&Wi~=+#aP{JA}<8QrLKIr=fp z$W20Pv3y2|=i+PH0^qc*DnRQ@?725QsSsNT4_b~YgX8^(v4^9xNN{ztGiQygBx1WH zX2_eQ*@6XyJe1i*=y$W3Gv%siWz!EXM$qqk7~4f};E!z6f&wt0zGk~gwkGgO!9xav zMV-6{^*kZ!k24pXq41_mdyLA%LXyk$39%}GyL77jmGhW_S_ToL_D$0G0vk8W+&ImL zlTn_`eQ6Nd4K5MC@wsX3eC+-SCLFWPEzXk@4MOY0(Bo?pm5aqB7ak8MDP)XG?C<_T zh-c1OD#s^-Wb2O<<5-zX>ANgNw;lYoPffAhb&*a57CeOP1g-7z)^1j25VPWpRF&i{ z0vrH5mB@x>0;lbN5~8Nzk4RDm zNYvE)c6G$cAX7p=kwL1P-}hnYMvWYwsq)@PSs=Qfj@u?P1@dTI)^7hSh4h4#*e3T9 z^0lJt01V+$2Wqd`s$>0J`YhNLb_Zj-27D>OQTAN#UAEhX4puQb~>^y8sc?RIim 
zAu5?LYSkmGAxswp`^wE*uNtMM869{y-QEzuaF$DN4!hrDrX*@bm*+7kd zCZYco66heW+JwVqz}DX9V%Q5EaLG9(T9g+%B43b3QK1_%6+@f_ zC#_vl4Dwb)RGLD1&Hb_`Q~W8jKs?NVC)feAr#bZN+Lr_IkA3q(+tG z_sNQv+X*qUo;PJO{NBYAK`>a}FEyn3`YCa+mE_>`x@8Td#PA36V6&Fwb`Q~N^NALP z&S<1JmMuRK@|jH)64`33lss-7zanJGaPQn73ZXr>O@#xE+a;bf+cmc>Se_-H5>hxc z!@{?-q_mp=?(7V}z=<%_hi1r!q#>;4%v>9_bXZErfhxDiuXma}408V@&1ZEFd4nl( zRF^_0rJN4zn|xRb`}$Eir$@Q8enE7LIfQ%`_G$r;rkUa-xS zs|ZogKy-^-Yb1OE0`Z6v)@$OfoJG`(s+a(NwUo6n1VSInig&>ai|5yqf2gjlHXA-l)qGLH4-8_ z8UZ^FZXavjRRI*p*`duKE}!w9(uVhxODlNziH$ZGfQuGb+^>EgTsAKbPvS|lbHShk zgW6lh^19ok`U4dU;I`HkAA&{&umYqF9qtJ{$P{EmkH8gB$ufwOOZNKNA1N2dYgir2 zGlYKco;gFF{1_ND2z1p+tI;kZC;a6Tq~kFdJ+Sj|OY`JwUMt7;yLhid;h;d;+5R;Bxxs& ze8I@70Pg7~#>qFvrHl4`e*{<82eV|HfNzhE15@Vx{-dQh^)Yhs&!ExK&}^_s$Hj_} zTO}q!K3J7X(~wR=JT+LJEDLuj3+fgci`N^3B@N~xx&6P&bqc$AsiBUL)ve5uy2h|@VllB7_9R-U%ES%iT zOeeNaHZc)dT(}4r$dzXehMaXZ%axPBWyif=vPrTyEfCH7!tdGHE$ z&LPG^^a}(NJNHmPzaBHEh#Urtdcd~?S9s6_tYiwx!s-wnf+W+Ro0HFYGyOhys24v# zx#h1AR>i>F2+1VUtw5Okhk$iTfNzW>IY_M7ks)U=N5TvuzyLzw$^J)}a9+9BIw-RV zQ7atJ<1#rFcq=W1?Ug%B#mV1}gbZ4D6WU_%D5Zl@Ju>lVE2^T#t&EaW2q6~etd^^c z4#!zQo{1sgUJ%Qp*~dT(pY7VEGC-1dfczD&*$_UQcG*WQAy6YJn38IkRTfj1Sj_oZlA8$)}i1 zWE7VJeKwvCPv)gK5YVY%H5ty{nMGJY6;lm&`aSegD~Ic3Mj)M-1Ac< zamZ;Z44)A}IP4EM2#0tK>%N+hPsagzC|rQNbh^rOomjnHW}Iyh9@C%P6+KS z)6Y?EFnK#KHs%tBI^vWmQf@pqE1Mn#dI|AioR%Cp=v>Q|A97Uk7@=mHCU2joa+nk4 z@bgs;njY?n$_V*AL*B|566uh?nLuwRjq3NkKp~Z{Fa=&f$d?OQaZ{Z-U;^Vqi0WP= zv6CSXdhp~al271wtQpf}|1e-yI4Mf5As7 z(&*ERu9QxakaDUE9q4+8rYoeeVnVGPJOdD=qMK=C3%Q3WEEO(WFe+~&L>0W~7%$S&oZ%LT;xraYY8zLOx_WhJwsZESnuuzx#vO^9PiH$7W7EAK~#qmY5^f6SFGD8 z&lw9V7GB@J`y$Qgv^DUe-`5u#jA?M9x+UxqD;CbSv9X;_$Zn1;enB)=!yG5BH5oT2 zjc#t{8^o%Y>zTQiAq(HEvrckIqrEn5mR*({Xdafum+M1dK)_D_n89FGXRxYFiVebo zjB3eAb0EwQ6xt?m&A62*{^y)NQ#y#zv@R}O#C1vbTtGbZ%7s!yk{=0IzY#gJ--oHV zf;cE&7^Hp%C!)U_g!LqB9b}(*mNzY)%idXp>~Yb9c0nqQDSXl1D%t;v-4CM6&Pi7q zt1(NCzDikv(HPq7?zq|{N0T|+Pr1g=m`TS0GYC;;IQC)>kj8tg-6GHXQt9}FaDzHu zGhx#9+EOIWAZ;(3+T^nZ%7yJU4K^;g&V)xxr~>%FU@&Poklrm7*Mp1K!gsaXS&!YQ z@-)WJNR*uhc~=_#SsNBAH(0ryw>X};$qHvchh#Z>k#Q^9xMWyn5Y9ws9Q&q0uczHX zBh@Z}Fx#)IR)&mTwrQ0dyA9Q-GLjGpRQ1Wbr1=5xlgC08gMh~rbWT`_;?NCy zX8wBr%S3*1I#+CxZ*NiP#)nj*$+rTc>_s=qdABPEyF3RGuGjxvIoHmSeebY58Fg-3 zll$eY^}=Mve)vxc<|JqYYksyfod3a_|D<{k5EZ@IyM_ zE_+aU$nEmy_WI^J17`P)D>WPAI>2_Tkq|XQR6ojHni(ACE~J{zxrQ{W0Y5MJ;W}#-LjAyU_E=}! z5}f~Z$wfZwjrSt+WI_bEs0JzkpRNqLIeop#JAL&WdoD5Pwp1ZNnXD&-yn1ot8Ya$C zyzVkOsX>;V4?>3BV_Syozd>o<{`hR8(oWQfT)hc&sCx|3N~OghRSt+Hv39d^os?`E zn;!MBLT_{%?+WG`gn1wj`HBppqaqHtOnd~ws3C@7*f{W-$5~9_vk);J>iIhfQN1Z} zEgV6b(?&7Q%6fTJ1#cdiCT|yLD+y5`%phRiAJV}*swp@~g4+?kawSvv%FWy5z{f!I zbJXdnP=b}T1Y}b6aMB!DH$X$e}9Jdxw72^~$#RPEfOx4c0F)B|hg^n00 zG3ch}x>>im4gN51rYG)gSO^Vl!SIg3jn{m#+EgN*Gsmjeu<~un^ zK~r5dM<(QIF}M=N8ywdV^0Q$7yW3QGVjdDvizQ3v%1NYI7|u=x8yQbD!nor@v65gk zBDcw^e9h%Wwab^J@zOB#cWPVZ%>rdP8L9FaX%uMfgkJDUp$bDCPdA1)7AZt`cB;!{ zS+PR;FszG`QiCDS@Tu~Y!657y;nvSBG2yywUXxi`sxa8l%q!^k8+21Ey!u9&RSquR z9S@~-MtlP`uqRg{@wnYLPD8hE{_n==Y2@O)aKwgHi!>R8{UoAdNOl#tyQlcs(nm~b z#8rU%rE97I`R3V`Ou^SrtI-r3jMKdnrsGAmn#CJ!VHB?S>j3d3?!&_1qYgf;*j=gvjM~MajR7h5O@-Uh!HRSg0o-r_mEO zv}<0R&@-ylE1iI$&1M+fX>}Qi_095#SbK`hK z;Q8E?F8Upw)U73Ba-=y#3L$D7%<7j9Nu!ET%40fa8y(T3DcJfz^G>fdK4K{*LU5}L()!YLFXUlQUui{MG+lIK99qv7YZTv`e6 zr7)yXL-MvkIM?Sl{fLka9`bm~9&yq0Dvw_xf4Mgq#QK6$|AH5ki^Ug}i?14lrG-~f zB;WZE-W_o+c-V^&HtVHzvVe#!k=G~_U$SEGgT`^t27|#$SSMyk)yv?bQm77QXP%kt z{ff%tVl!v(hYez90TV2F@ilNmKddR! 
z`gCc23+bGEI+}4=2l@q`?DsMe!h!T*86wRd61qae>y^gp$E8lm)Nh zZcd&|{0AVafo_Rk0FPoLEX6BILS$_ZRz*>k5;C7F<=u=-rl6{{|A9$I4xvBw%$S1v zl3FO;&cb#KcSCnD1-)U)P8eWa_fG(R@2pYDB+XXgm6IO%(TAALp}F~k5Y4w}j_m!V zX7VcfTapk&$>0G0qX3)v04&tiJX^uSs`;9M1qa3dd&W%dt4}*9mH@QZpyUK;J zx7U;_4+k`3U^}$iLC{)pxV>TjJ?tyx-j50W1k&JQ{=}I$`^=D^(o%*M2~Cf3zlIDR zjeF+YG&UG?s-fEMmVXiQw^BJM&;ARvt>oHr5buiz3BNKC?R$T??E8(CR2WfS-TMv2z1vmU~k0_CXY=$G-*Qy1HGN>lBNF!7w;N|^Y~>yswC~j z5ufyp|0oOIZ{>Z`SeH=wfgSE{Q{2LzG)pimihbb?LjG!I<4`G9Y0rPW%{AH!p%I`im0T)DyrR2ZJ%ilnInS5?^_{LRo(f>3(49d|} zr1^2P+h`O~ZxEf68i8fMfXhaQ{Yz^ruhmsD1!X#kRdT_vmN#L{YZ{dWzxhSsGI6ei z{jQJ!c;Lw4%s&886vy_MI;V9&o@5Gtc;OO9lBHHQ$l-r#KHeF@cR$adn-n_cpT5VG zP<{EvHFda&3CItcRLt|t!4^qi3SX!NtG#4|;W7dETltdkCd>+p${$m2T_d*r-e+{Bf(-vso ziI8W%oIG0sq*;qZ9OEH2PsT13<{(Ohm=p%o@3jG)nX8pc2fJ7(!j1gvqUcRUWL@L-%S#AtJi7 zYlIW-Zop|W=?Gw!S{o~ekw&e(M082$EMVKj6xO;oi)*=04RQ;JkMZv#!R7WbWbV&F z2&ZPDSV9;?r1B^&4g0U)XaOSF$=*k6X~D8uj(Hy#WL%`tE;;ZR<-+X}H|yv5FbyFZ z3*=Ek_Tek!Ns|2Lp;KJUceI24(!{xT#f6L4k5%#ITjcQLwA|2HXQJFg$UYT{0>Po^ zWu~C$E3QO{GnK`%FM!)!U-?<C^<~(ZvSc z?$EJt!-?S9HGhmZ?P=)HxT?#U;d!#~3B2xpNjpgk!D%OzL>~~MviOZ-qsl}3ttYDh zRwFJvE&OT__wpQsP1$~mavfMx!`f%&sRol{;QCP#3_>*)tQwWC2vIn?FfO#v!`qRk znS4wiT~r-qkRCGd(`_~p=Pe)$tg*A#_rdp3{U&mv^6z-B$%Sn$CL_f1md!^i?|$9I^dn^-8G&w|^3e50GD z+9ROx!dB%CgCWb#4mp0Basy4JaFs8~26=h2rA{VJx4iIqfvIB1Abv)8S(!P*xG@QV zKG>iV^6O_MO0!Q#56ibcU7jM}8NDfDgSeoO-J~q9FG+R~@_~t!^1aW({X;nPsk&Gt1KlZ{E?+c=O^vs$SpID=G@n+> zeV1rCVbi9|Q>0PZ{|g=WopIgVW_jpRaCKe37y-2}*M}e|9_=^!2r)?E z;`2n@D=V*7j#D-etWK0yd^p+)rMVn=jdH0ghjwbcLANf6y`}0}D>XWikxdVquaF_- zuok+P5DnlIx5zruH2A?`1y)sw@ZTT0q&t0q%D6pQ5=R=Zh$R3|{r9;}Ar*ZX60b4{ z*SF44v|Q~&*cjk!et-}SfZuzdhyEet`0F)04KC6!+uQ(%@47q`e0;wfjfqpAhEkbt z5R(uO2+9^J*UgH-1S<#J1Q=S}!}&d1=n0LCo%qA{D*dxNV4B z87trUu(i^~mnmAQ<*Zw=Nj@Td%A{kdBcLB;TTeLsSXvn^C$3WY0JbMSy&-(LoWXQ9 zXMFznU;+)mE@TR71?No|N#$0{=#J$`hq2&3pBI3;dqb(9GYm|#_;3-@M+8^n)@T84E7u@B>n`~)GM zuoEE@^8Jk4VmJ-WT5aXj7f^5g03jp;@gwrS(QX!8gOKJ5GPJB>9kA&B!t#1;xRjJE zEo(IIrVSDDA!&AtVYu(gkbTxFgxjz{%67TUAeNzA609TS57Ixp<2o(i%iu;CP8^P2 zrzPM=wI1a!GwAigU&ElRH;7opRPcUg5Gy(kcRvu~rNW@-{>f;ss9NG9kj`JS+NGdX zE0}_62U9VqM#?iLU}2DAn&ip#R@C&_@*-&n!w8A~oWC##Z(1DI4tfw=-C@j-Z~~T# zg)dGH2YgtY%g^(^L9Fv=|M{muPVFPx<(!S07y2HAt(O}OI{nGad*nmlqKB+pA=f=@ zf@$fzQ2LTV=tAM8Qcih9x#I}+hkLjS4F(1p>ABkZ-b%yICnKT+?WyFTHvWtK} zsO?2qHF{}nGW!Xnu+Y*d4vG&EvLj-|-Mm$4FF#fuA`LMv!8;2i;YmQ0!?Cs=u0{)G zcBEyj?~-XzDhve?767t~5EZ*`v;1Zps111A?(}FCL|f5g`?=e!jK&OENg59fq&5XI z+GQP~J`leb4r&{G0mNB=-p8@LPG+XmZP&c`syu~#!Js#WV}GdeW0eahy1~3|dD(}p zRa~b3`*5*1s zFvVaXGK2f6G9PA_VPP$wW~nfpDZml&YdHo};d5wEj>`q)bJF12u8Yd1GnsX!1gIoIxy5D>7>z;=<)pMjRu|3 zR&S^b-i4Km8zEG~?pg~`ZfGM~Di4`FO6H}GE`v@| z7rhk)3Gqf=PbC9@;y^T%g35ULNjxC`+T^{Y0m2{s>{;>Xv~pURW=AWAl-LQfkSVAG zHrY64!KB4ccoP%Z?PFS~WyvRGJ9NUUMEUL#>{i32Oko9}Tx=PVEe3J7&neEA41>;S zg_G7P|FP^?y+C_HUmcSww7}5GMA|m9y8jxt(U>tmige(<4 zV+t1xuvkwXc2BKXZKK;TJ)IE^VNG~-9))~Z| zJ%rVJoDk2SvlbVUm4jYF$FF~BP|nqJ<-A7a(9{H8wqlxmn8Z~@db0{HSti9K`Q30+ z4=aIAgMp?R8jZd|$gbCh5EZQ|313mXMKYf-^sC$kwH~|GdnSNIxr39WWo;19mB3V~ zB;aE(DVRnE0F!V zjn$USulzTIfsQWPLdExJQCz8D+`(wPZ?DP+iV@*&Q1<9k2%0~fwS;&y9*X4(udIzS z1r-b5yjHFlP%iomH2J(4c(0|fo(bHP$m>jDd!^UPgM*;){-F)s{zCxxXmeM{`r+N> znETF7OF{S|r)`+!3@`=dY{JE!=ZqUN2&kYE;>-~(K}TK6U&RzO5q=|lubHC?v6SJO z>>5IpHa=P|^Noc%^SJM_F_pvt2rdH19D@PqF*}CjsHc<*ohHYn%L&mwXaVe>py!gU zOhL&`X0wbN9jq*tKP)3uoIJ(7{AtbT;K-x7g6E2}pV5q);K}wb(*AUd7;N>=0`q&P zr_w>t0nb@xX#U|=6V{50n8K$HCrTS>3Zv6u>Dpy1x3@^PJ+CbIm818_N`p>rYapdw zh7HoV07rK_d|^6EDV2X4r1=+gY@fyly`imwxQcNXF z7NIs746iX@$z?C80P<3yAO5oDbrO=Lku=@~-U^)5N_m?Qd7ZRv2w%y0Z`v!G;=*`3 
zt(7j?K4%J>Vq1<(d{w#X&Yzlr$C!cw1M#q!Xq0k;{C=Y`@)IHY3A3ns0 z1monr-#`{rd;>zN9rUIa?!=XIG#)3!4+qu6&Cj=#$Eceyvx|Ql5N(2L@#F|yKa(k_ z=(>o9!x4d}@f|>v>qgc~>$|@2C=PWf`~p+J4aY5${~8^J`!@65S4ldQO^5n6Qf{{t zw$)IYHx?&tOhMkTsWZhPjfY}A8$?j_6F*Rv*VZ8$NQ1?u!Ln@c4;79Ng$(GgGzjY< z`UHQF5U=AV{PM9D1sw>N1^0ZSd8yCQJ-~9N;L%=B zHGi%?gk?}LH&Uuh9$s0Xv*RJ!`cJh4-3W4G*uoT)fVM(Y$gIy)0F9t61OL<}&|MVh zE|LHt+Qf;*g`Urq1yx)C2P^M=p^)}dh^iI;4?z6LZ;RNxWb?f%r2L=E=f=vg(Jn%$Jw-Ys8Ecxm^S)EbV0OBEL>e!Sa-5Pl`O0Tymdj7ZLIe{G3bOeJ zmE?`?=30515F-=THqLMwCPNNpAD9fS+A(~8M#z3u*N#z!JC6B3S|R92_$U>G{3z)- zFY;MwZ9HMwVbFyY5&Y?%`yVYTq^PB%Cu#Xfi-LxfBluB21M(ykUm>Le^Fd%+8ion1 zR``7{BNwmf4Cm3zWr+_lbKuONahG!4vUHA6yZ&p!?c3?namD|Pn-J+WM@u9j>kRef z-+nR4Rya{X=*?dhT5QpEgrSzNcJs6NxjRh&BL(N`9}FS}Hf#0gZ<^hWj11&v$nC#t zcGMq%s}VFznk@xt0XvL8{twLpy*gjB!657~7_l*akE#0Bt(D{bo$|av#Cqlc`UW9p zDV%m3xu*(Zt<4|wT7w?$sU*?d=gy_9ZxDHFAYDqBYq!o=1M$2;sVba07 zF=y`sVYXIQl3Y$2AAuu3PF5#OP*(VTE97a?P|`1k6TVGy{X`Xn-!QJqRvLusehx+_ z{zbVM#htM{d6E!Mh*-W_%1NWe;p_yqk33m^&lkq(2)Cm4+7|-owWtE`YV{{^;V^TO z6^IKvokQ|3LKcFEfv_Tj6N01nM+zRXY|}Ei+GsbWQ&y113sETvO;H>n8uSME6f_RQ z)g=|ksRwA0dJCKj$$1BAR`|_;0!`*0WCA0V_%-oUrZNT33^b(5fKQL4%b?NWu)H58 zt$QMeTKB+^=tR(bNJ`ZBKnc%x-e3y5!ZLZ|BxUl<9yXHDYoBm3Qcwe2vj`xh z(D{Ur-Mm6#PBBi=kPFY{FB-(e!SmnG2(f%ycJG=M^4qB@8kzvAWcg_dz4p*C%SMB6 zSK&2u$Z4l5*UgXh$~xs%LiX^;AuPdYR#I{XQcwpdw7(~b7YoKT%LQjD?bescb)+G( ze65t91rjfSL5O|%`*Q$M9DY=^p1~^i#&cD`87gNy#Rm;SN5;MHV+I37MJPeCe3;!s z>-(e6v%(P}8=m6cJ|B?316+-p^Eah~2t)LMPZz|y@+%eC)KnrmOhG0=H z8z(D-+g+BKM+kXX!`nUP6T?&>SP~y>iI&q0LT45js*^tqa)=w~kOQWuFc)udf^fb; zv?iVIq#5M#BL;;YLjFek;~D-FTP*VQMCAy`!gV-e66uxTGP8iUqf|$h;W}oyt=943idvykiCf&O>J! zHx7EGGC9&93`20>AVETW?ZwNaa+Z?Im&=i}Ef4$;VkJ7=hbh@!3fH4;S9N zUGexOkYO`*l<}ml*N4ytk5o!OArJ6FM>E(ft8+FgXRwVv;Zn`(byje=UPy??Q4tu^ zOQ!^FOyR4I4oLWApi#rXHUyrih2ag9xbiC#sgqa*-zxJjS7|SLn>&?3PSdt zqb#>EQm!TqmQ$VL#B@uh4`U0s$vA1Qa(NfKtzXVGh!X;un@{$+Bejehx5ywTe$0AD z&a=F@?tu`*%LrNf>VAnZ7K4ytWXLq=M8zQ-vAk~3ZOaQr56RwFSPhG-T-+|7dL8KVVME`kqSfY#r;{;w}h$BR0yyO9S=4#NiOv3gy zH5NKa|2S|^PP|sj3;oDqWYYr86|Cu%Hqvaj{v0lwuDVVkk1F9^b-qD-+VSB$xV+Yk zPEC$Xy51zQ96wSD?c;IfnT z7UNc|xd30qL}Y<$AL>rtGDyoy?8?6Oxp_riQmyPVh*J!<(tC?EFP?>4y|Q`+ehT$*YnjVEFQ?PW&Gz%2@K!bdC5(pwecCrv*g-M1MW?xUUK zySD@45l(cz9P)Q%g%(}Ka=k$~Y2so2lgkki2LZ)p zd{CNY#}N$M;gUP_L4nR5IK9>fIYYhl;q}a2far!1_scPXc|KXVZP4T6X)FvE~wlP9}G!8U3UVXP56rp7} zZP7B5G~XQ46)Yt_BBUu&e>AFzAEx~($1RJOT_|_2*DUBF!IDx*^r8C8`V*SJqEEu<`U0+NDHJl@dnto!TDyb9QUwh!HCBf zoocbI)R&AHB30bB7IPKtA*-Q8%nurrCC`-{Vx#N71iTsW!U7YEU*aC?5 z!!e>9YjmJ~1cn7F<`(7!5QTj5V+sS<+!S@nd_vn7g-eDp(et=#o%Vzl=?o7`o6*6j z8W|>yC%ehHZfd@~Oo%G;f;W6Y>0&NytLB|HeX>0Kq{_q369T!&=RPcl4MMkMM=BSq zURnhV8Ds;b;xO4DR?G-LD4!GZS@^tVWp<#ki-~ykKy97u5v8T^Bno=AL?2dU^J7km zHZG1Mx<=$gLUb(br{O$<2Z>uv0Am71B6;cmV6dQ1O1G;#esA!89vEtr9;Wak#Bj`$ zBVrYX406)sVS`k^G}OwMJ{LO#7_rZcv%Gy3Zf27#BxGd}w=g~rD>`||7l`Sm51qT? 
zO@26%!{-`AmE0V|wV4`lbXGW*m*yPI=XbIMlKk z$yOtuZINF|@)w!aAitC5d*VzVo0SDgRyq{eFg*I%pa)w+m^aNxQ7%jr80~CnszPe1 z(571n`L%k-I{mA-ZYiuoy@W<*_r8=Q{}Xk$;nv?-2%7^&5|z-Vxs2{=e}&q z3(x)d@Xr|xR^(H$U7Vv_DyJ(_q>d0T{O}6tB1u!f*cdOf0CCLeE~0Y?$Eor(S^O;y zLFbXEY6{saS4+Z81S}8braXo2U=so~$)5SfEh@#{S&k#bHw_M@GPqMygDkKV1me}3 zr*aEZ&>mB;&5ZK1x5v|WXjp-==FXib#f6|zYUhUjnj_CQOyNf?OC|ipAddpD7Mf73T(4``fd%m$gvbux8rU1LHg}l-j`?`& z@*f6+nUw)JN;;rKv%_;aHxH)}qGAl)u+`{5bS5klv{=l+T*G4mI;RvVn1Iy#2b_8# z>abIlCI4YM`$>?fdnKd)6=nrKEi?g_m*;uZS{&31WUWSpuv*=Z1iB)k4e z6%f`{$4oi15&$pBA9T$C*9Cb@!IRv~c7%wQU;a-3J)2HRCRIa#EgqF53rHh7S`&sp zf$nmtVG1i6Qn*UcAgo?!!ZM}C3db%U_D~Uo{6e|yI2NS-rkg1o-a=QgabU*-s>AE7 z%or$fqhwA!ASy6r%48`bjmP2LXe#&`p^oNrHpng_eL3_8o+TyC89KH$XB~8l0l_F{mE^P+nH|nqQI?KJzW*pJdYHlUd`Hhl`rG+%>lHCcz zcIhyPp^(=j&TKKs#s+C7%|;qc=H2NwgRsq@{`a0%m2?q~W6w6wT%3gbY+Q1H(;TW60`N$n_q=HD2Ct+@fLVqU3Qxv=4Ts*l|1X0^H!IglxU3vcG*mvoJC! z)?d8_=?pZfQhq072_e6Z9BLL1nS4(lU*uat{s!TKUQQo2UL!2i+hoEBAjY|Qi)9W; zd<87eU{_rZ$H#?}}IS9tL&p4LLRmK!RN z?au(B?n_ol3Q3d)RXUu`Fy3~~vnq>$W_+kw{$&s+omA)l>2q;Fmf9i5J*U|@vV^?x zO3w30!S{6g2Be)d->jyFhq!MTM7Q76|W=KWcu#j8bn_fW*o*M>#zpF`yS~NCLQz9NA-!XKDgB$coubOb! zbcBgkE?xmsDwnzky{6eY#^Z;66(OHB)D4AWV6aQ_n8M)%F`>Tqg>h{SH$0Dd-SQ4> zquJw0J{-)(l0+^gWX7^0dDZBh+nVHi()b`(ER#Lo0J(c>J6%>2hdNie)0ipo2B9eP z@-b2pqHd~s!7E6|n<{U2!Y_PF1)yp})YI|7cv;L86v&907>gevM3sU#->fheUlRlE ziEnFO-e1!TjNiXw!iZb;6KOUwHi}SS?D?)jyRLn$K`3=_@l2jK=nP@i6DRHOsW7hg zGW^#|?`yGv&N8|0L(pt(HN(bQ_g$voS2$%dW|D6S_|%jfy5{@Cq+#;sb|tbN!yl;t zJg6`r=4T%R;`O}bR5|Js(Aro|U#*kYC2ui>rNy*!z?}Q33SgRDynIze#2VSe6n?Og zK1nbxB0XSh(O#7#n41;il!=xv3qCKdTe?jEwlg^A;3lh|DXexj z+{5lrVHhAdV?B~eh^Y!QxtEvre~ZW*6N8g}2DQIsr{;3fx6|o<8zGx+Bnv|d``LF) z;Szus_8BUJ#uu~4H4%$#-rL3hyfn#TA{%)WQ3!|S1s@`AF!lq}94#4T;yAftEE0{E2YDd z+X7k?EC#%2+}kFk-ftuYl?+9aPQslmX51qP7Tz5Jaxr$edCf9~DSRVr&*5uzjR{~T zGNr_rm%Oy`1U)}XwcpFeR&`3ysweCdo-8< zg9Knc8Fvi3l4a$;0P&pb=gLNsS|=KA)qDepXNB&!@M5tX@~sN+bR7fLQiFlC#6Siv zB@yxi!>R&-GTS@k5nlkuC7LEOZHL#d5F0;1i* zrcaaE|1lP}M2PYbX%N-Gm?zFpDhchRT>7&~A}A^5kdt>At2#ysNuxfvfP%YjQfv@w zM7Ly2nhE*g;E4EK8KV{-*^l#wo(*Mmq_Ka?CGUxRamY z!u|Yxg#75ERnlg(6I&yrCdM101-=#;;mVFcNMFaQ;h(asv5?TlJf< z;&2i$Cdmf5L4=#TjNRM}F34UZ~%s z>1%u#gP`mfE&tF=fxc$xA;!EWVAL3>h4uF3q zWRs-C%HlnyS!3tp2!=m`l}uqZ@)Lq3xv+NtLUr_*3#OpL z!T4@@Zh}4v%T)TqnlaHJOxtivd8I+8x{}8Y6aS*mqK!AqZtztPU=OVQZ<^*CS;V(KxP7oQUAPIs&ffd@1%ly8B`VVQcYLbg1NX3LLLNZVm} zfO?jYtsUCdamn$VFECQcfMv%WukyHo%t+rK8${pZcl?GBH3=3M%c2vM9!L1A3ev29 zM;8?Zx1OkwKTp`pG!t@J725mjQjNjxM^Dlc-1tH{<7A}+!zJ?gDN1`iX|jzpO6JxZ z(U$rQV(R9ljMoV9I__Yuiy-gV@%_sLob)uVPohuN>`;#JW=g(6m@3ec^0d>GOABPU z#6043h4_?SPmkmhqHx4?#vhNQN|}P!kJxa(1Wg!|8?PtqbB5-{dXKMskU?0&FiPri zgw{z~YH2<;d$)Wk51B_zfQ#o^Sk5NMo@Z)-aCqi5Hp=lnjBjVW#2E&$R)V5g;s{Yb zvX2iW&}#64vn+dRJ=eMPSVf3& zZ|-J!%Q$W;EWYI4ao85Astn~ zs*p1=f@z^goW?|aZ8rv;u~N1WBCp#CZx6-_R5a1`+_~qgWDtAaxCFVFkkuIK9Ut!J z?sA2t=mMPYchldj@Zu(VOjD1C=Q<-xo$mXKb_{@nK(u zH`Xo3O+iuoDp5IdztLX)m>f1$)7`>onMxWp#M;7cO5e*AOd$7#BcjZi(^S-ptC6*& z`Adz0H=uf1Ib9)s0d7a0JUBxk6vkeCv}78@%*nv!?LMr*xz>;zJyV6TNFDF*mhA?2 zU?7ApInE$mC-s-e`?FNo8||dSs_SOkbFufwIlxAPjFXF5H-nJh9YgpJr1@?+`55x) zS@zTmH3i=T%3yBB?_dhMa|ccKu@R&{iF+0*Peh~iiw>4cnNXU)d6 zaVY@4{q^(YCX#p@W)Ex(*lj*!3M-S?EYX)4U7sl1NMrjkXWe}pBEs*Jcw#;;zS_<0 zlNSxb=_6GMXIu_09ucuxj-8`|P`>gkWd|X%d<^g7Dp+xP5(4uXVE3W_-PsX|KH6-~coB@}0jSr;Da!Uzqg+eYM zj=fR~gBQL)ZiC9;j{-cx-o-JS}t|;6R%dT(_YBo ztjLGy+k@pX^6@p`@{8ejpU{TuHXb6@UU)4MwR&9t4K>JGrr;5@HGx|8%K26%ZmYve z<3&Qe*_F4-H499zzJMpBvDX2jnyay9f0{JQC@NvN<9)7Ih{oatt|NBCY7U#H7<5b8 zxenW8(A%EqR;P+{gXX1L2g{F-Z?uxLN9i2ynT3FCoA^HIzR6fUQMfQHUl@c(65f2g zZ;_RQ6HTgz7Au7H3vQXo9fT++V)I71&sf2zJeD(SiDlH2%VjsKq}!h^Ii%5e*M~3P 
z5V0|0=MIl4jYh!iOlw(}EcE`3;V=RDo`Aff0_eNts-;?}gFxg9Z&8*r0?)_2 zyCGIBybJz3xqRU^cri6e{Q7xm`6FWml3L~UTeT24YIo8a<+0lo!tRk?s(TD#reUn} z5kh>^ha%)nl6WI$pch&T?wkJfvsSjdnWZxMb}b19f?U|oG-z?s=J_xIj(RcN|6PT# zRG^Vyi9uTM<7#@?9r`TH9Efd#n>VtVDQFeci(xyr65?G0*z!DUtYFk2gU0^TSWZWc z%)iSohx%e}(0ZA|a&Y|?XN`N`trN&j>&@%rs)tmZ!3q(s_ECepu!4c_af948AdKKsKFn|9HLlkU;sX8n zaJBs6L!6o<*2_~5Ysp;v;Eew(gET;^%aBQr7#CMzFiMvh3=H;gDcM5EPte^Z?-cp)xH6X?m4YwiOd)zYAFHB7;zV z!6}op8-)8IFS<>-3Gr@0sAiWwspY``fEx)%<~}Tm<7(tRgMq&8z*wdn5v3V-MwQ9c zMu#C9@N1-14=@`}IjlsSOV`H9UC|2NvThe$dpRK*h7}!4=Gob1rm$yLGJT)17iI6mMebQ>Yc$L|mh{@F&$(9aj0efzFf>_unis<8o<|L zUv~a$`QqawCn}=R6@|c3s zBCz0ok2IPFk=78XP8JsehJI8y;oP@Kc~mTS)ySC!X}Ja$LhB4dcR*vJtp<5uhDb)ChNkx%Ci6ms+)1ejyq9 z`YDXT(2YyH4Soval<|emZPcesoyvFyd-8Q}7uAaQyr$X_U{)&p336jN)(ewr0%|8Y%mk z(!vzHJYsV4z|(6{2oq!uQxCTq!~r9gklz}Fy9Iipzr0OLL$9LMHuP9;`YEVVH_SJ3 zGKKM|60rCNuQODXb~wg zrxPsphLGPnF8^+1O6Ws&aLF}Fh{oQ0zg*d+> z=dh~?@pxXAe#~gMGfK|yL%KFS_qxp$63Y~}Bh453WW*rmBc7kWM2K?6huh_pewEyT z(HY+O-WX6A=#)5opG4(9U;Et|D)@_!}KhwKYko(0)9Y1k5mOj5fy=i5E76Y zLJz%zVh8~ufj|mf*v)1)+k3Lvl4NVLn{3}rw(ly6fD{4gz3kJFDxfs=`+7ffZa$yi z@A`hP>-l5eXU@z$&)joQIde*CDBJ#}G|dj5xm1PEd?=Jnl{BR4JM5%K0;FuJqr^^e zC}S?7h-L{=RlE_yE~Tj|nO(u!kjTolXPlD02`|u1_@oy{4M8TKv;Pu`u;}vTN*diI ziUx1%aw$2U^eB*Z>6}Hk&$k5vdO+tmF(@@X9qv#u)9_A(4!pcyXa`K z`yVGaoc=CeZ2uObD07Pp%=dpHKn_!~$GX{Jyr5P-sP2ez37E|}p)ktDNW8nk)DQ{BaFsDd6R*)e%_jbF~ zb96Pr7JK`@c^xb3wW&Q{kbaTqtCSn_4LIls6yB=aJU;Y{7inHC^^v+~fjoA~VN-3} z=R!JOw$jEt9h*Rb^~FF>w)NS`N^30V70Da>880~;>Q5FA*c~qec}urQvbt6PR?MN~ zo8u1=T-~M}oxKurx#>%6DYdzR;hmhizzRK+;Sfi*I^a;B9UGdxUkgQJL*r5(JxK6t zRflQTB+Oq*a{u$l>wy*J08V=LibL&{5W^kwhF4TvkQi;VUkTEE`a0^`6`>*}3O#@Q zT-))@z^khx6ytVy3z&XLhn=mo1|O!_MJ0Bwpf&*6JqY)7xywG`>WF7m6GC6x;5_Vq z;{Cm7r6``l-y$BC$CfeMmIJq>q$+HMMBbN$yz85&9hlii>`vl>P zW%XEy{5FXxa!2N=&;X2S!`RGz_n0cBMJ=3t__Ki9D7TUW+ZZlt>z2TbmSS! z?E7DazFb z9=Z4;`_gk_jna>^U3LhJ*dQyG;NcF@qmpy0mmMa@xV0H1C`fn?ZP!H-#+Q4GqE2l? 
zW4g!=cgV2O=HZENhR6PKtL?ZGNzG9hZ8UAh0AI(VMIus~C_;&V+d&%5=!P(Kk^M<>FZjBd5(#ZXTzEEStGYC>t*>*G0*4hw_h$Aw-TtY)y6O{=^|J zLFEBx@wX_ep;WWluKOR-8YQf5CC#y(0=q#JeWi=+HYL@3#8=GS>l|vRpqVx9Furw5 z60an?>>6rKt|ENB(&`M48ak-$@sYa)h9$&Mx?E}1%+(58gwG?`+=d#l9r$gps;S!M zMCqYsyH#nbMcTuek6>IfGQUkkRn=3ZNqv(be~7a=3TwFbvR!r$6j`}4(J^0tTNH}K z^V)RmG@@ zc0FOwz%LJTXn9=iQ1YjOa=S)QUk038J)OuOvPYv*gCqJk??t8PCpHH^{f?K)S!C`HMF$U3#nCynm=I#h&RdrKtiVl~_Tu z6$|q3-{LN~jyr|b*5M4CNaPseU(P@-2J4!czWM!7L$ZNkl5J;2uy{bn)w>{6r4CMQCe+g<4-vB+kXhu$U-N-3!&92%MnGL z-?%~^&PxR~lKY#i-I-Gxsu_1`|Ca}oyZ`XdzVFU1png=Hsxm_ z7q6<4nXMP}KeLgEbA$cKDZBvWby^bM5JhuW-&Q%~s>_XG&+^lsdqKn*t(m&I-7E@C zQWRQ5`yVnGkv<4AzX;Zj>ooJ3AU%d!gLd$|(c`9ur=+S^+I>pvy<}EuTioC~OgzyXYN{ z2^89a3{auVt`tRGP?w|T{%l5oC=|-0;O`x>`C}p?>3tOCju)kaa~|tYA#@<-%f*7yFylkV#{26d$(6@JS7)WYfU2|RiK3^}=A;)V zhv;C*Eyz=XwC8-Bs2=vqz^M~88d|SLaCoCW(5&M_E=PGS=r1@#`mR_phn^7XiLWWK zE0k6nm^jze9Na1VCaEcm0N>y)XuX#q31&N;ti!D~x{QJpMN9d{*&!#{+ZL zL_P7d%IpYH_%m!Ywe^0(p(L7R`BruYxjbj?yruS*(&}P)l1vK){~-#?>O#pBc0Dsx zlxi5(M)DAc>3YlTTtOwwEXxXIojwx~x2c7)GIRcHp@oH`Yz8p4E1_AW!Ct zLcK+&mcKYddVRr3@Ognhp2t4B%E}$ec{|F9PYF_-DW9FJ!hyZ&3@8R^nLgrt7-;x; z7uvoTgql!Jl^ct_F9gyZi-^YexYA4ld>2739uAcDob$}UIQw;_HROiIB~AL!q0Fc8 zLw49jp|&a0W}Q5nQrz47>ZXz@tyN zs!h`O9WIBWN8khnpN>CtsJjb7X-#p6;+SU9;{|B|yYR`8X306GT;avp&+^_n9cuN( zKj1?S`7heD9&u%0)~+4qsSa@#rEA*xU*{@dk^SPTz#LyYEgdvBGh?UTLy;Q;CueyUnwvOq*`6D$RI2 z|3*7&Zb&Y>(-tYI&h1_)a{m)_Bls_~~^}Xn1^cmW?T$Zg_`u=C_?6m}PWU)EOJD zu)9TpAyHDV9ol}^2Z+Q(BWRVwSP^4n#Ae^%xs~Yy^eTtRjxOC&A9-WQO;+XU_6H3(`@Y(N)7Y06R+qFDJW{>Yi-R~mh{?<@@rL0&3bfqO*J|7*bo$e_J@dw__M zrPqDg9Y88i<@MWUrS)Rdo_!=$o_Z$~zL2}-+T%*MHb9yNLkkkO=DRf^e?mk0t_EXR|%$P!V@AmY;PWxpdVz%j_rjdo|gua>}_x zPzzd4U3@K*LN>FMPFqhzY-r9Rrka-Nr7AlUEv@!o1f@1vY!5q3cCDD?8uVLP&dipE z@AqIq{=_2001tRpd`*kYa2|BX0-xqw5ej0yA%)4u1nFZP9kKGjj~9lOhh~S`;z*Dx zJe^ujyk({7T^2DML&{8JPEM$e@0uv?_N|qnU}_wXe@gq+)grq~NH3-{gEdtv@di=U z*XS;mRoUkb2DwY18f7ey`XuhpgWyABhpHzx~d zu=BIw=JHh1^3w_ff7;vbM*CY)fb1$H)w{CTYa7}s^@HmL85F*W?{4BMI5yH}FFISi zJzqY%?k|B&|D6yaTu}1Y&Mk$)tB&DM^O&+wXJTR`hEe~=f(*ga&$H>}UJey;1RkgW zT_OtqCON(V`JH`M0i+LtbcBy3trJ=qcw%F1$?73nDyS|&+qAnx!nS&6h*vdgncghO z1Lxmi*Q_E*v(7!^Y@4wnZbpvi3>@g9kTOkf|S8A8E+nTT0f)SbS z*>goy6V_q|zDe@8XCq458M0P~Q>p_G!&^HV7NB3(gsRe8^CC8q>bUhH(!wdzjx-E2pNPx$q+t3FOB^Ns6>GzT&>={nO7xt=#Q0detJW@E7oMA_t+Yk!owKM!3*`Rw zK$^Q$=Zqs9ESNWUp`9V1|3!!~E;6@i0O}uwR?#XTpXwC#RDcoPhcvM6f;=P}(@?Jz|M zP9zA2HNmMjiOnC@NSwVd5JitgKEA%#&TV#vV)?z+%C)-18HyU@MX61Y(RSSud%$xz z&)YF zmj9MBAe@R1HrSyZfj?0_qMhj)hZioocK+g}xc>2$6jo_B`IsQJu|yJ2Ro~AS&LHhT zE=qpg34{LZ@;oc{bdob@t*58XIMRNvH2)5r8(Gtx&=ra-z)ksWN-K-&BVKi$An$h8 znf8$9-~vdi?q1#G&=86H395 z)>#yNXxxk_x_Pt4Yn_Pvrv=wqgC~D^g#ATHnuYd~q$STe#4bhw!@rDjxqd^-dhyoq zq}YlTe&2ArQmXF%3QPS#DR-Ecm=~3M; z3eQC`BeAN*+8xHnYLl$t-46LWVr8Rt%vfN?5RgkyyVzlDIqnY|>_dmT=WnjFGsgon zZWE<;-|J9Ef~*qTZh~A5h-ouyH>GJ6-#Z6wS#XUxMg7x>ufbOP$?pRX#r176!JaCL zTAtmbt--wxB^ICuiXF;n8UucZKZN3{uxyO;xNY-*7ca}z>H9v1?6GibXW4=@X>tt1 z@p|paPWYqqxAaMvRT#m}M!6o`?oT0C@}SYxwnb3=95gV83+>zo0|Rrn^ni*S%7I8X z?iU>L0miFZZAg$odDlWh8-<#9SK0&diQgd#|68Yv6&?1NLv06$7`4MgAs204*~1SB z>L`oauO0Mos7v-O-IWqGRX^g~opjo2dtWfT2uUFEVlnuaK+$5_*{NgUw;zSViVacR73jCW=~NxtcV-UHhVyjR?nS}N>#{7G!|=rd)ku$;;S%@`PP=F0+jg_j%x0B z+L@cj73%3hhpf3AFKou&LN2~Yb%r_7p{_enD4mFM8EUvd+xD5L<_yUg7CV%KlDgHl z>$Bwgt-*X-CZzX0^T?y@ZUOZigFdrHw^zRtMSl*{o~kOF^L${C)?7Y;H;QepD7@X7 z$nSpqLdc~!vy!$4CyPSy_()l*7V9#H(n(@NSNLMcWxvJ+pS|Od?H+4_&3-A=d;<=l z7bs0Va{J2EkmFhLRjcipC~Iwha_v@oSx|og8~T!Q4gdC+{V5!SkOW-yN~ktg)MfKL zebm$=ZN{sC9!Gg|<7=T3PFixI$B`)Sbzc-i#n|o7rVf zMi9reL@c}L%}~j_8yDEFZ+XGeGE}!MH-b6zDuICC4!Ic8>1Ou_lMa#lGqe0yP+!!7 
z#n$CnsVa`4J$8L8WG!8`*mjJEQZjQdsI=`8UI~ihL<_sqAzg%>&p&|L5n{<{%R%Y;NY|MT}ygs&ERAN!p{0v$?roQ5sZf^-vFuAhBcKFb&!&pS2`_W@8&P0RwWG%QQDBZWq$lXv{|H&h;UargY5lLf3Pe&?{c(WY!)ZGD zgTq*7p%frL5Y#K?zO-|Q$U(N}CxH@dnB<%0LQ(YZhBn&vpOOyeKhDDPi-1n)xQLof zX3U&s_x>|*Czsb)k*8x@N3C9Iz3dE}Pv=^@!&oQcr8SoQFLIebXJ2exO2UZTS?XcW zI7Ep@;jvy5RF7?HxBWj0>A6dn+BZHArMT}%v=!MNg8E;Q(dTGsU+fI1N3m713PI{g z6ygIR%F4??htw^(~aN*xv;C6Q|F$_m$-N zeAY;aVna4+vT&f zeRl{9NJ^NhbxnAIC=4jNFy6TZxrD5~ZgC=-hU{jvZg4F!>zi=u&5zevmC{tJtsu5J z^$uCN31DDP>=bepprU?_Rqh;MvXii5wf3Pyg*q?7%k3^97q7##%I+5ADQ{&oT33`+ zHX19%Y*>&E!RMm3i#PzawyPb= z%^Wryw+rgkln|zn5t?J~MGOPI>ip^7hJl8qn#SxZrTP1Kd{ow%yN9ggc#plQv<45i z`6IdZ(>(%|m=Z5?H#$r^PZ_WZhslXGy5;%Vo?f=OCRLwp8G;%BL)@me=rZmNXW%YV zT4j0P3H)sGQIfJ}1sVSHueZ+cl4M3W?^g8K9v7fVZuK(VfEr`Wd6z29BlI3QvvVI; zCyLsiou^IS0pANSfdLmjIM+EuCyMc7yY59UPrPuhl`E-ogUe$%*|yW(4p(fF{#3o7 zUY(1y2d!rxC~8Gjv7NkcV2&3T;(*t#6x6F==Tz3GYqWj#3k({?#o6{Fhj`PH!;UV8 z`qufR-VvlYj-faW%RST+`+NNbwR#EDzaJQg-Z?qiC98wAqR@EmP;}k%oc4L6W&bd*g=?84>5AeMc6Gs7qNqD<+h<0G?R+Twdhy8C(*81}n0}6kG%%Ti5rNX_m7H_nTLNlOIm*aHLink} zzHW(}e&#TS4)B^o_ ztl&+*@Z9VyRtPI~i1~1w>-@I_!wZ$pF*2%;M4>UMwsE`V@Q_ZF7ucPijulMUDy12B z3zyg$C3!d}0}Y2i3Q`~Ka#_-GM+63xdgW^H0*7dWC0o~6x1gS%o^>Tp8y7_b9kZ1F z+R4#UeLp)L8Q4fFjX?=_f?3YUF zjbO0ZRB5LQs#7pkC|apQ(-~eancra-J2Mk7F_P^8hl!p-St2H!GuF7?`lf}a#aFGe zXO-4pz{IGoNXNI`k9Nu!Vcb{irs7&rXohTh5sY3q9Z1j0U2s2&wrN%9&yEQ^Oe&hK z|28AQM0s~&Xx-N^$tP4&WIN0xmu@+k%ev#0)_=^)*NpXy!&Fv-^tFF{tXI2YLfVes zI4(flC^Z+^p$?_%&BuR>VD$2)%(O<&j#mxKk?&U#EGZZF?!Sav{~WcKHNDcDoGyx* z(#@q>h20>?|77YWXt3WMWQj=ACr8&=ho_T6*uh3+C3xOoQ;!c-u|ZQ+r#Vb^<5+LR z?sAwIE<}CLjy@qUBfn7Y4H3-Eh_9%%StpXqcvyCm{o$nW@FhzZ+x{nqLa~W#Z6l7E z1=QH+ZcvbKKIb_RB^@d<){}3QqUgmTbNJdz_B$f-hvI!zR(Pt{xv@Z^`NL<2tSQr` z*;J+J8^t0St0c6TbC%?yKw-5}ZtFS?%gas+xyj7c@}sgrP(59;oXxs+rGFNM|8&+| z`yk@o+7~OTw3gGo;@tE)Fy%}jt&6QKvAdL}`I;=7Q5hOhfSi~!+{_oY&sm|YWO*oU zoaqo90d2h33i8Y|=G*b-1V)yzWZOE+a+u1fjE%3bJVBm0YtdqxP+Hw8S!tCXxbGY< zTUVr!eR2e|Sz%XNZ3Jt__@W3>7K)Q*8q>t~15wnETQYSgdFgopa>IaEAscdtAQOjD zcJ=xGtU}zp=h$NrtjN`!?iUUv1Hy&iq6=VVez|BVT~RK;V+i>xb*ZNuVieAjYjKBK z)idhth>HUABta~h1Aprf^UL@;1W^tp-C+{i=@OXfk+9Fx#r2Owp}&q|UVBJseL|>{ zG5_s;X@H5L%0wl;`vu`laLs$)Wg!b0rQ#Cj3Z}Ex*!0UitD#Un0dI524pq&oh;mEE zWJLbBAO&Of8MfUOA)U%;x0WkIC8@RLT6jMZ3@#rRu3|FgtUr?C#eqfh?)R<HF{d1+0~c?y3xDIe znBeb3-mSBU*Aht_@F_g;eiDrPZ+MO z*(ht(TPLX2B4zAUTHhi*ZTREN6Mu-xjw+Ub{q4HI5Vpu#$!E_G#ZyEk7*U$y!J1bh ziG(O>n1WRl+kc!PRn?hlT%-Cgx;`+ldZ1BcPde0*gKc`;A=5^Jc(q>&($u+2mfB%A zkoFtH^j)F0-JBn{z0sf8J`Ege~Jd&Aqp+jVVO(X96{!R zQ_is|zYRHw;Q@Y@=}_k?9+W=%hXt8CAkLp7%^wxg8f0m;htWJ#U< z*&$9HQfs!@WRyEm9bemEyWbL+(L2z-&V zv+HjQ6#k610UV?*S{#Z@4(G|ho9KG+E5zOFD zG{atSnBIrw*?WRo1G#Ao8c1mJzVAGfL-p~jF57D0JL(Ldcy5+(#af5bjR}_LA_>N z4X5Lq+tND%1IIf(YlB1F{CBngsR-PbVsx>`|rl zPA6)masHFL15~%8%6YP2=wwc!abjA2^fkkXf{1<+F(9w$S*3%(`S(N>%7cRC-5VgC zb{gx<`-0J1;|6Z4{o;3llIqBm2YNo^22mKew=J^{&rBa>)8W%^xi8cgt7SG@WluU} z#fo(o+jaMc++-_J%?B*UAs(@Gx43skcod^4-cc%nSdJ*_>^vfGbXmKgMnQHB%aS`e zd|E_$G}bm?&pSW26WV*f;1I>IWLKu0of*|8uRnH81PSa$tllC)y%~0zH!ct9L}|9& ztTb(q=MSuoRy&*=Uz_afx5KhRZZfw!hJ9F91T)uZKm8wvshU-asWW9oU`F9XvcY0O zO)|Jh-Q-yq5hyh2;n^W;%3;&&4yDxzeXWY>o8=HI9W8CEazZZGM7nbQ{mKB*sFL(| zd9Ifw?hY@1iy*I;x1F{gtg!)6XhU+vpgrZxu^x^jtL*N)kQFb$fwI!Hft!B5D*I8s zXN~sSZ&uA$YgD2S@sAl03+2x7ZIFv1fHLvCtt zy#{!-L(Ddl1!Zq*tBb?4)Sv6paRfJs!rz&C)R8vkxtzT8aiG?CLP;pj z7q6YzH%h&>4VluQ{*EAJ6RZxmMOoYw_cmIAXQfI9ZH=c1_-cQR=#9(mDKC=9BVzE7 zed$@riEO*KEYzGNBGkfi()14@Y_VNU=C|9eqVPQtmHy3&z!>Xk#8VV$z4ornxDK^@ zRz>BcwUlV>w6&s8?#U5rR#LpJ4T-*ddrD9*75@@DRE2WP!y!_log<3gv}ETySD)k9py0b5q@Fmsrg zL-w&Cr4}w*Vrv^nhmQ%}J#7j9EQ%V&J{d9Q+Y!ue)h6qc2(p1?;Gfe7^Zx(8c7rVU 
zZa0{#Tzy6uC1&S%NFa7ax=1$Te|J_sCcc;V3G(t(e!nN z`Wm?Sxw{1l|3&lumJOb(-C~nOx!Y_EkK@Q7&FQ~5lwvst$ycJ>>~^U~9ntE{2t^o~ zNkM9kFK@MvmFAD|UYLQn-?ig51qvb|EeqUiTq_EHO;M}rHrqp+L-AOBmmS_7vON0y zeS*}S!vD(w9U)7KiyJd+jzhw*Yr}h)L#`Aw6Bi1q8EjTs(r$>?h@z_6YwQW7`IOFH zWczf6TH^&3teEykhq30KB=*^Vk8)Y-akufkuE3nktdWk|_XWdf zxI?=GLu?Rd(X}#gJWdq-bwX8eM7=L!z{6jNkTu-n48%TRh_Bvd%0dg-pb?T~YXZmka-rYxU2+I_LY z+(cfr&M&_bkqVh0V$JndDX8`iH`>u#z^S<=f1NFiU`}sr<%q3t$kH!8 zi`_;8b81DN)ICmi7%yv!*R%f@RDb8@OpfJhB6~Pus3&Ynw|x@9iXvp6w*6S(*EfLk z1iVl$5`}gyT|D2Gdamwz3bW${?Y2S`4IDHQxkI#YXFGo9V0f+ zVX|XDE+OYUcb{NYo({}!Y2y*1aLFcqbU}n&ra*>^>Bu$7F zRp`>^DnVK|72}}HC=306j-gNeC1fdZ>E}x8C70w9UQh#Y+M|Jj5`e36UnUAwr5bas z?lIE(qZ2E*2e3~Bc}plYiN9}^E|9N$Jn%4UN$T*3Adlc3${v()ar_g3L0de8T(=2Q z|1ks=ZdO|D;o=-M3dK=Rkq1u>P8Fi)ALZt9sT`5D$460-v_-q_9bZa?N>WYh^gj_*za|PrQq?)y&VB4qVJC+3Y`b?t zuG|G;w>Be!>3P6D7v!JbGXE|+zgcr}ZUnr@r=8CX(on7XTc&FM#pYm?V#mr3c zn11U}hYxJjZxYmt=wf$SBnjF|XCSbFqHM*SVRDoxyhR#yXT9f-!-HyepDh)n1`UtG z7RjXd{=lD@myU!oMUa-BbDn+o!;rl6O#6u^Pd>%YR8sr;Xr-6ZOsR=6@oEVXGt9)UD<1QHNYK5H!YK6%1pKeXGI_*epXE@;Z%pO}9L7pEp>JmI3#$IXv0zSYpF2acH3Rh* z+v$@~8(RwLh8-=)TaA^}+kZWsT3v33e;U}56KGGMDur!Jo{!6%4 zvN10eUtKK|@*n@x87g~R+%}H`4NZ7>Y6ytGY{DjW}cXKU6VeT{!$cb#>=jx1*;wM9+G2Y_Ou}FnKJbl``$N6 zhuMCLWeSiH?;o=oPfwY0lnp3Nb*Zc^?7Qu0hx}W)wr<=hz#OmZ!D?1Y3040S z7-B06<9reiI7Cn@k^Soq3D_v*mE2vOdAtzw1$#@72PMk3+D~^2HK*F-K^WPjogxa4 zMztila=qmWin%4%4*GUrmLZGVgNdRpW*gJqP6)ifz@Ks#PMe$o2ePuLd|8m%lB@FV zEoaso7xe5_?;eUH17XrkHMiQcq9FPR)`hHhA*PhHaB|mJ$sXi>{e8}`cZ6w8GGnWb zHZ%7OFy7aQ4wl^$!7SV%4ciXiA(y748duvoz*F#s27kNjThTv|I5In z7;hv}nuX%1)?2UAFr&Ar4c9xKwQ$iLw&r`G=HLYiKT3AtUQn2&;=K*&c_AI7{!$V7 z0#9CId+hC%4Q6Y4eO8d1i!QNC_901&8@#Ho^@3&3GA26x6U&x2Kb`*KUQL3SE?YYY zZs)Zk&myANgM!pL?J&8Qf7~;Pxv3d!_V>x;Z_Ho3$SzWvrn9G%|JT+0|RQ$$?9U; z(;>@5a?3jV?vKc&U)d|@x~OmkH~a*7wC{;dlagcIa=BRIFkU*0V-M>Qr17y0D9w8o zy0j9{rv%ktsY>9EsK~zg<4_d~o9r6*beLSRJk>=EC_$<^g`4)dhmdAOoq3@>swCvw z>7IjsLRwX7*i`C2ZGR{fTE|K%fzpqD8ld7DRI~>e`V9AXGLFB}8Bj#i2X>c3*7U?+ zr`<2ev*IK8IQ^NY+ZyaRrFja%IXT{5AsCd7jGLx{9qM*;;qlw&;u|-f@_d0@H)!aLRK9Dxv zr;9>W>byi!@!f)>(qIh)QNHT6W9QyX;9(=q!{idXbnzS0_Cva%mZ(a2I~ zhEpzW8=QgdBGH7Ts&Lh*p>l!~XwURDT$7D)fR%f$%t#PwA2mBXE3u)-exr04%v?+A zUZ7VLhQ#bSw$o{$)>!d`;@g~mdVulDt#X*K?-`+Ja6B#fXYFaTMET$Ekx!Bf>3@nySHwzD#y%^gr%ah{rz)+c;+{l@gnXMX3LVb* zHkChMGp#D_Bh^lB1RJ{0uwIVXP}BLC}n44v`1P ziIKL$8(lG6#QH5pVzgZN@CI-i?)tPx~ z=)GIYA+|d5>0NYXs0y1tDz}#$a?ID=$v>_lH@(14aHfnkb;heoc;kvNOZ=XptHX2S zt8%$E(zN`jQ`AL8ee$Mr*{?$VQj}z3vtI|O4G9;j zl{HLS8q}XV1LBcn=762{8?S1hO~Gw51zC>}RBB$#=tXf*-e4Y{8|u-;On+^xqQxE& zg@?wsIhl9lZ;FXX0W z|D!g?p>z-%vuxMvLT$_ld|V1(Q7VdR!n>-IhnJGl1R-C4I^P+3dStM6qaYtK zE)PqURNqip&_-MDyr`B*CBH*Zf3dCFQl4J8V4l721(}`D4Aktj&-J0!`OB0{g~oq|@J62D{eD!Cu?ZDcbjBY>eYn?z>LWYbnT3`0r06o}vua2MGk`ED4W5 zhN3ktSCDEEDFh7%`)X-mNc0ye#Nv*(2L|3V*JwIE|4S6QLRv7b{Wft&V9?GC6*naQ z*WKwgpoGnm{gEJ#kS}Q@V#qP7?(!O{O8B0wRgh)){Dn9V92Vd+;y-X|;!Z`w6APb@ zybPl&+0tv@xEnUQ=C)5H{+|sbs z83+iH#6$By1UD7Qrt@d_g!;MA(Dvmp!BBtDAGZts52b!(Y-Nt+dJ)N=*fFOZ^2Ef( z4R-FmUSUa>?rHiRGJWDD#vY6yo@0rzcH-~C<9I`AKY?jaz0Zp`Y?dLyc@Z4zksAG6 zK@FPqBUa^E%trDFcjWycYu1f(?LSKEEsn3&RC-(nkWtH-4CffJar|SI*%>0jlWJTm zHSRSI2}Q!?qRq?`rdY*F+utFhJvmxy?>kJ@pmVs^mgENJ zSSKOsCTv(R{hGK2$t7yHta|Uu3p~lvF(xk?735i$&A-+@^ej9@;#}Mg$|vi9UG`=* z<6m>g)Hi}U4lljANX7#9iGex($zjV>lFq}jMc?ehg8ao~NiY9cjxTmAaE6?ANnmC= zWFF9E!{vgsA~D==uOo{A#2$!*V&hz zJ{kW7I~4~u{Bmk5wr7b$)I;g`m4ZC?=m)y(FG|y9_8G`$WrP|p4U||(x8nIS$?sGa z_!DDW?0ls)Uh3NAB0ArpG<-G`*q-I&()T)>;~N`;pIY1@i>Nn0)X)96?#|81P?n6i`FY9t<>9e3?SQ_q0)84iWy&-Z@VsmYgBU?nsW_1q4x{YTC6~D+l=;)#Oo`ZNA5G2cwZCb(J|spFI`PqeYJYjm-LTC 
z(EwPnDV|kt)oTL8y{XI{IwOcD!{HTnWKGCT^ly|d#s-J6>`|2Y>@kP&(MIiSCmrJX zMnmP^T7OV&tv=KD>jF#*>oUyN0@bj}ar94hsdjKg;k;Q;&gRw57KK0ZHqHs>c&^?W zM+JUttmkrM$o8oZHSV79SzDETb3-Vd zXjp~T8+#M^p4(eQ_07tGSjQ?m$ssOkB_7Fi80)B#{ni6fZtX~-Dbo(xNLkkH8Is4| zB|xXe%LZ+=r*B0txMwSLULPwb!YhgV7PN&&#`;$#HWb_Kf)t$37W_R=Uy2L6e>j`m zS4zOUDU_0;Q1%wP)M45feZuM^Sl6U!@E=PsOYU7k*5R@=v4mr7rm9orr%!L7IMo9`D?DME3~>9v#wATm2+8P+jM))UFG_Sek-VMDd}P~YTLHV8L&grXZWr&1Zl9& zpF8!08ZgmH@*T}{7#nEN{r#&B;~V;8W4Si9*B{r21>3mYCrHCm*?HE`7qVD~b??4G zP%SAQVFx6mg~uZX4x&i#+r&F5-ZTgn*l(d_= zJewt`o~d9@;aRCHWJ<&Ku|wnta(lS#)=)4}wK9>v)y@^9dGsNlO{yN@*g#+)3Z#_8 zf8!7_n&fO#1_OU=bvpw?)5022G~PC6;tAP)J`|wb@2wxUg@V+t*Hql1sddNUK+!8k z#7YkwA&S1zf*jpRuaDrG^{JX#`zf zAo@9nabihVcH8d154jkkr7G&|`wkPexG*fRlN{n-K8Y*l5r-P{wPW_(KZLSahQ|21 zuYUkYj}akIYi>uO-6@I|yU~1NdM1n0-_D_b3~Xp#aa7TVf5U^J9_@X)39ccSu4g$` zJrW<3{y8wj%878@XtjdOrRSb$Cp;8#IO6D@=L(0DqnWYVar?7FCMQYLpL0myESXI0 z`fzwotfhs$FE{6QbVM1z@k)Ut-xrI*&P-Pa|F1ic+=II?nRmtuzn97l0H|Ha-zxa$Tb=H}3jtfVz=j5M?<`Y$(yz|51lHE0N#nCmiZ> zgOFW2KS@~{shxc|P3|WO?>4?AlMggpcG*T}iw|!`q9VT(+dKsuV>4(#{BIl_LfZ3B zw^OdBkmVgA%?VRGI8oe+h}5bsJR{;V-1Z(cH z-wTqJAUM_0&xI^v?&0jiZirwvst^SBbeJse(Py8AsQxeY z$6uvjBojxg;@;OnMHg`exm9V#4;yf~KL{5(NfGtJFtBNv(8hfK*P-yjxY1Tqo_2^B zh7++MBB{#6Y1glpdl>3Cs7NNb4X zjUx`Tfyqz__ZQm7edcg-5Dz5zHYXN6A|uw2Zv_q+huXrlN3d&Xdu=Ka% zf>gC^;Zn0?cuJf=a`>iyT98lWl-V{t6}aL}J$9ASGz`~s^KDQ--BHq!s_V6$cLKz= zPol6shg=)P3)k8U?}l0y+;Wp0^)`!N+?s)X+QiR>@pYMr&0SXF zkmX%h@PGH>NH-L%i-(No|^<@jRKfa@A5&B%&UHo1cy`YjY^g0*tvP z9U_xQ?r5JyP|<2o8vIu%n2(>kejV#y z5s|@u2F`bueHrTHgzYotTBqnjWEj=0&352dfd}tt@rp`W2pu7c`low^Tp(QN3^Ldb zlJQ?f3`JRL=TpuQj*?nM{^viT4Y3Vt?NX&x8=eE10lxKLhZV$BuC(I>)mjc@=P6AO zFz5KK&}*VZvnMy9J8Ux}T^G;b>$6=2DV(avvb{WuBY>_rf9X(>N4U6rJt~N96m#g+ z-w5?!o}l^RMnN7tIXKQjt_8Kg8MO9u7A|szMBTWb+$)?RF}6w)n1is*NC^RbB(2X$8&iJM?Abr7UGCSy7 zor2qz_}X*?*A1fR#o_-mcff`t*i87JB0F%Wkjo^XFllEx#PS4ZNA^2G8o-i)`j2|F z)EQWqG^i>B)!P1cdpk;R?y$+IU^vq>Ojz6t#(P&Q=vFF%GRRtQ7wjA!#91&_#E!St z`bDAHs84fSZnjH+sg4?%!Q?xXb9k(R5Bnd_ZEqpEtj%`F225L}<8}+V$iAg>^oSsB zRusk8zD-(d_eME0BY>H1%RdkihU>U3DA_%f#nGh}@svZfe5J3n-yR`1ZKSl?4iKcf z_zUv#%d-}VLj8a;MQmv&N-`>5wr8j()}5JJjV}6kLOr23n9ywKe<|V8 z!Vdec7r~)W#(KL}P=BhR)t>NlvLesEV}XqmQf$1;2$>LCi1B{c27Tkdc8X$oc9&>g z-T8Z7)w)J&i_#VS_Lb7Sk;E{9mP%W@SE!02$$_=0+D6+XiWrC~J<(~W?j4}+y9QU= zN{5L&j0SpbOi&{S&(RY~Ynxxs$sFy53Q`??Dp;(Q4QK6bZYP*c44Mk0lVMU{5JcD^eYm@!WEIT2MVyxqfmu-OMkXfeBPXf#VJg46HCH zLTL-xC5rwBmX{L)_7{h`mb+G65^vGu9e8GKo*)zx?S;B)xs(ciS^zd}%76fP6mH>&uFKMo9Xm5VOrDu)^F@VPUjI=Zyh zjvV6kSFtBsVWn|^qP#RG{pHo~3fZ)P!i$XsjBTa6pwW33T^xPCw>pxPOEc6wV z%Qx9q4z*htdTw%{z<#eZ4UmC0 zfsY??D4$>(GwmgZ!A#t?Jti$ALIce8++Aog{Wm5#0S>)+hFU}y8rtv(H&Af(lrsvMVOn?4yp-#rv`RpV& zIKQx=Q_JlGCDYxT5$kDJ3%?SDn&@rC_1*pihhtoeHQ7`_dT9yn z&5u)>vid%@lq!Xn7GsZrYPsr$YsV=*+U72o4kFZ-WhZ zafY8JS0o10Pl;N)M*2u|93ncCK}Lte_&}$2XuHpXnSMxZXtkM2t6|k$a$Rtopbj|0 zghdT5U2E*Zh{zeRZv+8=y84eIN_574tO84etdOveM5kjB@o8dXB0ure@CX#`w2isa zVKS>hrl>UzV_n2Vt+o%p&MlO%?#$U-r8F~sC!*@C%OERLpnB6VK=S*7oxkCmvHv@Lch0e@{01NCwLA{`~ zY+3xp5d&I$czQ2!23bjKZ;Vj%iH6H%k4Tr8bni4P3a@$Kb^t_oEDzCHMZ}ip(@ln zexjs?=-N^Vg+3PK;fqtl~cxyy%<@LNKVc4|H^ZqpLE%dh-F=oN84 zKke6EIkE28|IZYJe{!gNa;QQUxc|BaiUx2?g?*_s|A>eo`gjw+0sjA8D9AxhxKJ3M z8>&)xkoJy zleL)nk0tw<@tm0rSgO0jJ`mKXoNb5S5UPs=CeY5mZfWtJp@Vr|%Vo z=PaDN#C~yOC>zHo#?}g(BPiykG5l)AR^`}lM1eWpx5Yk*n2FiMiQ@PLp)7hKGFk3+ z$Oc!@VA7C+vARC{L|N*p4kE~>uj|@=8_Lo{l1W^06Oey%+5+oPlKzTkr0-jL1t}UL z$tq~)R#DWVnnpW!VPMAaijUlG6jWCBxRpd%I6p`A@8>rMPJ*gRvfmxS+;v2tvgaL2 zqDRP?Hym;~C$+@;Z=o!02sc>zs0Ktyue*g3NgxrSeY(TAB%F=6E11MUUZnuScL%8K5&Rf2z|7B+!ktPW!0M9BB(!J zI&6Mm;G{i$da52rSac6WK){LuJZ*!)+T1u8|+6}0M*Oq6=Dbc 
zN)#$uL}0EG&kdH~Qa)L+4q`gj=L*(oU{7%0wNZBbzPqlAk2TtxPH7afwBrDmn z$zJoUcxGvQ<*26_nsJa{`4XvOkE|57?9chZ4M2UA!{zATz=SKuXK4qOfz7 zZXscK^4Do-;Hu&^(dqD8m)aw_p%8yJ+2(>Y(NS%;9@N7{uoVWH$nd)hWoclPr*foXZYAU|iZd6jQHaqDdr^>Ab1QbWpLmum944yll2R{NLa^X*yG#(y6z_UiX?+)H zq%_w^K*N*GRTe5axJQyB$Joke1;9V-Gu zh~!{%93l*#9PYIJDm@pC_^fO@Ur?>+;3VzMU z>2s^Q!TPU=R3ba*>~6$!cHU|z>0|7S>2{3(Ek?rxC#{z25JxD|DejEm=yF9MIj|}) zhakT8mc#f2?&EQyvL@upl#!A7u|qVSwSl<1I^?1hfX#wEC#e6Xg(FK{vrkQ6Ko-Oj zV~v6^=rEbKXt1XuhK35og#D*8q^gTh1k!nRdac)4vKrMj&6KB#!oS1KN`k<4-GVwK za5=&IpcB~$yyB%47osy)Vw$h&LOsdSb`Mrrx7I1S>tZF{x?bIXU8tFG#Zuro(IFwe zC$WRNT#&|V5e%Vr@dt3JX%3-TZ)eqF?JRsEPK8pefN?VyHGmOhjxs0$kdWNalyHd%91fT@~RG$|}4 zs3E<)-sU!kEVS};WUY^2siKivpP*jN2x6=v{-Zq+bZ*~~#Mjo-jUKLcQHTT-zqpUS~iU``& z6|z!!wf2D0{2kWOWY=1IRgeW+(?@fs_Os}_248x|caud=&R=97C`s2%JsPFiuLSs@ z{QgP)?aS@x9*7#i>u?BDW`lwnEgZtLTD8x7SrkekTY=&(Urz6XNSjmGZeOCbUbF6_MHdF~a9mf2FJsb32yVMq%c@)o$pwVC}P7d<^iaxd8uAQPyTn4`A>dCEeJSbwQB zf9gC0gGy=$r2noI68b#zkGx1F)lI}QLk#xa0ib4Cf^$jPO&WxDzlf@!xFM~Imt)@= z3~cO)gV~G5>xCjx)#NzAkhWNpAT4-e)ZSH+3PPAk-7Fk96nGO^X|=Qch695>x{`bi zuxX;GU0e!d<#SC0Gq=PC+wC<$oZiq&;R*m%3>Rq971e12u@A+=rUD+a&_d&lOXKaY zBcbM0O>wNJ***}|pGyZm)ZhKqXkd`Z6$*2U1jRrYtNiuas5dwRcfit+e@aka;+?kJ z7)g!Iu6#RwJf!36R@q{use^8cXXfY>|C%W2|5qg6o=Jzv{zkl++JzIIThyRMZIvML zz0aMybitxq_*Sp}Jp`UJpZIw5mDYeu-@$OE(xv!)B2s;l;~3tgTO1}D)~2!t?GJ)9 zhwYDB)BQQ(=Z34N$6j`*cxT-;_LlQ&P_l=`gml&)Lj7DA%E`-wVCZXJm`25LC11Yn z^gwu!?DEqAWbM16(DHaus~xPg8repC_iC$fnCRcEK(-Hi*#vhyC1uh)&iP~DM}n@? zRNJ2dWXjOB_JI$Env>b9>`|pde>0h?wmE+e%vevRa?0&RK{4R_Ano|dzWs1u;I2{= z_m3UMdRnx=U-C%ck9Dt(_g2TsGb}?C8c(03n#OIXzW{MCuX*U*X~hj zj&UWnRGfPA9i_3+^)~!yfT@mbc~||-V-8oa){a?`pbwmE*{<93&JeF(9Y-T((c@lQ zcDoE}YaAxWyL6-S7r}5jL`abfo(R0m@EW&Y2=XctO{KQoUqkwBZo$5(v>KRGF6qM0 z9C8Se@W0GCr%au0aiujw{@?S@=}(2?x(KUpFnc;cW=J^#ILBe|w`>;)^8b=OrCj7_ z7WuU^5Sbyqw$$!*23<0xU1V6DC>lYHeb(#Dv07YxW!lbv^G8>9>5RX^AwmuXwR_JY z>iUu-j6D;W6C*j=Uw-Z|-rkhN+;_WYJ$K7;t`6<{5v)L`s>@ah(uP=HgVlICp4VwD zO7nNH>BVYSe`nYWawie%FSfm(^MV;2c7dl8s94>gvkp5GD zh8p>aQ{q({-H%!BOMxdG>vTXW5QVzAc9eADpMqhNF~d?XhdgcM35fK*p#IE6woAx9 zb_P`Kq&NGmSG=yZx$zb2?I4F}*r&K>`;j0^bz*#sxLybXQ-vGit<{SAbggqDr_}K7 zi6G8maL{?kYoS^!8RhNoNkMvMI{MKIUJqG(t?|MczyjFp=Pv-bkFu@L8?kNm)r6;Jqy!6mb9dxvNo-< zW8Wgp>o^+&*|(isxWazo>G;4JyGm)z>m2YE4G>T188%-;%FSA4`%IGLg`_8!O8bSw z)bJL(0NaOx`b!&HY)dR~>h++~vr{}|Ex2jEouxFD$ag;mN6#mmv$RaAAlD@WXQE}f zy{j}&zTpPDDdk*5J>YC}heMAV@PR{Cx&Mc)_W;wgsQUi}DT1Q3LsLMd3n(FUP=o*> zkU$`zNRyBbDUd=Jc9U%1&F&^!vL(rGb~njxvNhYQs8}dU^A!XX%{~o9Kt)92#WwqClrAZ@UwAr+ znpHYYqfq0L2hh*|Db$ec-5x90Vpj|5Cs5O^C5h>~*clRKTXa%-UC=*>#F6VCk#XIL ze-4!>Bq5^fZSMpg>E`$MvmD5S@e3lV&roi9MQPd~!xHwOA9&WRB@6A0zl1CW;$%V0 zi6FkwarX7Rphg)dW3PKwyl1_Z?u5hfp|lOpe+2b2DPiw|K1Y^4@NTGCOA_t{TO3kW z?{&`mA(vq5$-Yt>6jX7}ocLKJtk0Cc1qL=TIzIhJP+h&H(vJMQr;FQczS4~3>_zqi zB{lpk&fV3v%Aw@D+|Lb0u&^?doB_Wmd^ zU@sf|lh{L|z<@A7WBK974)Gnzb@MTw0BI^t_3Ub;!$)_bRF$EeOj({9&Cptd1?`)?FgwkAxkiQ>1QrXKWfgxS++=NHHu9aw{10^ zwG=0d-AYqXT~fOptpt0)b7Nfu?b$07Ozx_Su>zkUc=&+_!FSFG=`>^iB$|3XBB$I}+hUNCF!BKxCodX^5!sr6a=2FQI6CaZR91j~BmmAJqm zBh6*1)$a#0jozJ6j(R9*CjPSfZ0Y`?4rDWE%~^}XczIg;?s-A_PTKxkD^S$8BR&@> zEW_*|H8870;RZ?`-U;JAnuV5Q*yL$RYE`ZVW}qv+e$*~Hz#%${YgsZ5bcnkzqRw^+ z>T&7YZI&dQK?VxD9v~u0`}!9`5mbkCk@PKxnkDFP-sv!H_dH?`3Tg!`>tRocgkBza zYMqGah%ZS{8?ZVrn99saRgcaX z_5~#w#T7X8`o7ZXAL33Hb35*kkQLieXtNIuSqu^96g%~>0MSn*p0b@ONF|BVQG3nP zNp8IVpfraUAHzLi`#ut#7I8zHXvT#%ez*@06|%Y}wh@2LVZ2}iQMRqgLaweuvB3P{ zkw6}|92KM&m8Q+~v1~KGbX0(eUE8=wwu>B&50uFGAmI>03n`KudUU8KmfvNwm8MA^ zU6UX_+eM*fUCf|E@x@~ta^b=5`4mBQ>R`5O44wbwz<~K8V{W$!(#T|HzCEop%kRlMYTA^j%(Cwt 
z@E-8}UCK#M#{=>z7oB`-G>!);&G??|__O9h^EgXAm}?7g=B&=Gu>jA^>)5A3YPGIQnvKrL2*MW z9#%$}*@@@(x;Xxv&CJ7Z-zyi_B2{)XL&&u_?>ry!QRYPd%Np^mh|%wbid1}8_PghC zfgH@_B@T1=YwF{1{z}IPK7O!{Z5_lXBS+7p7oSo%U}?sqWxu!&uN)PphZC82Epl`Q z{1iMn4!c^e?KIb9W0M><7vX6+Xm7<{XDOdOYDeXerMt!>bI6A)Fl{Mm*@-`bV!gs{ zg7+q6zkuy*o8v>hr9tf5qG4uo@ln}T7T_S+YiPjP{c(;zP=<44msNnT%fVC^E}4pR zLN;0Zr5sGnYZ2+YOhyFF5mUmO#U=@(bwR@s|P!R@k3oxm9<#W`zBL-^OJ zG*4~Q6#f%2dJ{`a5(oXI_@Ck~V=S@9EA~A;ALj?W2zjrWm#yZNg?yBq8|Q>v_fabc zd|LIgn=eCynIAEYmcx}C`+i5q@-&Z?J@@h1Kl0eTkpnJR zH2d>uI9&0tXgRSP>A}`aJuOaW2eRXy;a%oKJ(`bKqHy##n~C+Makk^vM02B#WCgM| z*)OO^hsCR#&}DH>2X&h(%Ft^$Y-w_Luf}AhoJ@I@3s$pap9L24EoZWJx3ij1qIft| zhC3&9KHLuQJK1Ga;13q(?f0ASRZeX-elLgoV>bJoXuC`Cs1*P0(cyYeZ zpZ1kDNVy|%;3yS-l{dn;IbnIXio|j!)9$vC|+dNjxMZH*+{fBxsTMpVhm< zCkNb}^StE`rDIiI<+A?Io*9o5&i5yFJ zW3F=5>2}%Wu&b%;^ET)X%Sq#nqd&y?Zif59WWR=c=`;0Q8vF3Btk@x!u$zPOGS)&= zoI4%NZfXpBMRC478(FoXEKYG%MQD;e?sDvr!@@q4G{rey`ZxmF=PI%@n&Y_7Ixb(4 zU0*lm`r~+5l!=#Rzm>}uXEYAy6?VU1`Qsj^qCcUy7 zVuK2EQ3cjp4i_IhBnND4_^8FRma);yjT&ax)1TO5X`aU|XDJVxWS7U~ZWf=<$7YMO z=0I#mJdgr7pDjOGyH<=>K5^o?4=LkjPsRK zGKq6!$7)OFkgFfhO>&BH#E*0eEXjL`?6}SGGTHAg!gYzcYt(W^*_i#>_*mDMgIU?< zgvXn==p6bp@qe=4mWBVxjs=^xLn~d zr8eFqdVOmWUQA1#O3SvXb>vU72q>%1 zX60c@aW6Xq({jLR-C4_@YQnr6ay#RKyv91gf04bm6x?fGYQY7M#g7{l)!-s|m1|xv zlO1Qgu8{rtX}tEF&U3mEubXQt@QxgEV#ga9XDRMw_u*{WP1WFiAB&H5{0Z7Ej<=IbiR;=PaJzzT3Pkc=200weEHVm&!xgc*DHVlh`jQ$6Jc~ z8I^cjc7x4$_m6cf_ceXwXF3O#6M-C5bmHxsFq5?rr%$ijv<%E2Np)-0ZrhMUc^6Zo}xFcr6%IVJa7*&oZogY)rYt#ZBCUnBF? z-_*_t@Hp%`(mClZ^P21>+3#TkW2LVf!A1w=ob%ZC z?1(#%hoL*#H`1fP~2*Q*?6t{H`YuEv=v!hhwUtQUKHS?9^$VrFl! z5AU{kE$?8?IaaaQqnRzfUzfe^GPGW$$KniLuk7(g+$*wQ zoPobbj{by)@lS<=h9>;WQpU>hx*X2(rn{xj;|)vU=jofWo1Vd2a*)9Z1dFrjWa5G` zyysW(nR#4UxtHw93lsZg?Rc-{v{hqo+0SSHXL*&StPZ|gufkk84l(oB=4Z@}4LDp5 zc_+i{kN4w9#U1yF9%Jrn!qz;E+0>2i=gX|qd!g)=_2Wlo-s`_ycKls%dmZzmPaba7 zh{&0)AiY1a$E}#TiaWMHEwr4592A>t=TKr!Eyf>v<8fTs9MWZfdV%vv8o#6tbC#3W zj*W6S+>1@-Np@9}eJ<{N!Sag=u-9xnPFT*FH`!kv#;0Ykc>+gSymu5w%Z|&7KD4NF z<3xPUL$TLbB_vPwSpM1U^Ztc7Tw-xT<@lA2C^t7{qs~0jfzNHwbDZMpuFW!kj{H~l z*i-#~a@d@bcr}%+(6+?+VM8VMlwBGB#4KLT_B-+l=a=6vd!04-RuCWSMl0|q*-P(4 znwhl&%zkYzG8GSNW_Ek13RzK{e-4k9Ajb}^El=#Y>A>QS)jo5TYD^*Cc-sTP)t0hHbKo){@??=UX?0$M1;k^S;9~vg>Zc z=E!_$*Dc~Xg@cA3ykO~E6}=*dOZ}X~kJHyL;eQeNVd)CW60h;T*1NLecsNmr_nD{b z@qTko1wJ7AY~yfLRh;h+58$(MSe}9t%)>03C%dj{oMid)#rT}LV*)3er`i*HEMa)6 z;z3;_z9@Um3s~sX`LnM5hr412C7t+YWWFL#RN-n150>DUa+ouWo6NnfxKnnV7@eJp zhw-@RX-N*7^Kq|vxEbr@fJM+=eIU+XVFRa+K4>1rL9)ls-_OVazg|y}LoSRuN%s58 zahmL^D^RpK9w%HXLbdE?cA(Z=Glx%vI?s|Z93eZFC_ej5#j9JW0~hRl<*q-Ia4rKsl3iXWE|tBmJp9<=T(6e6hHW`4r=435 zEr->9)*Tp+8FUumJqKxgUexX_`^8iEfb5#MH%Shv3fZnT&iA>6>ceu#%M!CcT!4>R zd>|VimshxKcz@Y(Hsd%s;L@chj*G|iGTDvHoRy8u=Df+oE;j?uEAG-7u_gx-Y}9;a zJdSH;l}|a~9M+_Hu@!fJH_i!$2Jk1@vHW(E9CGPTi|l2O;Hy_@%+y)r$w6-#8f3q# z2#ppm;FPuOm8GJ^%(@V5vSY8`F3WGPLESC!_#s=K^vSMn1cMfzpU05Jxya5O&gEhx ziu1qPmH~__#bX7pN!gzsz?1jr>E;&VDcNxrd5avDPT@K8KuO|&`>LN;d^Mv7?<$SQ z*;ZYIJ!PM}?)H)$tAW2y4p=Dl19HgU*=9FdgZ(T&eHtH^y{dlfZ~2^^{Z?E2Sgw<; zGi9&53m3>?M<*_ngWL&RV)?D(=o^oZ4fDAQ)m+aq2$A_x-Cu%Hgsm)VXdBJ=4!n}%l<_VULP2WeIKo8n=A2MYGN zYFGOD3=~C-ZiaK6YO$FeI7(&5Vz)JNkUoG~b3t!npJkxxEzW|94YF6+f#2^RkMC#J z;i^MoZ{sWe6WO1b#Wk{H*{f^iuzLd6oAYP!=M&>&9q;v>dykO9$~SGaJF2^TYVq zu!05kWyi~*@5vrZPF)}e?JVe*m>nc!Jod-w)9Y@EUS?$r5YWufqQ%IR4`QDmO- z6NP9{IOxnmi|n|5yhAA)x#3RXkTbudx$!7L&O9DAcd-Dt9OiRbTV#&@Xg0Q4IFB2h zEq#0hYqHPJ9jEk-OW3aP+I%a%o~QF=A%M54W4j7Y<2A(gT1xRTIV^6(8Rq&FoE15G zOW5qp(yOO-yMCF?rZjw72F|xbxr6*WmY>2Jh(mfPt~nhxmsem^_DlLOCWq|hIAuBY zg_yB8_gXBN3&*h}2kn(O?cR9&m98b6Bm3jSyIsMXGm3iyGx)OQbmYh8Q`|CvuPGdq 
z%wh9W@sO@)1kcKTG<|I5GOQOYXPm#8<&Z^SU$UJ19Q@79-}&3*fZaQsyvj9*fAsb8 zZ~~}E4)YsOD!Zw=#2$O`mRZh1W@1;Ef^v)J7ob`WhDuOpF7HLFd8!Tjz7;=iI9!I$ z%d4!*^P|XoN$zLAFope&R9qo@eM`7f4)VC<%bY)+c$Jk4exi8DCTKUve&-@i{@_pI z|J-bP@+CQ(VR3AEjk{sakb}lloGH8VoWvpTV4tP9Up9bqEGL&+mE~|T3r8IqKUT<1 zUq{OU*Jd0i`-M|DF>>^D>+v~D=grYCSo(Y&PLtObm+(c|fiWxT_!ES9}}m8;pNF9^bLe=H=gr9q^~%wX$Pb zOS8xS_t#mRRg}y=?_~Z|ai3Si*IP~{r`KhV1&4km2f1CizC>&IeCHCff@NdOeI)?DT)_9nZi>fx5S&!i{*>R%eaf_ENVY9r0 zTg#rcoWcq`Z{~Wn7bEi&*0DH|h1=`!)qm@uO2;uJug%rsUfF9Zz$3EDPs0W?`-yFm z!;~)6?s4_5KkJ~b7Mu5uy_E%$w#a@azdq#coNC-+o?I}Jmf_)?Mzf#Dq#2=CUf+l?DlsN9V_2L5At8T_c@=mU>xWsb0GjORK z^wgs3y!hBvPX6`FLE`{M|J=U z?6IBBF_HPiH?oA5!rO8?ae}2(bno_jU*erDoE5a3(gv0j)l+Y)N8ztyuXJ=^UiOzq z@E18`>)3ncwb^n!AiG%>p0=FsJUnDMEXMe-#aXWF?mOf0*LZ98%NgtwY^yeh@xLfr|%jf>J#6#VQ{gEmh9p&)!w~XN!8ieOo$Rz~+EkyPr}#oSMU5EoYo_d9ueZ ziD%>>i_7LLpF1jFusG{S{LRb-ziZ}^-mVB zsX>Yy)OI4na%wV=BRg)ENX);Ovo4DtE98x;v*k6`N&2eAJFD<@*=1zlJd5)_=MT(t zeK`8p@i-m}X&!4XoW%)dZk_ys9Prco3^^QHz$^FY*r8?|n;APO9mR>VH&Bn0&D{QY ziX3ur!Kt!Knf?EqkIzSOzU`YjaHdlH{9=5^oIZx{*6U$QhH#;odpdq7`*Yl}lf<)d ziMezJm&-fZgUuWaWZ??MgYGO`DZAQw+}<5OR+v|fi$?Xhe2Qgqz&c5Fa=6ft*j1Lt z=2O|+fJUXPbg)5z4a0f8CfVh8p;`6|Gtnjoxix5)L*6CmvN)FwJa6tS#*PQ$Q(K#D zz*}<2tLt5kJ`d`#=Rdb zee&ONe!v?NhsZ8_w}bWx9ICii*@;ifVf!c!ll_!59AU>cW+e9cR5tB%ZG82X!pc$d z%5WiGkX_|C{w8~DW4>bXqDgF%!@7K|T25~_{&bkeT$n_I9CAtdyzFt4--7J3n$x|q zn_J%Pv1Z(7#}2dFmAo?3jR!2Bmmd#W4oh!7WbxUi#KBxtt1v#FwdFpx%+k4~)u8J1 z*d7}!t&bdi7P9@h!j9X(%pRBEK4Nh;*WDloW8HYtoIMNwLJgUlia-v_DiF$!MF!0Q zYx8ecJg9ER%XTbx^}PEcjl)iT$K4n^;L`EW%5GpDCz#ot;dAmDYp$Il`z>YoqU@Co z;&jVrfABNSqs{oTnKO#l-=oK6^PYTpgGU$xxIs~6elzTHn-e0YhgZJ87OuPevl7UqP)5oWg1 zGCOuiJyP*7hozon$2EH9ZQR>?yy9!i?f9&`jqP<$u>7G6oNDGg?;}I;X}(;^s+Y3k ze9Oip&i`F<$hqUc%0bIKo{{~g8EmnfqMpP7dn!F|@sUz&HD{&ZmIva;_1S{rPB~zm zvKsT;AnN3>hj$4qe>oS;@=ku0w#r^Tcl66E9W!W`gOp5M`%?T^jteWyVO2456!$WC zr$P?6$hTbHK9YhOd2Klrb@DdewyKw1`qFNXrl3J_kL&#!<$x=m+9LC<-ZY3#OX;se zmzfP%$K|kM0Y7+MFAwWpmdK7xz6UK{-i;xPkC$NBJT{AM`(77cxUp8+svOMZV7tW^ zx)KL$#`ucj9(xb{-E#V;cDthj-#waR^7J1s#P=db6F~K)xKQ@0M(`tZbuY%wRnEW| z7Uh7uk~T)>F!wj(35ESxUWY0CL>`_J*Yi(J8IHBspO$x4O(YKc3$aDN}mV%Nr!1&VvDmU4$2@}Ah8^0urB{9bmPq5YHPH<#hha*)Xl4syutRVlLH z(2Z2fA1p<#<+EARS+~aL6ZTZ%+paep-AGpN@+yzpfb9x9dG>Ex`?u`G@gvC4Oe%&I;TtyMpY* zK5KN{W^vBM++q18!zf+Q%UnN#CfRFd1zn35E@H^cmDj^&HWD0>1J-aIiyWO!DJB&5 zX8RJm;*Q;B@$OwY(VulDrgh{K+@oMewhbiqhvqS_xIb5nMa#)6#**x+#&Ga6@e}c< zhH$v-xN7bQIpmM5Bh7`yIL4fpgU`wyDgiUbDIBt(W~S_q^dd|4*vc};;>&p`F!MvJ zPs zVVUf>K+o*4u4Jvn#~Tv|nLTK*ad_{s(Q+zFam&5&V+HdmxcfdmE}Nfa$d0YwGtKjz z$TD-iQm(wVJc)ccoMpdL%i$gPA~`6YLxY*cJ|~`!A1mapuWMe^IgF0scaiz_Ztg|1 zg-i3$A$zP=)@^19lU~_z6WW{{j<#XPn;LQ=7q7}Yxf$v27H{vyKV-kA1pl=7bTQtr zIG2OICA+g87c%%}BOD{iwe zC#QZV2Yr?JgT=Xu>CbXlUWI#P*Rp_gb6PdBEhjG>+fRxgHsqB*J{LP!?!^9b*p`8V zWRKtDhsZ0PDLB+}I=gU~>;`IZxS2gFkFcB(&Z#Dz!pZWss&SlY`J74mvK)-D&BS-& z^Ksm6V-7go|L-Usz0*y2+gOTsl~?BD{$r!mq~X7^SKEUBSw7dX?J`GvWFGJO zu3m(u1?(Zaj27%^W<^A^$EL`iR@~2=#vV7u$9k-gxu+af72rMcN(bvoMdm|ona17< z2Q{gQy~%2PK=GaJ3;3WN${wW$nP+oZ&n$i@pVjcbni1P^Bk$K`kE;pJm;L6)Vr zf79Y~oOVi%9mfUo+RPL#vUue%erWlHWw=-lrbf`!5emx$NJ>Hb~<$|79N(Rd1U|<~kFDjndgm20|@10yO2P}knrNz10 z;wsC}W|2#I2PY$bB8ST}xZ3hr@An#ub91@b>nX>zio2OgTqg&t;B#zN8etD>t#@Cx*pC&iS0KoE4tV znKmSgZk%O~3a-l@fAxMt_IopM>>KeZg#4Xzy6n|9;4C?qOu^amw%R^?Mcz?di?3S# zL>9hb`TRCEhXYHAU0DIXr5wMNmDS~twLZ;mo`tL}r)C7-H+RqBQrWASN85fk#GjDk zo%r_0W3O}!^W!XbkXe=3n_#7Mi}NmJzL}LL3e2TBD3rsoyu?9QDT*wIy)#N=pR))% zPmUjJl?7S;Df^|R_?NuWF^M;2*IJ&~=Y;t?c5G@V{$n}y{D6|flpg%o;>8V#z4=P~ zPw{|@xb`?j&o`f+GUl=zyhnE2{%Q_Nma&(`c~jf$u^s(~EY2MtcU=&V>9fvkh8)&c 
zpu^nNnmAxn&Q8nktVNH!HN6A_7U!}6vp1fPL5p{FV_0@9;+Hs|(*hQs&cKMdz8CYd z*EonpGn<4yAO}N@DEJl6pZ|3>b)qC<^pn4|ez(i}P-=0u%PW&T?k=y81O61Kly~xf zW0mC;6r)-WC%a?wp*EzW*3#>_B+61cQ_&~~3qyDzPp31Mjn;gbMJu}_bC@SeF{rRt zGKyg{msyU;0aw^3UR>Pmg=&l{$CXXT=8*TZ;kymP6`kX9$e(bNmd*uQ=8!!*7A>E3 z8~@Q8pXGM$ws=S0R@H*{^~Ld!>sR-dT`E@?%U&~^&YRoEV)Jx1_u-=!X2JNQWxuBf zzj`PhGN|rAwd`2h!5j``phoc>!!wD4p&9Pm(y>*YsF(f85;WL2CAEpSvRXi+;vP4+ zG+Pd59opoORiWDDfQ{C!3F7hnp;TNayRkI|Wb%@%9P~SU&rH51JQdF>E^IF~N^!rq5R2xCWSv$w{vY7O##2!n}o~|Q(ZW}hc;l9K{XF0xMqjTrs+w0;- z_3|?CyY+e|V`IoL&t{-qUQKDls2t=?V@%$`P0tgupIv|{%Ng$4ZGO%zT22c0UzwZR z5<5=&ZMJv~KWZO}&u4{SIJfw*gULzU5t%Q;rb_%?;gH|De~>+Pl>MVp))nC{OUW+A z-LlW+gMYGg7AyX-O zzyeDd=GJs%T0U=l=F6^Z7UkyZQB=qwi%(7-6F-(WQG#hX7|Xy0+0D)1F*!_Mz(&hw z>4+!Hm24VdIb*qrJyunHQt_R69oQ`Uy|Z{x4$88yRd&Pk*luRSxmV07TpIUnoo7P| zE|AX&yJp zVR{;VCHrh!{cDR)WZfiq>FF+SD zALq$laVjoK;uE;kJYS9w&h;gy*RT&3{+Be+iCkWG-xZnhWKD;{(d;gsR{ zQ60DK&y2)g<;t!(+2@9{2cxz4lX5&3wf?i^bD7sJ^Ih3ojuZ=XGfm>z zy2K%?k)>KrbsExSpDQBs%o9Z@G_xyqk$HR+m%Sc8t)QPx)n$*@Ua4}(Nr^Ps<)k3f z@<+>&Ew6HIYKiPu^ODtaSoeR!fj92@U<~>7vr!Ie`mrf7ZzM*JzQt;=#nLNN6T3cc z1y|fJ9L2Np*11+ZCp#{Bc|l%Hsm7YTwmg6v&W=wZ=orAy%=~t{Q4Z@S@C(`J_QYSx zj*|vA$zDn~ZZ$XOBzD8oiT#-Z{8l*u3l!aM;~I$L1ex>>y~h#!eM@QVvlQfeq`~g2K?B}9VVB{KL0YVlmjkoFne4cd6nWG zSKMD?$FiZ)wX&Em`BU60EQiz6INsuH3we_4xNY9-PuJp9%i+4c)9hF-EB=z~&86dO@-{B3 zx>F8#1=7ATe%zp_67POoUd+MXvg17ezGlAWA2xGQ#7E7X^iMp&=Ff@;EXVi>bJkdF zzKzfI;sAwx4s)>Vjk8Fwyut+upRpWnH9z)WI<+zuFO#?OBmO)&ytAew zvCriR+bqtXO`aXg%Sc}ic@rwI9PZOIzsxR)=C$Qagv#-AvJ(fbb%}ShHeiQxR#Vxy zO!kU<@VXt#r}(DDo0{?cgW{(cRCb~GTs__X6g(nt<0t$Ei_gt+JxH7r@~+G#ix2d2 zaY!8Zc)$K>Ip}GM&6n+gR<8Wep}a}-yu8M`124&rSAttDXCwpr|BU%ObaNXHh{&HZ zUiB~zlpWi*9i)_v4LDfbG8vQEe(!Q(S2VcW{jEv*rY?M10|s-2iNkKL`%*j{=}7F@ zJmLt8vqjX=vNzF&V{Dv?T3jo;hCy6j5TCry{qjFI*KxCl9MorGK@R)aG*I4FJB%fZ zvk}HF^YvKA_NI2|Gz-JXE^iF?%c~i^STB1i9eB`k8j2Eod^hvQ<5LSp(zy{`r`gIM z6c*=FzG*q^<)xgwwm6R^%b6@l9M+FxS#giu*#2Vqtfzjj?0A`WU*zbmZNdE(W()QQ z%xnYnpuCeiHXlmjHF!k!W~Z@H_Uo1sd#vAk`1A2o3A^iXbYwo%6S+9n!mJ~bm`kY4 zK~p}CQ;zFs!twG7yKbH>d-)aEvga@3pLT_;IwyM*9axhCzFFJN14Gy$J9ely@2so9 z%Tb)C_G}4WvGlBl*bF~VhSwDK_yh4x8>ONlvEMh37mtn4#%mace_ZUKq!2sh6;|Nd z>v)|p>*SeN`8oey#oZ9AeaRsUdhTQ94~P%RtEr>dS6=C;#z$m-vI+Y|<|(gV#K#qO z+yj!Bi-i-j3zB^)dVoSFE;58~4gE7%+Uhk7bG+|?_uve0gs z9J1_Px$LpVbB!F-X6<$ro3APEdQuYytRz)$$2Rt&$=p4e*zpau_)gxlY*9{FG=Wyx zXMc)zJC=K)x-5rf;LJfDi##hHG{U&KG9s9WWMAZhH<~b-T-?f$UE71xu_l@nBxfXu}(dhb;49-pPwMv)|K;H!X*YXWx;%)N=es-oaLs z|CJs4s^?o;#M4RMC_n#5c%f5cGQ~} zxzg#ZXPT|r@Y415?yjw$GSFhkeh;T*~?l)kL9G5qfcI&AHtyJumqvm zm5=iFUwoe1a#-h2_L_D(;B4GN6z5y|%na|y#pzysHJ0U&H|g({x6WtdJ~_yqVCTv> ze`iZZ;uW?;dO&f%t|0L?Ry%x9@sJIj9*N9{eYhQuT6lOC8|=_=&V0y0MISa<4(E~+ zcQ)Y(#a+t;o-y~8p!tvSsk!b}oO@U7kh_DvC3{md_`bZFQHl%Xpdb$y%8v7a7nv)# zHr8@DZ}y|eJpKE+aj~UviNU3^&kie>DP_|zuCQ?3Jg$@j*8cm69CAk8>{?oOo0sDj z54iv78XKpv7B|Trml@WZOR8{Dx1Kq>?_Fu`%TMfQ)#56}J$}nvZ8^Lgzt-X%xwtMe zpW((bT(5B0kezsi4H9m!blwB~nd~~s@N>&Ac%KFgy!BUmgUe(ThWLec5G2MUbdY4X}oG)Zoog~fOo!MmwmP=I{d$SUQ4_| z_pY0FeZ&OpvRfwmOZ=fGdyD;e$#Uj;u+4J1Q?V+Cb*+iFbDCn!a(IR4+p*at2qW_? z-NL7$@OG9DGKVapx?L%cq+y30IbDiZWH-p(II=&Ji#O$v7x`~n4tpPe?Zo(5tt>9! 
z8aWtXr4M;qPA6`ZJ#Gc~rMzP*C2>&Pg_{)L*~msV^2@xuxkdK5UglQWD_+3u7LRUg z%C0#TcUlhnEd1Vb*eU6c=7JpDW%-r;sIWM9kvQOwscS7Kr2yB@@yhDLJJ`k-h#5q*{Dt0GYDSi_2^|Sgyn7JiW?Q)A)z%$}90tb8QXY zkhgRG^=&!GPsM-CWsUeh%V!1N#9SF=_Ujk%U(0FkOT5aD>Hpa{oYqR*HiJF$<8#|t zJ&1S9s~L^hOZJ-BPr;l%g1zN6F2ww>9I{L=EN6Nd&(6kUdex=)haC3S<1N{-%7W_2I~TyVkekqYAtJ{KTsn zo%p!L%R6v@?DOO7KshLx#le=7S%gZb(@ZHxi@dVfhIV;dMH>2K$C*&`O4ksE6!%$l zb65__x)Qt6K8)Bnxvdzp{K?tGJ1crHZt><8OqqF!Vs^YAxyyX&n;S8$l+}#-#C~Ca z;*h(#W|b4vw&O46idOth4%rc|`9JYf_iJl$-+{l1e_PMw^X@?GRjxdJQVwz(@sxS8 zp0@!kCzn%nv0vtA$#b%o+m7eu9o;#2Q4Yoj@VCg(yI9I;y!g zpX16*2L2`QY#qVtvdinjo00hvZfVC`3VZyGXkO`Pz}prd$iaW?(CU2rpB%7A_J8G_ z)gy_+wsO4d?)VgTvQmqA>r5f`P(0wS!FQWy%M$ObTu2-aHsL+W372ZHm+TEp55tQu@?9>@s3vfPxexVu=h~>GWd04 zIAcXG2`?P4lid(|UCMsn0RCV(V}*%VSr7Xz%g?LZ?T-4yK?7?)*f^}{d5`6HHzUQy zWIgm$*_*3Enj8$LCUy(cNRQ%t8J?&?hEi5DG82c>b;wkF2Pb~a;aoPd6?bE7cPj@2 zoyd{bIQN??zg$_1yvTg0bs5N4*k^~OA~_tZK$+}Sm!V2t<5@P!ZafcTvR_?~YqsiD zE?PkNv0HY1sC&JWIA*`tE^i7gve%o39&>F6`YeZSBKj@PTSxj^me%IE?9&ex)+@*8lFSPgG9oy%= z!1qVyTd}DJA5hp!>%>Q7SIBC%^2$sPK5FsVQGPtd!+66@fb7?fCk|Li)Z$@Rc49YH zpLnG+AD_^%Tjw)(o248MP<)jemdpV!UJq0}>}*Nwc%#JPu6Q1wR8COGe%{xEhKTi(II&r?`cVytZ7H=8C z_btD;h=o%0e7Q0I5;-gw#Pt^MSio<##yM`H9R~-ocV@NXQ|5*i94fnVmJX4Fg((~+ zuVxhBNZA`$P8_hd_c2kNe|j4_alBIes&ahR(wio6g1jTIBJs|;nZ)5lHBPi+S$FMo za?sM8*iWs)=WWdX8k}aXpGD^(zmBg`i?T5*dt5-gBs-SpxYy#P9k@>pio184Rq@wZ z4i|QB5*5g>qNo8R&bJ|#9pR@JHTRgQDCz_iEaFV>U zwh|}H0juVoA^Ric*t!0(SmQyqJkIl=Acx&a{-pLYytToK&1+W|g;Hjz&zyxt7j~Y+p8WSN|?E<*8bH zP2rF=1ivABOA9#PT+8}PW|n8UcDS+f8DZZsc?YLq50rOS zmEs`TpDx5F&8@6ID7);g#5-A;{1C-gy2kJ+^UzY_V1iYJY@C)x93k&W>%mcSSk{c= zBJ-&~!*$aNyNnu~Ec@J*e7dD`<k5gogjc4%q^1NDjItF(>%Sc3@s7 zyB<#An}^wt#d5f~VuQtb_4=4OwFVnyZ!`ms%RU>ECZ0@B?24xo``q8M$;M&1-NgKc zeM0eVtTOhLyvFskPs<*EEI(uM<$7$E9WUa|Ay;KStN6-nDz?eNSQkz|<2SoL^4v66 zhs&!O&G=?yzT}&$@oj~@)B=28-pX1@msq@%UBAqnNK4$uFB8Rs+UCR|7nxh!6)Ys) zmeY+(l^^D}CJwl7?#DI`KaI^HyZ@LS>poqf96xOmSIVwu3O7XNQ+bekS}dit5-H}I zMlAd`PI-j4sqTnhkX6>GZIy$eF0{#B1>2j;J};WPN#k;rwU-oO7Flq79&cxn)8Kx8u zE4nZ%dj)BjvtyfzF>g7Xm0UKnIN!hJHTD8~*B|0nC*&9Ww$a!D7Xz=#Uf~dY^K5#a&@%;+3u*gqA;Fka(39k+v(o+B=pwoSwrD#eKHyd_@ixM)0bQ!>z@y$&OQP z<{jMy_=m+id+;yWukFSg7H1uFbHMWMZ(2@z1>QDuAyDF!EW8uN`F5?#!hda;fnhxV zO8oLTcI|m3a`cXJR^e5>YC{cppPBVS-!HFb6ypQ3Uzdk{WRI&eKV5+Nx^pAD4G#%*5vDKQ|26Fy%eF&Bo0Z z-#TB4{cRZbcum~PuNcKwGji|=%js{!C*@!w2Zu!FLp?Kr!xZ*QmT7QrXK#(K z(u!yBi4$Ub+zNew9F!H|AlbEz=;hs{3sD49dngERY$IXFuBj&(AQu`&4(aIB5f zT!-W3aF`X0EzXY1Ct930LQj(Yv_5=J4mjgz4*3Cmy2a;)@KT@te3~0^UVrRhJ`I=1 zJ}Y{?dmxT)<;U3v<$yOTKN6WQ!^0)m&%$h=e1O7F&)^V+!-1;AUg;7JReUvV37@t@ z%O-J{ymPD}Hb>uFh9ec;I>+k1ve#6JqvW-PejFpaQLceA&v)ZEIplQjiLzH(fs-t! zts7soIIBOLE&H6ZH9OXUylH3rB7`i;vfs;kp;!|C6OnnE8!~X9!Ydu!I7oJLT+AtN zP3Lb#%NgR@OL=27MgvQ7Wu4=-Y}KewG_qP z=L7F~t%WOREXnI0!U=Xj+fw3ixB;J2{ADf;IoWc!dH!8-_XI1)ep{>eO?uR)120gYs|chvDa(IpCPP%;&Ya16L@#HN6E_%HDFx|L4-g zKKorJIjod*mGWKDRAO%=6F*UWHKPev%PSpqxF(5r;97YHzu&Ku!>SJa)ZCwo>t#1H zlh~i^z|W$1^j9*6pDV>{F2;>^DDUClZ1K@~{6_Y(c@tj_*!acl7Y(4m;@oi7F&V%5 z+j&>GWJ>SE1RKT3!9octWye*pHF8*5g*tgPmG$Gl5x-hm}>e zMa}0WiDx_27WHJeqImQI8=`GaF(_Ge#CA3NwDkS>kv*wtq7vIjUGdU-}bVNTd#>^aXKnYnj7Q%~d>nJ^o^8>2Li0x!G{es-? 
z;kerL=I9qW?z?x?0mqpVH#qhDOOms>p$_;NB5~8dM87PzTQ{e6jR*8A9Cw(3^K9|> z0{yDwqX5uvNE}5;e^cV^S+zk?%HQR5oSUI_4jbG>^=OOQj?B@9$-#8B2iO6X{T`Q`0aU|F;o*-yJ6GtM#+Z+G(5(^iW#w_H z=u>RBSWBM~*KX72#I4ixdG<-=7JY$T8lIvmw!uoxYPQ7`>Ke8?tW$@zVYQtdqx9Boi&u=9G^lRJ@Hyq;)ywGSTM_>L^_$-uvxP8l11c&E6hkzUXgKVP%;u?%NbfLl##{&wk zh&>*LoWye*v?gAgQM;o7T9(o2kB)JpIc0fRU0pt zc;h<#Jv*$d(M@bSIzzX;KYl=MdV)%9*E2)kX8Xxn`ge(=zs`5XlY8_(?11&@VmH1| z-{ZL1YNj8sPiiLUeVPY~61R0t^h4QnzLK8yfq0jTqy6fD zW-LF(@qn*~A7|V296eo}8l|5Q!qLK zoPe)PvCT};PjkGmUqL@7<7^ERu??2){jzwXg?^LmH>&83tNc30X~P$7#SRsl^mk(X zX#EFverk(e85s}fU5)gQ%-dS%pClVCC&hVO^>{VMUDGIC#?DWpO^n1*cI#hd+s0XS zX)v+d&(Lcn2PZDY9&`9!AH}gZ_e{~{oMO>+LTu(Y=nB~s?PJ7eSSl^Hn22&E=lH2+ zdIS67dI!Bx#>5qeH%m@qe{9_BLtFG#4u{ocx>i#9`{+Y#hZ8d&V;fxRTKMsg$KS{} zB>hjeS!|{sM#hkLchHYLJx(!Io%G{uSC84W>@Yb&Pmdhk@iF=dNpC-*XGqEbDsD>3 z2>rCApwX5ZPx8-j+)lNr!&+jC<-9*9LvCTNE)RL<1pT~himQ^(;_xFG`UMI1^~T0N z-n&G<$l-+c?NFms;sFzT)i~iTKJKfd8+h9i zrgn=RKF8iGw}nHUvhgt7H$*pb*i7Sio}EXv>-V$$Zll^XG|>koAB%lH%ns=`wLz_# zn_+5<=EG8 zjz?t$@x@it5S827TBpm|1(f)?f_-|}t+wrTYS++AS8|S9Lw6*$UtCl>6x4o$jJZ=w zZ)V%pDs||_`VNUN)zDjHoaHRNQ@l1r?_r07CAFPzpbyBl_;6qPK0ilT*1E=a_$6`3 zHuyyUqT~!`=w30_u!zHCD?P~Zfc2jj#8`mw5Zm`{(IXPy!klup9owSE#ryl}kUF9# zB#tUJ-;ixFg<9+|Z}3Tp?+nv7*?zo<^6X=jgF9i{%00E~?N%2s<@!{{si}yKx6$1h zGP3DmnjG6Tb*YOh-La8=JFcEedP@~ul=P`~wZZSlGmeLqF|{2V!#SGxnQ--3sxCR` z38S8Gqo+7-JJ;yjY@e&8{}8Wr()ZX7#dejX-D`MN3Rr;#< zaEbol$~bQDZ2b}2PWICa*e*RneUm6(?$wO#5&*1_ZM*SK4 zcx+bfQJb+GNBTVj^yi!saPGY9=y{l-zu*BMNB3;buqamgGPd72ijC|?Ch6spGEt)r zSVa7nJj(sc^jDlx*sD}K)UuHH846(ijf}oqOMffIOoV@6hvh}J?O3B%a@_Tx8>@K! zfc}YXdN=9SY@cbSe-+nk(dFW;QF;^GVO_eJ|4@`TP0rKlNj>`-o|lIeU(K)FcoGl$IzPFXy}V={iZl9g!mkMgf# z%)orvJL88r;FIw6Vsvi5g6-FEl)<(;-E>`KJhTs0&<&D)fQn_3Qj4Xa;?DlW=&W0g zW7FGFjGa@=LATnWDU8Hj-wakj#(P%SYr(Qdc`;<@BkYj7#9Lz;wS7e+H2HBEU=Tfol9X^ zWxNjo^QJy2#xJH@*?APi{S?~_Ez{@Ner%II&vuyGTEPxz08}mc_?1*A#^GnP0ok@b zO@or(nxwW#g$kS3!5n&7xqtIAH0e_+)32 z&dcs})Fs~Dps#b>qa&Hv;O^cV#iOsWEG|OcCpWOOPY;N5ZS)}9p?3a5633v<&?b4%?zcfpaiBaA6331O5eS;m)Qdy1l`f+;dQ}JuV&Y?}$r(rw}cP`T_ zBI4Wj1Sh@4etMApn&Y9iN`2OwqE~7@3gNMR|GwJb!|*bWTNKC>yM=jrjf{!zU1E>< zr~l0HVrHNIh3#4^>0c$kZc=T#mg%)oJlg$3dL5_Og>AJTYgUK7RP6H8GE<@daI+os zdfvEzrc{@+{YH3ApG?u^hiY=i2-*G7&WhiQ7Bq>t9AUG*xxpW{AtKv#Yv zeyR;BT|V>4*nYW|o-f9c{2#LIQWd>`9Z-}?eB4k)f6Q@{#kqFbw!TrFU&_&oWZSyh z#8_M_@ryWTFSgCC^b*N$JxYu<2<15L;)M-*X_Nxq(@THK*?IgP5}$6jsf*ZSiQBh7*TkH`_-4B%5wm(W}`G{b(;^7cu2p?B}qiisLpjO|N60 zt`5NF4JYGq|oZ@yT)CRLoM`Y8EF||wes?FRSjmox7=t0Z&M=0RNE@B$X zxWw18G{HVaA>K*0*=kZ-H2svgpWC4{=QwZ}z^|1X^zdF>RPddl*+r8Ao4(n*g92u{MyYZ~crX7Ro;(AJL z1{Y~4$|-;S;Kau$Z~M?Djq$d5l69n8!NQQwkV-;z&48+`U=|*kJ8-|$Nc@@|J6^IKZsn-5xs_;-*2HS*dcXD zZ({rXMtUpTpl{wgCEn9T?-t`Sz4#PUfv%BwLrxuXM{3i(M%P9;c;3&U-zcXWe2|KL zV+Y;9@lcOT51f8?mTr_VKG(&jVU6C;aXZiu8>2sVNFR{!72A6Bn&h~vsZodK1+~M%)PLg~ z--MQ{Jme!2^idAmr2}Oh5fJI}Fyu#@)JWntqnU zZg`rWDd{!S^z#x&?=cdi9gx^!LHe(7+(#<_*a62eYE0PqRf(e@gxH}C$FIwnnMV2z zF{-rvrnqTO?NJw6;-wWdwC9{Wz8Rjww)-jiEw*pMMO5)<5_g?T^g9yAxl8dWUK!8j zxIuX#aegXIzbo;P8MVdLRf*fW9(o?h-MJEe@*91m>Gde*IWt z)JxF#C{_-0j<4EQyV(i)BaVl*`q+4S@5#~&IDCrf!!MMSjosKtxpRvCgu|v5&3f2& zcbZ-#@vVOPQ;E-y(#yqD!}PaoKaPcz5=ZrkS4$johFa zNRHoXRR=5&lXysXkl`^MiaA<#*j=WJe6x2qQ;EZQOtcplaQaAW2IkeS>qzY@HxeIg zpjK=A#9dt5SC>#5LgIE|ho0gbyD_GAO<0x~#p7Q>b^4D2ijIDpH*h#}DYjS!^c{|e zu?hMw_9IOZ?VUJ7dkFpEZee}2(hc4e>TeMhw zQhYQ^-(>sx26X{NQ1cvjXb5mDIpcM7!Vc)RaVkD&CBrtD#A(?UYl6gie7t&&pW+l{ zU^^_2Rga*YF~>bh2#WJKS8`FttZJm97^VEhF0+#u$9YM7pGurx>}ZdThwb4?I^(e0 zA4-hrOy7#)NV)Tfo|0Rd*;kvT1^TvZiaEvVX)JK%cz&mazQZn7jjQcmHT^rs!^&pt z=v6pbQ$_#5|M0DA^q-Q185v?TvPA#Iahsc>{}yA(!T+!g-f7=wI~;-jfbDAs)MlcV 
z{#W8dtMq^DFjt{=)tGq4ansvIKlYLMl^0MG@Hf~N71mE}96#NkAd}AX4|C`l z$xm40klJJE%Z(g2Sk3r8b`jqe;!;BrKWkgixbJ91W65|-yR=3hVEc^$b!e$mA6MlS5?AEVpfVoQ)YsF8#rRB6?<77(k$KY zkFo=n;E7Kw+v#JHk5X;ofG^y#qX%>x*Me{6B)c_4pAaAIC&oMMR*suZ6fcuOao8vJ zD2Y)m@$E%w5Tlh@EBmBok$Tv^XM)Bg2kq6T*{3)Ia(3Nk%b)*dElq!ZeQb+s@h=mj z73Dv&J!&}oi^Ng5=oWEml|IgPi?|vl`2$C>@uGZWo<1QdD1P(F$OzxpN4Ih~Khs2? zl9a`LwZRqN&q#drP+h=*+2wJewJ$3C+*Et%Pw{eQytrOXQ*ECchJx|GKT-^5H(23R2`)_{BSQda@b&@ zqu6!KCtmJQo1;T&;(WJ@br0-N-$l(b4klo?NDi7ww2Bv3s9kba7pa472j{4h?OSTq zuD_qUI3DJDsE=JlhrV%%*-Y|?5S0{ zA~GJShc@U#96s*DTbFGQni8Yr{>`%KW-r|$IkVIBN%7$_-6}az`9HQ#4bo>NzPUx8 zlYCsg`#d|pv_W5xIHsA?&&a=tC2$Npl6mECO;Z{1d(p?9ss z9VR`sb503mw8U;YLmeE?Z+B5AyHJ68hw1^fMR^Q~`?YPgYu=+S&NoLR)X(;KcMXb{ zQtA^-VjSVP#nU(;+hSq(jKoow|GdPP8)^8xpNoHJV^RK7;!a#vx{>2J(5)AvJE1tN z)l)mieMJ*>idUAXM~siWUfCAyfBGXw-#8h30+;{5NkqHQM#HkfFiymZClWiH=$Vj# zR@Z4#ayl2)VRK)7jCxXO&M|d^YKz%25)Z56>LO-RO8ji-lANA4nv!v)NC+BE}Q*8TW;_Xef!)brXDO9Xdmh%haS+$*8Qv1CmZhFUPR>vPtJlCi; zSkX2oIe7fVZhKuF&^AKi7jZB*FZo@ltIIBqEvQ2dP3Af7(QIsqeTD)btFkT546d;a zN_B0BaSCvY9rliBN4&5?yX+E<)Xqs9)sWuxzWB8lx^uMmt=PV8pBkUycy$f!uq`H7 z{nEENelVwYl|A&!9QWhxYKMY9&*r#6f1F<_t`!@Nq;DwZJ^()7n1Hz@o4Ldii-0QLS_Yz%XI8~q8V*!@j)p}L1&#PMTHl@yy}7t|J4 zNM6d@dd#R1+gjAp<9I;1U9rtB)1PtNqUdbd*o8;7>CYvMN}+131Qq+*eYL4>rN7|( z;!HceOl}I73jdN_!gOSD9y1MI!Ex7tq9>ByxTrR}o%FX+9J_N@JN><|f&M|ZJUXCP ziubGOHEe@&Wv`R?3|0V%(SP=3pZ$FK(>`GJ#-DwTjo)7X$UfQ3(d*cSibZ-0`)pvA zu97&41;2xx$Ml$Yvi;hQ`XWB0C2q3lzRWpcex9yoANQ=$yV(}Kcf}`_^>hu#4Hjx% z$M!f6bG_v6w$gjpr}*V?gT!ZM>Am7zG(lmT+$h~B-p#4qL_NJv;+SIm0d^S1v7W?p z4fH{_+gXVny@eX+Lz1$LMOSQ(Ch%f|P8}bXP20!mCU)pdCmz9}9>CwYPZu)AC2PJwR`&LW1JEymt!M*evWRDEqfN#c7I*%P@`CKLg%JB zj{>ZpkRfs0CU*O2`lO7Bbt|`uu}2u_=kSG*MQ(vY~d1)9TahBbwu!fXX}KB7Q*6%eFXd? zhw82lTH>&oL|F{BYuib@i6#v)iu9hL^y3`(59IxSc9v`{avi%%pptD2k zklI&l(e)Ca>s9-Hw1Ag5x*Ukj)(ZMv#dpP{3B7uMl-?`(lhf*BwB@>yMZ*8V5t()0Z!~I_tgRO5F~CYD(EK3 zM>%@2pTUX_86Qi-#IAmxKEiRc*+>7zcHNkiA31t3N9kh{MsKoP*gmzLcx_K@*81t= zoMSOvqHGNFa6f%Q!X1_LNf~k|mAGSxZsoWcZl=$${nCKiq08Q9IUcG~RKd>UWnC}% zDC6A7wkQ$NDqcCFF1D+#r$O=I3iba^Mfn@XqrK4|J~y`A&e7HE)5BT1R^nsZbOYOW zVcNdLQ3&b-67OuLk4PNFLqE!P6^R2T0!iGo!FI6TUT%%if%z)p&Vq05JAD8{aR5P)uo1sr|+;r|HMy)xCTdXS*=chO6 zlai0gq+*Xwh-KsDaN&S%l@yc&P%k#AEhd6XJTwof4UXnNCA&G(rVi+qSdK@ZirLJ7 z{==Z>>!*1Gi>tHZ(7vE9;N$f(GU(B++T&EQ!~?pumyP|uubVy_rGt;x$Ba&VU2tEn8yR(Is7{PprIHg+iAH>iUl5T(WAD#M~Tt! zKyvIsn>uXlt4#&2v+?*2Q>Nc1Mnzt+Z|zYBG_#VpL4|_%bH2sI6R}5&74^!5+Te`r z2PD6`OYJ5S7i*?8p3mV}P4Y2GR*eE|;;@EeJI*PMSE|iWHGPQVW^YA(JWxd+=D3@~ zcPTrdR;buy=jkTd7R#?cBKer9STMd`J`G zY?InG)>B%xMf>BdY+H>wPZCG98nI7}(Xzy|6SN}vy9cx?@%sJ5SSl%ThcWbjg6an6s?IloJTcGabdqkeTs$w61S)z zBle5E>f&ZYYz%p4mJWEx0%~1}Et-*v-BOa{wt6)lvPrx+jYV}l&RN|?Yz%*Hf(|)+ zy0)k`*=DszQ%%Y7hi&RpT+BS;Z5>LHi2Z0&Vk|U}cxY-+7g4BL;>W$s^hF-$WPVR= z`C1dm0gXAt$M_j=8^;Z%hl~A8 zMjf)fYKH@{+d1ED)vJ99eZ@KManAS-apf%CDMoV`vBh$lyJQ^9!4L<0M&Hfxke#A? 
z*k*W!?w4_Jp5-BNQzyOixs~yG%V>Ef+t01jCfj1B?iSk(qiv+*;G&SYfZ~ul91mEM zbRgT#jnfgkxUjE2!F10rN)E2nsMki+4)17*o3TCm66d?kWpxq7^(AgGW%kRQejcmb^nZsVL2G@=oEv}{r1w}Loe)}tDeGt0)y;hqUP$HNq=GwM^!Ex(=PW^qq# z*Sggnr^_TKbay63Z`5)e!=FbhaURAU_Nx8VdSVpZDd!;liGI3+(=8gat8?f=&he)P zu@+VK48PC6D*5Y;YS%iG7!~&<$D=@^I1F5}Eq?#s$>Vse`Vp6^_G3r)q=D|@l)@-p zr?LxJcrJDw>xoeWS>hfgeAMIBiSc}D9Mc@d0i~?fXd$UyJXD`79?;#q4=3BTYK!)b z5)YG0>LSigoR@uWY^JZVL-VTI4)xGI9JiPSFZS(AYFC$`ugkVAW9m|FU2RYZ`(DXG zTLH0O-J%CLZdN+!LAkB@EL~vxxjK4C+|o*qu}=>t=yA!%#E5UQ9r_xdu)|&h8Hr{D5Fz(X!^gVW&; zOMXk6+7EOlUTRl|!&bHLzsYdA*vF%Ud7FWw9?)xVb;}BNjVYwkTp6f5q(vy5sd7Quz7xu=~ zp{Xx6(r>GvZ$|0h?Q6=r+G66jS)rynF=l6#vk^C~y@~rWYKtzll4CKmMC`g!>M-A< zHocXU=N+~vnWDxbU$LKWRy*_+EO36Ytu}V_BAHkl)qmu1YV4SkO#h4eiAN(PASTqT`y+@r9yIFEJehT|c%l^CVmB<`9g$;vJ)qP?tmbc%v_y+&=j zF~?fQtlFfDk+ExccG9;v?1vlZKiGD+iT+cJddojxhvjO-0o71+BTrJrEH6`hvKl+pmw`hn8!K3W>RfuyXbm} zqtB1{3|E`Qb^vErB&TvZG3HrHJfJ$V_zWv5%8p*xwR8h-YOoMneQ>0{*=uE#DaK<83&>6WuO&|3u8_SQE?m%i+jtmFnYxarG&dzDtg}LeK$C*4*1fO zctF!j_3(V+DU@)OhXiHr%Es>8iHkBE&f{K*&ETfmqQjBIj~nLb{jv+ay=uQxp*}st zRBp}*4UKApYhNGYxIxvHkH|Qfm;P_;JUYwYA~{1jb*Pz8m#}79b5^VAlbr82I@AG| zjAtaiTSaSZ+dM*leOXocFvu(}(JgF?@*pD;U+t%Hw!^RKNp`?A+6?;yr#|+?8#y|Z zd@RTnpY`|9k>ubv{} zSbb83;`4I5hmC4Gx1n~Z$S65(1ji{dX4kCxWN?hW!f{($rw;fXCh^d_5<7aqH_#n& zvv{SdvEoSVGj-~KlK|y7Hhp4_zRH^(H&&@#)kxxjZgmOub|k-mOFeh;whrAb#pVbH zAri-=0d>o~+P7>Z9>#$L=Z7}b&jM$6}73*rC*y8Y@23X!9ZVD4r({SPLvR8(nk{k7=-mNo>cf)NXq%cJyJn(AA;; zu(&wH%3xWG&9Gun@``L{83}2WMKgL{v`#ImQ z=F}mbNxZzFHmGp=0Ova_))#xMDi#;ftLQ=5hm8%oz|Lbn!9#LeGt2a_VQFd6a zrpLrhtMml>6ra-%<@hsfG)Es@i)~RQs*>$TDyWL>rYERI;x%h(htG^!jvK5mt!G;- zp%DjExfR>aUTTmWtR5B@YH{&b#z|MHEvkAoN&dm0I$(0M#80Ygsadw2oL867(Msbx z8`L7>pxk(?ahJb|Wonk1Jl1Q>YlCb{Qv?Nj%=K4sDo>9_8S@b^(RXI6bsA ztMh1{*2(e0{)GCpEvGi^n4``)ZoNlsF}1%-;?2wI&^xLwZ1+*OjK7yr8@wDOZZ>)o z?`PE(SA8VMp?#JbD@@9c-fOeHmkfl{WIcRiHq&>B7t|rutTw3>^>V%$7)(5Oq_*`f zYKNuSk{_`6SZq;?ye}FDkMox&sb7d0?J_z#Z;5SRE-{utO5D{oBt|7%iF*{sdlNF|=fFQp+Fe{PtD*`a<{ZAQ9j zB#K8*MH*MB%YQ_l@`E%g`8d;&W}l&+=?vRorRSU&or`jkKeRa&VcXtXx{O`GqMm=2_|S~n;Uv{-IUd&0vYG8L z=IbOMRcBr=anz3zpAGcW6&ye7MonwBK@qbz%C@W9^d{MMv{voMP;-{!VPsLA#~Q4+ zaJ+twc{>r8yO zp$=F>D>)&9iu9actgT2qz7rd}|J5;irwotpUiA?UnB}H$tx6oa<`bi>kHqu%b@VRI z&#zXhPcX$=;-Pj*U8-wOjA@uw(R73;@u6k-)>b0%odRRu(zo;gOhZPjMI))!ffA-NfY8_9KW+m zID;oXYgkYhFuUMd$!}~=Jc22K9M9(t={ojtV@7Q;TU_EM)lTo>9E&+RH?RXHs)$X0 zhTh9@kNNZBfWrr|?Wmv|WgL{HRpUyr*rES{IE*JbB@}?rIOZ_DPxj%kn%*B7Pxaju z^a0%kOjwYVo@sR+)i*vUIjH*cA$D>4fIbo#!`y+=!W_0ZAiS9!u<-5;!|}(WtIyI` z*#;9Pmqs{_&hG1xqmSvi9aQ_`f0*i7+GgABN!pbSQ2y#Rc0er%ap7p1ZkL?R9r_B} z;8OG*62~$;vB%`XJ2~##_UJCQZO+l%;>E_q{T*~(;#jRA_TBTb@i^Q&OJC!ZlLd6U zXB(WcQsY2Zjq}3l)h@Ng>O6^CtXwM_!=K0HM;U(UQ0=hbN#X$&rPW&%>SHVb`nv2+ zOS9T{O(ve`R2OkA9D;wd<2=hq(n3cZVHnKisblM?JBz`}g6{Kz6^PQ*Fjp z)UJCmF^Z*2e(0}O=d(w2zucAVn%d5dssq}RNKU}nad8o4E!92E>ikwCJs@M^eJr+f zV`_(K&=Pkk6l~>uk1j0YP}8a|U`ob=a(}A^)wZfSasN(Y6w#M_i!0OW^>(!%npGFE z_EK_6_$(2hVI_3gc*yT+q6>1j+UnF6>nY0d=vhhALz04*ve2Q5~=XK;mb0Yoqiz3L-I?%SrboF$aK?HY_r@;?_t}KN&2+pG~-u4+c&Mz20M>q&o4eO6 z^0QU+74cqQ;_-U=s>HEQR(#ypLw9oApfiow)eWgF7FFKGId*tw_-om zs6HF2i5-07_mu5ON{Tx0ndCPknl=o$n z7b;?6j6(Hf;ys-1;oUWZb!t1?nHb9~B*%4kB}SoZjb{$k9(5We?$Lc+eDR=KUEJm?&DDl9Y+V!_5KAKktw6~LO^EjS+NblpC`gj0e6dXUr zVnwlCpHjPAwc0o1i-Pk*8_vRsC!5rH6nA`>ijgO6Y#cfc1-QNkJJG@C?%(~lU1MLr<}y? 
zBBmMeI3CwQ#3$${^n~miYRZc3Y7)=mJ^u~P38-Z&_PA#8q>Q;XpgzWV)jY?~dS=yy z(QPVl-0tKOFX2iz$4djlbi&T(ddRTdUXIRW+woy~ifvZY^zULcar`gIsU4wTd`|pb zVmb!sbD7ub&|19ynC3-E} zV_}0DbxdBzafe*|dvK*}yFQ(`_kiBO@qqKWW#i>@=LWrz!-Y{)XJ`B2rNkra zYKwD`Z;~9WlNXzfd3rO)?Ml7cS9Bz9sZra}33`j?Wt4aiPPDMYJ{ktGi7OI!j#}%j=Cbxy@2&$XmTy_Qu;t`QDWd4&bJMFiRY%(ZU|E=WXu^%tzZWn zuZvGn9N=1xn+mj9664B``fy(DQD9Btp?)DTKG8J3kM%M*RLv6Qyjt1a51j z>*c0!=HxwWi;E@V(q_Lp;3CZpl8^N-?`69&tXpAw%&Wao#z6t2_pt-kr+g@K^aE>t zU@871x{TPjL^sLC-RLmHE>vLUyljka#~)=|T#o!0+l}xb z#i4(J8YCZ|+8gYUYNfkn+pak($hPTO`lpxI^0{y98#|BZ;^S1Qcpj6a`Xv9Lk@_VcORNUiVPcX7*=BZ%h9cu;&-KtShYPi{YBPUC zBa)tOrct&Zt4}<&qRwNk!5HV*?4J4*%YepZTXc35pVZ9Jgp6O2QF}bf5)UYhDK6n+ zv-lMA3nqEn)1wx(8JSlXTe{VyhH-UaXOhx#yK~!Whq)vYcjyo&_HC#b&-r0^AvPYi zJ99J@g~5j^W0nsiX4HkKs{n6#g4v&9gHtDJENWAa^d_F_Ogz7oc(+1r=Oz-P7lw@E z`VJGLkfg+Y<6z=#6kCvcg+iF(;`o%hgx|R3c=WoS9cz#Oi0*nXW%#aNIE)$H^=b5{ z=OObeoDv-gVGggvk5}i^C-YNkvrw&eILRmZ4n?cfn`q3y<9nQa7YEF?Q{(KGdi)?Z z_W1TXnwDK#s#OClzUe?6#>Umh4gIvp`ELD4?a{qzDTjgUH6Fk1RdH}Ima9=#4g{jN;Hp*45 zbHq4#Ek4QA&~I`47!~Y)Tej`ZsxNL0)AJjYW3iQ`CGEoB0N{ z#T8_U`{he=D*EUT2?IL=5i4O7-IzGno)~pd zB)^CfSz?2gjejh+Roj@jekn1|tZB~ry4s=3pTq-dDXIr^>WjFZCGo;`J-v``%e1Bw zqp7RL@#P{8_+hHm+V#yDl+m>T$lqgnD&blH=BrxbNuHoHMlSll%fs6ugwjId1Gzo94>In_X(R z)~e2DSLjc9Tbr3sJ9Mm+xH}wCmuk@)LvE`9-RET6x_z}vbtT55FFAf{LmfKMiH^4| zcCE&aJ_N?Gm_zdzTc`2wYkJ-clmPOHlP?)+0jd>i~dq}r+rvmYFwbdlK9aOy+Urb z0^RV~e(OMeTwg=vCX9Ir@8!J6u{+qZHt)qImQ;PiGhD)iNj! z$1fA36!`1eW?-2vXZyY}`qCHb;=}0rLHf0d*daYbuVK55G-cTqH?}NsOoP}GV;%D$ z+u+yim)He#RQfX8}#CwnsTwD2Q-b-lkyLXUS9Wbl3l@7FSbVoYq7z3 zuzMw5iCM{!@uGPWvkEwTj4uxHNgZYwaNOX6T-nj>PNCR@{3E}I8OoegLZ=w9Yu=>? zIPO~K)P4+?pg0~dJxHA2?V|_f#&B#Y_8qvqFZmU%v9Zs0;F63C-_ep7y;CF}P)4kpFS(7#aZQQx@i z=qa6OZH)iGV|7;rJt||OjE8!EBQYusN!(?c)d5YgB!1SDja?oe?byrzh+jWr@gLER zEzx7qje+;h#l-jBN!^6{v@)YM=-^XMkG>OeDnN}hA7b0yq;@F$CULiwVhB$^$=c`RPUVad)Nq6qO<*$JJcoy3r(# ziQ1Zj3&!HG)2l9`l#9fRJAAv?g}YPqgzUl;R?@P4~)MEG*JEPAjQV&{yo6*VLs^tj~?c#7_o6$}3oMWc96Qlc}#?c%}js3aE`L=C?itKyMf7oX|HR}A~7JZN77QdC$hfQkJI!@o`9E0^?;(%ti z;zDhPe!w}dtzR8#cGMsT_JsaxK)RY*L0wkxHo=~5lo9O@XwgJ;c#KndNwQZ}W zAIkWhorzOT^dqm}rv(l6)cB1i_NWD-#yMf}No6fPjki6iTTq*!R<*}fSIG(M!)lLm zpg$UIi>KnYBlX9=Wz;{gDo~#$*(?~Qrn74Jl2}TGn;CMZcDPQLt}1nz(No8=7c&w zouZ$R`!cTGI!E6H)Sn;Lhh?jkv1IN68N=4>2~zI(?_!jO zeq=8(j@%_DtaYhNDEubz5~`t#&(LRM9X3L z^q@*@s@D_mb*n9U^GUw1Tut1PZ0i@Bllb%q#RDEJBc##~3LkY3R?1r*& zcOM?4pXXstv2H_b29vnMx?hP0^rKeenu7QYr@6#tWQCq3cXv9M7#G?l?phBMFRrQm zWD@r{tdo2{+n;zLi5J%@H12Xk^b35u9+m0ErQU6I9?MIAQTA=8N$tkE)I~JxmK@j6 zMZd)PuCH=J#n)PI=KW%>i&z@w(s3)x|(MQzq`idEvM9J3brsy2obq>k|6-yKmGars7aJU$`r;(Rx@KzB#RJN|A=0OxQX3#`Qk zQ<%i=@KEhjSRTarp$nA+c$hnJ`b)xllj=fE8{NZkH!zbpol1;Z8*UCf8K@jGxT-7seH8Dhh3;iscmXQ?a*>Yay;sOiwl@sbuVvg(D7GnaU7_| z?x^RJxa&ZLHs024HK;vG0Z80uyVaroC~?z7Y&Ua!TbRGB#sRaqfYQR^(>^RjlDpfzmv}MBw^cZU$vL)mH8ILFOWff+ zjC!_4?a@|8;vU7H#G!H|@n~P-s6In%+iB$SEev2UW|2~VsnIv@Emtoo+1vj z^RY3^eUo&7N4#(eXX7ZI@;rK`l{4T%#i81?V|F|bZaTUWPtPPqUqQ+7Yb)x|l#Cy6 za3VPuv1(e4neXBv7LKa1AXZ%JtBs92@-?&>m&X7#S;U1}oYLj^F;0D|dokgj<0oUi zYO|8$SWFd=9G4qW`!r4s^R}U5GVu&1>q~qK6aCp2Q6WiuhM#JQF^NCQi9Tf~(SBO~ zQNY@rhk4@y){&@pvBZnxW(c$R#Y>poFJ4Zzbu+sfcbHQoIc~jL?eS$UagVF3;!ulP zgFI$vNVW}_%%bu3OLDN(P2xq&6fGOi%U9C$NHh$1|5D88*FkD0G1lvsQ=&L#?u$dq zn))KDjz~P8o2Ey3&rY^x)poHe@%q{&K1z(aK(eiCs8@Te`;d6R)F<@_7QM-%fOAe| z>+=WVX0*K5++5Y)gDKX8bX(CYa&`j+Ao+&#XwdXH+bp19C);AlxVYG~ zt}dWV+7pt48Ej&=xUKeRzbA3CF-+f({8ZvX?FK!`@e|DGQDdc>*mU+M-ozPT8D}wZ zSWIa=k6BLNl$-_>PL%PnIxo*Y-DpzV%EQFt%ZazE)o!yQHeMU&Q8!VB!8gCy4G*dV zx~WS%j|(HmobOQsS$tAANhcgPW1DK%xtBOIulCuR#8~z%+lJnHwL4s*QyCMB+l+Vw 
z{p-becpTf~_f=p!RF5f2Jh!B_T}SjSj@t&DUSu0gQ2mbN&*4;~H&4Yx>=1^Ez{d22d&)1c3@HMDpP8Y z?=H!4yD54HZ|l${Q|x=T6XT^J@qqRI>h7b&+quMnutbG#z)quOiQL#x zgW6-_uEYa=w2Dn%FTG3d*up?!oWv=|@ftfc+cV0qCcHNl zNAdQ~ZyqGZ?Dg`7Q5?(M#RXJ35}(vfs|`xXO5AQ@Wee}48*WQH-K+Lfi|P`ZC`tYq z7C?y$yR&qi+*d5l5}%+-li1+WgV@$BC+<#i>;a0J$o@1XIS#dEB<|aG5~Jj$#EY1# zBQBxcYT0-&AI{MAJY*h)io}Il)I8;Q;b2{Tj4wTjAEOb0_+)QXZE!|Z;s)o_)hpF% zi}G3$cf+lTF*8Zxeq&A@hNsoV(e}i6yO#5DSME>Id*rU5L5Uh`aT9N=3uyi+@#EB- z+N>-lUPW0~9@BR1C0^K2hn{5HfJ-NmUqlh%vaxFqWa)-z81Nxfd)2+fvL*56p86Oa zjwEhSkWk%qNxTo0UuDl)lWi?3d`aBlb)?1wYW4VRVl0l7xJPv=aj5D}JX@7`IoXF2 zirY&587i`r9esXHpcPa358U5#D6GT#dE9`uKx~8VOX}=^+76=1Ea$kc+QcYKB5}Xm ztPZtFPCz+($qB2i>e9h*Z0ydx3v?q7U#Oi?pR6yc%~qY-E_A5f0BZQ~wxt}Z%(C;@ zK6)S9^iL(;oL1YJNwvcAWNU`r&*Qtw;lwCOs&V{)7GGRJKS9nf z&aJ3RHT!CVzH}ep9E)Fw>eeaxpu|xjUmP}OW8)#b6O98orGQp6V%N~E_L$u)@z8<} zx11j^HCmlb#wiU?X#5Pn8p=7*D`^U)3Cn*(w~NMv(Kz5;nV8XsIu2~hVYslfpf;H< zwHrq>UET#ZIjRmQK`!wkCaH_h&?Q{mf*y#xt*_sto7iRuJq;y}WrOOu!PwF6&n(yI zKO7bv{+sN?d`cY_HWQ=CM>!|jcoL->WrO?@Iwi_2H*c#gioZ(SqGz=DsiP$BaJ}_o zvMoM4Z)OL~6czhclpvOzfdTpi+oD>!daX|FF}XtGCpgV?E9al$Y^yk6Ews3R3hbZa zoB}2%e46c0+2#wf?d~{Lvn{50*0X)Of>P`-F-VPKTw&{wd|WZ>W*25_DJMpo?hP>t z9&SdC9^4En+3xT^>;UEmmGBR|g#5?={dY)}ZE)V>+KaL69M+AnO|F-|#}0#4^t57} z^Hh3+ev54?F!P*U=)`ne_DS7T?C7(olKxiGTbI?R_`dynj(eQ4x{U3%d+0Un{6Pvo z`+3)<7Vxtl#*k08;1@lIaVGG!?0_>nV%NHgU-xm2pWly-O`q7uk9ZC@_0k(;QxwY- zJ6ykblf*IgU2JjE=FJ>;=^gwK=W(!LKzzKBqN^ll3`K5a9Gp;6?a4n z-yXpa_jvph6tfY#=Kk2_kBypn{l_s%wY{A;unnt;J2nz`Vc9$9xMeJFX9t`wQlmtv zxQO-AWk)xb#gSb34?KwH$MM5G-nl%cGl)-c^dL6-2Wnf@qjvLEYM)8+LwCQ%FK+Sp z(e0*DV_F6+RFC3Ue|*CRU+-eu5*-=Daf=zhYD`NN2V7ben-psMbH4BHQJT6T|x11lH;0JH14s4RdW1xS{;^9w~)6jX7^*`@w^Wu3FW5Rs}qkl zCf@H>8@v!D-)53+?IN1Q^Ehs9LG7__RpK5Kkj2HOesvx#0QiGtr?+{QMWunyKvQ#j}9gbmc#=3+JN4hjP~OIy7!^ zidk|T&i$xovudASNQ^4Bk`pGY6JxEU#LxP8Kk+d9|75)jSX5V_fJ!;erCfd;srV60|1r-z%FBlOExfQpfid$6? 
z?dXXPWk)+WY6pJ`1jSZ~p5R1JaQ=5eRdJsG=Xu`e`_{MDUibUnySP-z4ufQ>65;=3 z_;Ox(vR|)PyOzG3c#`w|dTR>*(_aZ?z6Ym z)%x{zeyq=D@5tQe*KcR2ty5n(w@;~Umsc;_$mu)5I#S*lGRSD&;Ko6%=YW$>3+SF|2yilck)op zuk!;=KK<`3v3&a7gSI}McE4WH@8=fZ zHv9bCHdp)gmNF9?w!V8?_$g(d?lIK*^>*hMe!ZiCUz7Fvzg2kHuixHh_3NFP{8Mq| zNA;~^?DhJB_Z&;?*LhvRXTQaJLO%WW-i&^|+|jSQ41T?H_wjz6<+JZzkHN|Q&KuXM zM8Dq7+p#`-hxv$Kzs*nh`RuoIiu!bt{MB(bX?>}0RQtQW5Ic4r@$0vFpTK9o&B3ux z?<`h(J^ZVIwvchccfsSRVYAm4P&;47`}A8U)BAPv?tZ<>;n&+gx3TZ*ySLM@->-X; z3)s?m!%XEr!1(+mW%~8IyHfr7?X-HfbYB0tschr=^p5Z^`24u4^vPbm(_qo3di9(9gPb&LCXFpb}zv0sjc72QA503 zUcG$}|I^H8=gl3T?r~)5Zz?-g@weUjcAu9c_C6i+cg6Y+uepW)g6PxRdEn^R4>$Pr z4kiSjy@MZG^6BkKpXqOT!)fO~So(F|ExhfEmLJ>l>7HXp^qpS2=a|#4w{eo@3#b^WNKf(9x%>#%$H!_nO;}=lOL$f6;gO?EEyjU*{=~PjBI4%kO=D__YPUzB9jH zHzoS@j&w(#PPBjKhhNpLs9!n#>VG8q)xYsz)9c~J?mks5yJ_paW~Dm|{CewwD!<;A zS=+C(GxPbsxzFX-+xds!K6^XAob1y(_y*Lc-!dHX>n#QO`iI`EwpM=O*W3AE&}VPq zh1~z`_0z%+_4#zqu|s~nl}{^u_O`5}{rb^5zkZYd3*__D&i^I!>z}9l^;^c>eLCfL z`IP<#Z%H+s>{C_gR2ceod!1kRm=F5(wwxTl-frr*x1Tufw|DS=@p^+N%#JTI`t|bq zew}}E=1ao2`}N)4)LcE;r>fMH9`)-U{`*g_7wqjugWvm^&-9PHcm6r6PgQrG+}W=e zl=kbTDSrJXzc<)>EB{P6M6l5adw|h>0LUd|4(nM-&Xq7rX;_5Be73a zY2hQNUeD;Bo&EYvmJmLBhmj}RDnR^nu#)oYN3;5Memc`{w^{mi{*Pdkj($Hhvb>XKz2TuV1g+<<~p%s{3_*+`t-Pz|({DQV_vvKtf>Zy0yk1)PO%uOf zXz173-uUdT4V8YqZD+r|o$=S_r#;{1*E^Cd{W@HY z=P`YVSMNyL@7G(Ziu9pgduv05UvF>dx8LI7`p>+6Zt~wQ2l?~{z8UfA?Y08H-r7*E z5A)jFOqG7UJ>`@>+-tvOtn=#~xuyD_d+i<8EPaIU-ki_%qkVSADgCc~dV0G4H$I&Y zi{J6;?MH0-Gd}(EQ~DXJ)B6t6Q|t5#y?SRNN1Hx9`7?dt(~Q0YcK+pDqgU_bhrn+= z-P>!w^%+lo3$uFlPChO#@#^jTL&QH79_+L84^vKh^>%*b=-=~?_1&9c)XOQa&hq(X zuijb9M$D%t=IV2Ne)vI=%U=C9@2~|I^xfOZF9yx>>5dBhAH4eQ!)5wBuYQY1TUWh$ zyMga?3cdH1exaY^)jRlYjxApO*5~_s^eS+^}Kt2b=%!`^g`D z{L!%R4?mdl(#FZZU-{wc_a<-s^>ffBN*5w+)uozz@Dmib@c9^%a9Q zBvsSx(^{VYT+@zgLkcvlT+?ba?MqD_YC*Gqzv07ZB|lvKL9hH^<$K8B*&A1W_0*r-8Pqctm!~Q3l?o6Ii6H8Kc+CH6psgtjD+BZ7gRo#F2 z{>?8lPA$;I~O&dS@w@qu_ z|NXGGhH$CUi6n{oGicEGlTK=aL{i40-$aiR`IFM`_SttUeYS|(A#x;C496fBc1CdX z2oWPhMTlV<#K1f-Ou-H(@t1~-dOJX7zaAj34irQ5+5pj#NrI+HK!%7eO(ZB=BrsDX zBu6CpkVt5u$bejtpIJl(nnl8DMFu%V217Xd(6b^#PK*59Ei&x9h`veWnM)$iHj50u zCNiR3=cRU78&(GWOR>6HPhx;j-%c0kZmq!SbFVT>j_P z!LsK2!LsgyL9+IxLGu3hgJk{HL9+4nFxfCFO#U!rm~7e*CiBV12ZMet9~S&vHsAfZ zYMcQNE~r&|6#anL2iUk6Eram62lyWPHy~648MC$ zWbT(D^IjKu?Ol<7+7~SIlQdcILa@BPL6d);5h4q}50RJ;LgbCtLgZh*50piXfpUs+ zee=a2`R~UE%90^L5-S7ctv3hC|M+HrEFJPQS@!w>dAnwSBn=f=K6`)|$iFd1t|Of90wr;%ChfS_W50*&fuLWI zU8I#VYCIk&Zu~V4(8P}0Od3txjU+ zX`G>n`x#BrPiyiCb_Z$6(`)kQ1JBBtqtD3Mq-W);H=mXNb@y30XBZ|K#FvV{f2SQ? 
zXc^AE!(|#}{ujsda`EU0xzzYe`R3|4`RnapO2uMLe#^B}%^~9CzAKJj%C)cmL9Tu~ zR{r+mKgcANwtJ+R@EU&%lnC78@RxEvSjtI9Cim4M$6@~fKhu!OTu)bZ{Ww^DhBPz< zi}7@DZ+W-_gCub=dE630VPUTg7Sps4NtqBLNpnQ@5XLF`T@>M@c7{lrJ46m(+j}}h z(uwOZ`W@QRsW{@NP11x7`H(j!Smtq0EF=(T3^EQZ@xihh*+3c_f23W!7$9fR^XTJe zk?AoamE3=dc2a|^hzgbz!Zl^nW+O;bnurT~Vj*?YBx2D*eOTlah|0dN=a)vZDS5;`X_K>Gj!8F@Rw-RWM_^>9GZMZn5IHaQsP8X zX$NVfYa?N(wx5nIF-+te{OnyIQr;<&z_srQJ8uH*W*p(gYLblY)HK@5N!kndUBQ0_ zHq{otLoW^04eo5M%2i2cnqVF=6rqC7) zv6SsHksr8kC;3aK9kg?AGh8GbV*-BS=}*rSMmhaIAwVu+&l?&b_b5-}Buy%ML{2fL zSmp#sJGMN=q#k59+#{S*xDAwpF+!vQJ#~`E=j7p3Ha5aG5dThOUN>&iWHE|#b6-Vh zfTU3lNm0a4KTM*&)SP7qRb!n<4}KcI36Vy|*`QIhi!}Vb5iIUN+WQB{kC3m1$dO5d zrLiDH+$AAmqApWn2h-jMODJ`is11^xxDC{uk+I_>>Do&jr{5FFLT@GyyGd8-#=(-t zy@R=@O5Hb3Bu3Q(<@Mi@`bfqZ7zYtB34*{dfjq%a+!IC`leRJEA&oJ#om@?x!=Bee z`esls%(D^d8s*{sL`Pc+lV5Uu8Q~Y4A1o;|MgD`bPs<%F0oXH#iX7wGb!>-l{~NlY zb};>Vur!hGof(7W5%zH^UfS6tWF+os$cac}4q=X=JZ!3bi2pWmRt;j@9VGc~>hvsm z#P43?U`bCKEUMqW#C=iNW+N-kb1&h>;+_X7)KeTXo^(v-dR`D|c}$%Y1juUauVYJJ zKs`U8f8qWk{^nv+?H~d77HBM?oqa9ROrBEKiL@Y%PWm9_)~0N=nk?d;Omkm*%Hf{3 zaHo zG{%F*QK~JoJ~&IA(r+4P!&>y$bh0ysu^0a-*QmQm#7!QJw4+4g+Dx2HTysOS3X?XO zN_nJu`qHS@7>!AU^QI3s$_s4xU_YC~BQ}3`765(6!O+uS~ytVE~n85W34!QV*Asu(|NZSo@eR9J?cano>`bcopO#P+<%3Y@IT=x;h9&5h==wrX!^$F#ai!B9N ztK?AD2NvcV^u{+y4{JOl^=3FvSt8Z=Z|I@!nlw3tpLFJnv}qzA5`J$RqwGzSC@*YD z3G78kPhv9nq8rrQ!gw(PTkrVq5!uFlpTa;$4Wm4%=To%#Ov2ucZ3g9;H%^miWcf0# zV^{s?G}5BiWH0t9glEE*jQ=s{6W|neV+bQY+Ca(_%5wpAHllBCp2_ua?#sl#WijRc zE$#S6O$z3S%;B1^KO64_$Y=P;fppS#k9L!XZaho>r>_^HC!#Olo}}{uy=}xqT+48) zGByy$V)T@i?5U9JKu@~WSnV$9TOX)-I*f3Jz|SF_aU*Vs$8Yy$bHTJJu8w+ag**T9ki&S%vDNd6IlvNV46X|A8 z+4wzUt~N^2wrQd&ay{%<#h#4(L z^Jp1ITssT7j^Ff4k@5oW6x(R#Caxa>%4w$)cQSg~Xn8<*YMph2Jg0H}F>W=d22&R# z#b5}detOv7qvu^4EqU7_C6Brr#C6NG(Q;~Kq?{tYT<%Z8?R!bS5e>PqvTEO7360r@+kHZUFVQAa>BDKAnp`97-940BJL5d*3B}LeX6~WPGZ{mS_%$eU(cv+1{THY*7)l<~ zvNbtkrd%%k6o;o3|U#$xXOf&0JFWD+*Dep-zE*XVPQuP7V$Ekj=bDGylBUjwzS&!pdC z`-t$1}KN1w@kjqhcqwCC&mex!xQsnceu8Kylw?Tv$zhD64+qTsrfxplDgR&DgSTdeuMg?ZyS54XTn^o$_Brikc|T(CH(!HPj zn?@fn1_jFkbegYW8LWey)PrFR*T}y?l>z!p>}jeTNSBfM^$)~p>>)hL{Sfv{r~tzl zuG22_sJB|&r{O#_vUqgAs*~zb0kVaAQ`lQy!+r@OD1UdVYD?@pnuB`#)o@kLEjsaV zJ(V&s#*h|R%)Oo1{s%I1W{CU-f2n6f#Eg3vYnBIGJ3{(X+CwCsG#i<}4KNFTiH|u; z;5bQ}%<-Q##8rODV&sz{8`;eiy^PC|v6!{C1af{^r z40+NVB7YCWZ;%iJ{b(iS6UG`D{cDD`U$DNgti$sp$~THW9Y-Cr7FYehhWkn&1wEC% zoo3Tymy`K-f?gbi7oqC^WBTt^&IxD_YRz?%v?TU${|J%)#`P51&GX9N%k+zv0_7Ih zzgVZ2@z~OtU)-F(G}8Yo7HBetWT^89bq>TYH_C?`)6K`0)J?u$;Qki_<T(K=0*kcTSP3$_Wgk5El?gncJi4ELcjNE2(6CJw@I_HgcTO=JdPa6&D$PzSS# z<9A>hBx218kjk_GvEoPV2UO8)fMLXOlkjfAG_Iv3)7Nf>NJDOb1QFkVsQgftW^dl*I;-1FOR%OLTISdd@LxayX4~8a_WnKSVY@&%7H*+FnJzgQU>hoW(XeLL^y5 zq_HOrbNI4NF*M88Mu;Z8xuGS7&ll}<4zniSdu4*R1r@yW5q^nR`$V_$%Cch zye8$G9aOV7w$pZOPzA=-j6tw-CGAq}tJx?g(GCnA;%y!*TL>c#8o#EU&~TF(6O;AK z3FL7mG~ON|?y3=TGES5E^0^T%Av4Es9gH_J;rVPFTi^{GI`lx*_p` zL&n24NM((0CXC+wnS=E&`+|QVJQM4H9IhMa!^R%US%phFjSEOCect^YWq>r}Cl^?{ z8Qjdj2N+i}2+vJA8wXKVwcJY_I~hOro6#+V(L@+I$U^1_{4OCrRab`dV8d>tOt9}< z7sxpr^;|_7RG70lJHhXMEs*hpv!)-YKk};5b$~hb5@~1xmY2-el+!5Ur@g5&v_kGz zWo=-6r}k;?wX~rFgfS$P^VH$e_;xt^eA>k`VT|(wX&;@+-o+n-VXg}0dCwk*m!nwdZy8{|M|VtYq4OhTlR}F3kN#CODOEwJ#3Ep1h2+R{TB&3Z*xn8LGciH%XnGQK6gK z*`KQG)TL^_IqG_Z$O3F;rBl9()%94eZ|sYI|2*m)zp8J|cWL&#=(hVi_*MHDxCcDnB0daVq??xSJ^dFPZCE2N}?f*pj(-=Xu64!Y@m_|Gk6v{rXFHMa;h(q&{`H)P=ciZY@hVt0qU8y;C3+2bJpM3qUL+jz81-|ShZJoe~g_oe>Z3Ujkl3s zv;K{FMRxX3Z#+9ScIsp?VJ`ziP?RJmurDCJlPEv+%x?lyn2|Z%z#O5&m5)BHxg+pJKJB8Q2Y*s5io^&{u^oBf{apb(#_V{@FF$?`83XXMG|nY zA#%ys5vZ|b$urOenmbGA=5YqX;$G8hMHG;^Rba< zS}IQN%OuWyw6*=szm+U4P_jw)f$ZPPJ+2=lUurDbPhQXCw;P-4zn9PtbhDpB_MTfc 
zlb&|Q1g>`~Kb_JS5AiAgXYuzy*(pCa`eSc6P3RS;gQT4K)QMh+v|S65DmQaAX;))N zEpv4)dN$Nk_HKBVylI3Hg#0V+##EjB4Bg!lOKf_DRdNXXFzPII0_}?NCq~(jDxIg$lP9q!IZk>uke(*eL;e#r z))=f=8s8%=jXHVF8(!le(lda*I+U=Ea=o1Usu+L1LVk%f^ySNffAwsl@pHzBH?cpD zUBxj-jT`VW1d^|1BD_gMdMLNAG+Bs#7XDq`{|xsJppSk{`c=JS`wwM*fpbEnIs-P5 zFGB?TMmPDRO*GDC%{7g%SR0#I-|;GnA(b(~ggo_HkgP)PzK35>^GNDA!dT6HACRw) zi0>sN`_gytmkw$k`Zr`FXYJdSWIR-3k%{}%K5YOtQ_@TF6?JXkT)@;5BA+XN%tfj6 znPcehgVIkb`|E^HA9)kK2tNBVNDeZmWIxBonEK!UHhW^?OeIbGew-~P%I8=?kR;9H z{6yXVtxgVC;Wv(Fwk1Jw>TaNfWH<*(=oi zL>nm}EXvV=T|JvhQER{%I$4doF;FMFNz-1g$K&2Rf^;%Ye1x9C-aVBtKj%62p%*y| z!^Yaf(D;Uq@*sR~`DLkayy>+nUD*lGj-JxZdkNoiuEl)HHpFm^JYQzJ*aUxr1Jfwa zDV$ZW)d{a78Lkob0rGtVo&GEb=nroW;mmG0`}k+Ya2ct>_zk-4YJgO+cUECsds`>V zv8j8ny+s(rKLvNoN1SVXP2TmC4YI|+xg&Ptaq9Z6PELN!x%E3-$NwSaKS`&~KO_hJ z1bd6F_Tr_xrY1xR&NfMk(Q8}d*bd;J*p{PeNI>o@~I9)&NG zH95xKF6F3BTo#doW|7aF)c?zzyQTj|8f#`Szs-;@vU#6~c9nzQqurdjRA^HAG3RW= z>w-g3jNiy3w4KAD+>icOXn9>Hygw%;@94xrIX=Pm2%f@iNO?Cvx{zhmmy7=K8Rf2f zj{8b<5{xwc$eGIB0BNbxNjUB@!YkcJoy^8R^)(RNXT;fZm2o5~Kn@Z{gA9_KjUq=^ z@~#}$D`-#FH+gpi|HD<7jE60hiZIxi(xlRITql#gwnN-s%{9*FWIDE4u#0wB{WATH@{Gnk7pz>L z@0I_GJW846Kn(g@pq`8mxhR@EKf#zrY=6fn%_fwUF_#CMD=r zsDsn+HJHIr)z@x311Mk8^#B|1Y)J-n-Hq-M+B$8Z?Z6OG`;X7kSVKI+IPx)LtOUyZ zSv}G&zX-Fw{O~9Wa#hqY<=e+$SI(ioj6L+>=V)(04LVCB4O1 z`A(oz;!fguxS^T(bQ0ysIOL*j)nRu*3Vq3RG*GO}y=QQrQ+D#}8WkjV)&#a|0dfIb zJ#%o~g8(`F_FuK67S0V>qqw-{BwPo$Gp@@}jo;xgFZBt3TH= zTFhGd_sg}tKmA?X^Y&h?yy;n~(AG+2#`~PXzAse^-WOZ;A0(WxTRzZa7;-p_gh(*Z zkJNcmZ~w$S5pvBsIl*~&-kcCQ$-8rAUe?Ozg{xCMykWJtJk0pOu0s!BPlCF~L$iFIY;D7WV9=8NpJP%z3_ne*Rp5OeUVGFdb$=G|Yt> z&hcxvgh*XNh}0t;xSgCUxOg_u0Of_jJf91e%55QHg(?n~ZD5ZJk?L3^gJ!d{Mj1+0QKkiU#Fz`h>6SR-$ff@Kr> zW+=NCBBfw~5-1>!!bzGG(T|Fw7zdCR+@;7e`t?@s*$z7(5ll)tsRJ-TO%CVj7TT9t zlX}Kh#}?XG0(}M9fV-S=qN0br@@eJp&aNE^})htW;6 ztrXUkb)>C+8t3`+VJEl}G-=qvx&J--H2zLNJ`^d5%?zdJ#oNeRJn^I3p=>60#s*6) zd4)P`E;s|{-~xC`G`WaudpB4vBd@@9=z1$yZXjF1RW(vNkayrdnAVPx6vp(2=#StD zJO$r5h-$C8+4LC*hhZ=r3L==N=LAR*6hjGE7SP}0>F<#Ae1wd|UnGo$@h}l4!&Ip0 zWNhmesp}D`e?a+XQvTp#>}-H?59hQ{309~A+dbA_*O+UdZIC*%gO?b?2ZYFU!kh)s zFc;>-LRbV#pzHGxS&Ce)^iOvDsifdi`q37&f7Ah6dIcF5FJ) z&H>d?!D0s+RE-H1>oocbRM2kALxZJ(w(MFQEKa0@zFEJKKCvxWYU6{Y2HCrRwlE%; zMYa?E4oCzO?1D1W7)oYFiUms1%f>~D75y;&j=>4Y2RnyTHYh@`zZNNV9DbFeS3<+O(c)S;TAWD7 z=}0Mf5GjRF1jSG?jqs!Jk1U%MDR%Cu0~efub8rDJ!eyvoBVNm9yAD~;#@N9I+L=il z(2y}&%42yyATClWk=CBkQiVSo{_V(WHuibjM#~k#xehm=74n^<#S9(j#fKO>3Conq zoc;!50{Jc-LmE|C7}-QGU@U;dWwftDk#dK7Y-yx>u_pJ?AHpN3n?(N|N1fA;ok$ln z=y{$OL;guy5os(&mf*JJ&_`KElo^?i>5EUe|0(FAn0H_xgu^hXp)J*J>}zB7j0q0h zPTZ=EHPF7x&5S=5`aja@Wc)$55xzZ}win0v!Hb>431=ik!dS?UV9i7Qk4G_KM1K{yQd1lAvL4849C>yO0& zasoXc8W<;Bpq`037;6d^^KKJ)Dk48BuO&9}<|L2EG7Ee0f)FXl4NjIk+*F$yf9j9H9_WsHN<@iPm0 z``?fNiAJ9b^I;*>3}q}FK^hr5>%q~@x_~nb7qX#~^+yb2X)I%E9Ahi83b$xV|+?F|vewTgXQ#=`ACzChpkdf627J4B8*kkwgDSx^Ooj%L%U{jQmrs)^^$-^=?Djp}L9wZw`{f zgn0~3KoR8U28r1gB&F!Z=Yyo^agdmJsV-#{bMhGa#!ULi6#B}gAgRQU^?^?8$U3m| zoZW>y12zxqZRYU$Z2@u)_XTiG36O>;*8cQKClrjM|4pNfM$!L}C6fp{lldRY;3D^2 zhAVI#N?5<&K(;~$)Eo+6{Y4))6VFD%-9otZeHXF;clm<=sd&u04W~IbfT{|{1}EnL zHuj%|0dj{h?!!ZP1W({8=w6}y7K{`ZFa3B9XcCOOZ4~ou80|Nn_6q~C<#h*0IC2;y zv7a7^%%{JZ+XEyLy_kMjw1xfh*OcFjVNyofm6A_OEO`abo53;`e{Ik3eae@EWIXyr zm<&^4I?RG-m<#h^A#{EG3vvB8SQeo#fu&%j&X*%?hp1ol70OP3UWKfqE!4-*K4NJf zakLL)!vgA`1K#0WUjyr56Xatv$A-&h^kQQ;dxvn@ioPAnlEbC+P&joIE+t3+-g-RmQ1iPSZc9`rzdfw(dN7*?mN^x)=fqobY2&a&6iq3~iF$bq59E4d; z69<%ahfC6J&JXc-0`j2u z7Ra9+s-a*p@h>BO{1xXAKl!tumm6RUP&|2VKp(Z`9Qj{5=ITdeJ=c6~lWMud6c&ky6}c)5vEm`NUoj zNB>(%8|7QSV%llRM%rctZ4}BNiSt?AtE@XD zvJ`h2atZe?h2^jUR>6AM1e>8InejJ`@i&M5l}-IuQ2*fC#{L7!X%7|ctiL?0zaBIG 
zb~64p(?6RSf1S+z-_y4Ur|p?=*^cz>D|etLvaVNWNGW&u&JMQ;cEKL#dN*9MkO$!~ zB)^j&$B-wW?LFR$Kzgd!`ysoI)1Hy}jB80hGG?ROp$@FfIctzEI0NUvj?Kp0djY*Z zmpP32^&S`ie?`9mt@C2ShJ~lI9>*i2z%!O*d5{y0&%1BG;B<4scA&ueKtei6pLk(=%;fxF9_xfC_Wy)Y$Q)UOEM`4Zl0(=T#8JWg3lq6# zGE9Z(&^!KDFy6r|+%=R>EoIR=&y!{cx_^9Sye+@P{0o&$%)eZ#!jG+%^+zG=5AvDk z43=oZnF~otoRjWT@3fGAJ?9|UijhTN8bEn|LYqt=ZSkaW8)?;oWf6X?7TPItDcB2{ z1CT4ghP`eh?UHe5749|QqCGZDqn)m%oyM~Mpzrtge`GO~fQ2!jG>1Nt%^sF}Ho<1t z3fo}^B!UTQ@KcN5x?K7xIO6I5;DQDyzeoS?rvHQWEd8IpX0x$gsNmgI*hLt7APe%^ zd3J&{BlA5xCqh34C!lB?>5C$LlSp3#>C=eY%tzk!WhFM(-A|+r zX?i_@Gda?Qeg@9L1t`cN|D>zvG~q)Dy2VZS4+tMF;^#74f$MNySbrc}p#y48)Bm9k z>gm@GH|5$*|L>ukbEyA0^na)XD^!6E>`)D!8NqUgaN2m!;6CypJc1{XbRa=oC7;Ms zblqIWU~tuZDg%*SLo^9T4ujz^5+cE~Pm{68w%0h1N2;@1_3XOKK%1S(`_|Z|!XxH~ z=}1rcr=$h{-+m&qaF2&*FxAlBzGW;$pAQRR5iEhF(DnQ$vK(nmOOO@FRj>xugFPdG z{X>FmLa(24k>~?=vh#P zUJ91%9Z~{n{IO2i;U9nO4?dM7o=Y6WuWbx*yFZosOP}_Qqu;7?xlb9FKV_ZpDeHt! zrJ(&&DeV4Kih4ekVr0p^PsK8gIHGn)8S*gU9D@^(4@FQ4m0*XOYdfUYLp)Fqj&}0a zNgfaFkOnA^*}-?&JERh#({9Y58OM|$cA#|3BL9OT<}hpAqltOB znQ;u+KtC#9px$w&|1YEeLsbm@e+vB{s=>wiQeJ#Q?=uKB|K}&`WFX;(gE^CanMS_^ z6VG*11~F%6^!3XUY!(=Xzof5u@2D||{Y`)j$8AH}?*+(6^hg*Bb?gP|SF>(NV0=Ql zK)n+@o@*1KK>{U@F=jHl8JRzcvkmm=Fbj$-q{Gbk;3QsTDX8%=nrm}mJ}iVq(AF3r zOOWaubt$rIRDdi;u7Fjr2G+wS@O<#8%vb#{kUoNYyRse7WCybCZH`}Jcd~^3=pk$%j8Q*O|oF${HGtoNEJP)HEg9hd;*CozB2-AT* z7~57j0r`-t1dqb zQXq+kn#mb!`F8gs*?o4iT;SS8*weg2_D)HV^r(2r$cdM%5ec&2xm_-Et!rPX9HeYM zTg>})aoZ$2bekNG50WF!Ai2Wz>u>`+ybp1dGoUtpmm@cs{)#P+mnBc|{NSUY?WUg5ZQ0b#x)9FISVuu2ltLMlfcn1T5pg_$r=WX{ISd9uI1Gb=p+Qm@ z!5PEMASs5DNkL+v{7M%DN!gqr8IGTk5D8;pJWPbiFcoUz7=P9={;;>N2M2q7C-m+U zY^*;*`Nk9~J&Zq4wT1B~f$;}fy%AeQpd<<75FaH>C%jo;zZ57obD%_{*SiCmLjq+k z`g~}pCF}~~m_}T%5L;e%pe#Zzf&A=1S&m!*W=|mFS0LjTV^=|#m>LI3X*cH@Jv`@N zZDawp|M$EcB5kkoEaL4D-lw9!H&b6tJP#&}HLxBw!DiSB+o9_yM=0!ZX7e^x|vmVJ~qGP@t1@xXVV6Huk+1WXWmvy)l$m zEae5(#e?Mner&{Rhl}W@7S=I=>`&u3AK-Tz9MeeOOwxzmz;_u1vF!ijh$EYQvV|~h zgzY2_WEt+ugmDF~!wqPK4!8sN!GHcsf363|I?5PatOXmISpPMVe>3?fENd?LN4KHd zbI3pCpBE7>4+-ZHbf~_9%#R}f@D#mxC21qSx_R^+C|gI`uvxZ{)@|W35L-A5gW)g| zB4I3yhu-~)p7LQ$JQ4R~mId&6LoxR()1juIet*{-`8GJoBX#Y;yzpIn^*ApN+xRwYe z*adqa3l73zD2Qi$zD=zIcs>Xvhv+v@ie3ig^!48Jy%^5-=5f9^hx3I6#82Oc9m>q^%fw z&)?r=+&ay=;5_RD)DUR^|926HUG>w>%vH=q?d;11k}t|ZQS zN&?064Q&K>+cTOxLOy|~pqo#b)6RmC10fuSfxU|dt z=iEpK>jEdzg}VV+zKs1ZR1%gIs%EnP9moC`s);u*Cs@W4-b9!TQz75U{14O7i@9C| zv(TfV%tP9GSc50ut<~P<0OK~p;*A~_t39Lhm%x}m7 z(osmdibz*6l%QLnbQ}3+9kPOZR>2xr51U{!Y=!Mm!+c-cO#Vq%J?U|@lYiDoE~Gju zDGww6L&-l_u~&7IfBe~z)wuI2I74B7W~7g*cWkFq-uZ>(KbL0)gkMbfMd$H@?pn+K zpEQ&jci2E?Oq8OLPQ1>%w1JihS z0NV*jVvKYdMoB)pXB2TF+k`v}m@K8}m7wMuJF*U3a0br71;}Ggy@)hl;~hri70AD& zlk3PEP<)m?43=8@hK)YLdRm<^7H$lZf@RG8VO-~$+S{u=NLdiqvANbpNGtbuz#Zs) zuUnlrAL98c?phCRZ~}V|WL+nH8>!X+YHU$sOhpWP5U{d0shq^zKaMrPOy>TftO3vq z=&yyOvj~dYiMxk1=;`Yblyw-t$%S{g>r@7 z{eL&(KXZbA|4-di&7*I|u>VI^Qx~S=5xn2QGg!jvy2@|VAtPa|a(@*pNgGCq=Lh-- z?zVEyqmXKUFj=`@qfL!qpMpLeWk8j%kC zg- zHLxC1hA@|W!hC?f8MeZ9*a3-Pf?ZHx9xa6yp0Oi~dFEb%wBRm9mgz^!9{gm%K{yP@ z-~{AD5!8h7{C@<`|HttB9~|s!)ZWI$K~2MH&R^sCjR2?w>pY(S&*Axh4A1|gc>X^P z8}AsD5>6%9p-xF`HgKWW+jxHg&Y+)z2A=J^k|V{L87U4kzZ)=t^Y?X>1M{ou!};hJ z@OKf+=)U(FF5^yWptR-6>1XH`4f9Qs0Rl)$-C+!b>aSaJ?HS`AFjy<_YL(8Qr_5 zE5cR^z3Y!p*yo_DGr&4z+i}_f@(i4Vt}2lW$cu0pu7E9wckhsP1^RX54Y2ZDxE1Lf z!JNk2)v%H|4cUS14%CM-=Z#_h>IstjxF5nJcmnxx%wO;ny?8Nm$u`dN7tm&)cm7(y z{56O93t7PR!e#g&{9-5p3zR|`4CJ107zV?k@BE$bmnvA_;I4^b-x|w2AICh8bl`R} z$Ge!@8{*lwo@L+aW*>{RHnVR<+Hl*E)!VRjhDapgjD@5xskf`tjR!y2$7Acwr^fkv zD*cZ0oCfmk!tF$=a+!#qJWGg7Moxu%Gw+ZgXMq`e(LC}M!x=yBxloE*J=ZUZC7;Lw z@?N-_{BOh`^8cHE_5RO%?pp|pU|g!>cJ7u*bXjefbs{lKd9`c 
z{n0L~&eHxW$RkvPwViKD2xkqfhfQGj1d;zB*^FLKKdytV=-Z)T3;kE4|Ax|k$LM4S zwmjY!NJN?-pLVkgxd+S^_Vw}fE6Bp_dVzKOJm&T(^yAg^;~4sH6a821cRid@;YYpy za}258|2ctFV^Kb`_x*#ltp6Enig1@gC0H|9>m%zR$uLM0w*}2U=IDCbQF;;^{!0G0@9Vh+Jk(S)vEPJm|Wz(%WwsfHVl&M$Q#fKHT36N z`gI+scl{jncc-56A4>T{`FYMip^~xA3RUPfu;Z_K3;q9N_Wqx<53AuE;2ruGVcvoJ z@DLus6L<=`g<`l%{=43R+w=|GYVK8IU|urs>?6%-d=G$q806d7m%Pe*2VV`6V)Cc* zXn_(a163CV8SH;Ez5Cxn_LS^TE$mNAk!9??g9&FijD$!S3*%uTOoo~*JbzE%`8%?n z-xF}0Wq<8ve~oPDVgFB<73`%e!J5qemwm5|eYzd0!OH%3D&b6rSr85ON%(=e==JfO zEo=;z`REIwA)Bz5@%(!+&%fCRFT$3GdkJzW!34c&Lf$@JP<3E(5m%;zNe_qf1n_x4zKB7+`{qNrxsef=JQ~w#% zKYdG`vzITU|3T$F)z|2MJ@h}M@BN$YgrT0VN4+`(%~FK z*vNXqR`2>8q&=AMmp39@b|Lpb79529$GrdhfcJ;di|O-4a18wfl&vIh+sI=)c}3=9 z>-ui66d_BY66{b1E;s|{-~wEP%g}Z^T&^HJ>H{EiM3SGa{iSD178(<)W!!Q^QBSFo-k;t(y9@_5mE+W!1gEcVH8XF)} zk<(!oM1wto{tt7}>qF`PGXrEk`a{ z@yX5iA)(}t^YBI7zXX

w_R!fvll!YoTr(bpsB{)Cn&12F8PO))o~R^#5e~|3>Pc zYyS8D@MC(O@)^JwOjv7RJ#2!_p!&JWzlV?cw&HH1ytgBFKq8nRFP3Mh$UTq+2ch@< z%?O^Kq8Epfx0!rvgnk12@Bclf{)f{4*D?Qqnxl)M1lSZ6-*RR%g|H{^jw1F8(4fbT+c}1~kQcG#5s!&;niR;l@Xk86D_}Nz=hZE& zYjEFy(v9S6G5JJVRx|%jBmXn;XC(hn0v0HRGQw@;o({MJ_u(Ntf+z44YRGTz^MB&{ z|J6VB>wo^Q@(R7r|DEKYu&YU{HI%i{8?=8I2;pGY$UpXB==FN?55v(%LIZW=iYNa; zTIqcjQ+WrHk?5J+(Q3cM*oCT&H&4xTAhDZWi8+OzdVNbzgcHs^Sxhq0sSIehC0?6_22*} zxS#>9aP2zWfL7>$J8&Pmeq{f#cD8tC^NtK|_5R@_(NZ-b zT5Mn+5iQkvZ02YgPk0kyGE4J`D!L#{C31r+{>Y~kbFT2SYU6^II)t?6K!;7cMp$9L?=V4Riv(;1N;OUnn zd)Xv85Hm^kdtQ{RYZD}M*#z15;038Nj}Tk)bF!7NlLoL~&(uqWrtf{0XLXL~TYp&S zo8|!7j-Nc%cqbyFB$p3MkH^oJlV^EXcTS+>^W5;1Csd|WU%B0^ojW<(Y36wv-+Wol z4wcetJTsg*L<&|9kV2j_7V%?f#i3!cYszHVo%?IqLD-3~r}o#fcj$CUkC-YMF;jW& zJYIGd{v+*&G2-1oG3EYBQuLE0X~aZH&YUQRS4PVb>cR9i^VoL*vWqZ0??lVdH2RQ4 zOV+ySvVYreWDoYP=c46c%x~qh+E;jH`&-G5n?KO-R!8ydC5m-nlr&IAE@Xu6%9p$_sV%@#Ab(2M8HW^SAO06A$X2L7%NPXCTE-w7=abZ_dXCt^wL5Shy53`TwVAdC2v)QPJ`Qsm}kND);lO`?~`~w@3^rl#^*gpai4mFQfl2 z4v>N9#q4p4ARK)dl#OE##`s`id?<-xyeFJTFq~^6Ari*Ic$f$U%r%8-9?4)HNoL+) zzOx{G-v^fWFu%`Xe+^cs0^0)i*X+Hkp=J#GbEq4~{(L6;^9k(F_3Y0h*q`@s&p75L z;+YE5VHWt_Ka57NXFsjpKb(s`9~#(GyRbQJgar$+<*npBIOGz@M_-Cu4rbC`6i2!} zyvvV!6_lcvv9~XI!1HM+s33n%{JHVhO#YDS*+OYI^FRAI1M4IsYrw>_tm)okKedl_ zhVYFFVQqq%ORWE{vHtV0{sTuQ>p$i#S2pXvH0FQSd=*<*|0l5iM^@psu{O6uHCUOi z*AUKT*b3Xhp1}N%eFu7d1z{HwKe`FjyL&F&&RpU`?!uO*11)=vwD0TaHl!W;*8l83U*r4zLgrsHZQn^B$mJR|WYZRhQqI(`dY93n(rcz3 zuocXkDTVPfP|`|uDR!4q)ZqaDT27B0~y+G#7C@zkHD-A!X%)V|sm{muhfp3C~*!usD#`P8!iFJ%9p!~Q>;XAtZKJa4h@|CTd=8Nt%V z+Gsdoj)X`U3rRQmCIC4RCc{*i4qY3lgPXiRvp^@aa7V*jSP1#p%&-W(cmjDFN4tHU zdiZV-Wli2Dkw+-u9H#gEclP>4?D>n4CG3YS$Wq*8NJBfnGk9;XBqHY%-V#^}%V7o7 zG_n6@-(QEU?_~e~nEgMv&>NtA{kP z`yiB_9n8Lt{XgloAPd;{7hc2POwQk-WCHyHO6kMu``^Rda|}*EJ{ZFK_9}@^&Y^JE z;HMV9{pW9-ztcxt&;aH1vx;u-`5R+MRWtqH)_?w1N*I-3t)2(zjgh{FZTKI`9M2fy!tWK@=$ZeQw7U_P=+&m=vLEI%=!Bs^4Zr90PzWhxx``KaP zHrc1Y<1Tq057CbSUsE2?`#s;k;6LMOcF0 zTfgZ!WX5?0o<*$28i}viU&T4me8m-H<-q}A6}bku~(REQhjLduL<#+YJvjG~nun*>H>(S!e)HwAro1hhy>gm(cIfHY!fLMj~ zck1g)^xAHH=&6r}EA*?V*M2A24T;0ra9kI66Sr{}_hEf#VD^RWcqrVD0SncoZ)l1!1|L@BGdHF}}HTi!e|4|u1R8|H#w-7@y93xSLu^5jw<+)uM@3{83Z?3E2 zL+$?_^}li!eV3^T&1ku&{6A90$yUE>UTzFQI+HL3(-8H|r#6}MK#$gxbfB0%3)$)3 zy~+M#r+NTYFB-oQKMzs4UqCKGWdS>gOt`)}W<=+(*N1;l0?812}{uIEEPF==mwzitPQI zcY;i#0~wsaNu0(RoI|yK{2Ihji#jBbL_HeR8;$CWCbC)m(L#SR|EN{n(#FScpUU>0 z8TBQxKl0cg>WXL$a8_N=tbM%Te=gw)t|C1~JuJTLYokM_@*S;hxGDZNvdVTwTsOKJ z?ISkddxpNZ_-MI;gyWSDu-mmHNB&T2+W*{0fEhcAS4(1_RJ5eQm0sYDP?M3uV^{DWN>$j^X zTYru?YEie`xb$pe;`5DDlh<8$>3jC{k2bG+WFLL$EJJSo&u`q{cZ)*1@$3$=^Rh8< zvRgPqW|vxjKhygA0_*PwTYpcsj5fAz{y>`E`bb=RQCK0pRak?yNc(Nd*!X&Sr}6TR z2Ss5ceKWF-WiI-Uv;K?RDy~vGC1fdLJB+82yHT}IzPjaWXHnQIydTNK^5vX5GErgt zeS-WWzSw!|#BUP6P5#MxMX7=5$ncaB~wFLBqC*dmV`9_KAd=K68;&o}Q-4xE=FFOgS}rvI(+ zrmOT$?^MSv@BKCJ{R4ehXO#o`b;oX^Od6Rt*Z|tX+rquU9D?7z5bn}@zE~9c{>>bM zZ@m!i3qM3Z2K-OQK?SI{5W5K47&ic{I2}x1z{I*>kHN0bwk(z z1=^Rv>SS_`bLL?I`u;71MdT7Ji^6~Fdj#sqp7)gz;oeb2VHFwq|7)V~=ha{Gw)0!p z!&>3>*od@xJ%!Em&eUVubSu3Ckx!c0q5UAEwc^vnm7*NGu^0Ps0EbYWXI?kro6Pg( zd)H};6WZit4f_BM+WJO5SknXT`#tUZLG3%zuWR3T@WaScQrRZ*%|VnIyOv; z<0;3}Wa}1tq|D_9qGPH4d34dc@nnCZsQqp7Kjojb_uc2T%k=bNX`qAN`8p}x4E+SM zkLbwlf3U~6gt*EPY#8zkVi%1+kQY!zk3V2nyvLRizJg>se;0M96SpSyzRrEDji}l!u&G@kWvQ@YwibI(;xBQ=7|L65n3hzZmKmUI601lx?eRqWH zt$rpPBg@|R?K-mWpIs-JL=)2JKs2V8Ay42WP9v?1NA~v_dgn~#Z>sOc(JvtT$oYuo z2z23+xSrqISBdQX;zz?(@;YwfHtr(xo#(SQm{6Ek5KUyB}?UB|m(0bnZ z`xgI?_@3vr)BN(o{Br(%l3w5be3;}vreGRopcu0-2lLRT51{>^K7ynAAJAn^LAQQ} z%v}8sXyD^F^7WhaQ8be+!l_364`@ZS{(gaU7GVjNA?-R-!wSO+dgl;%8>o+hUiLPh zZ+hAN==(dd*^XZXaU{`%<~xH!n(ROZCy?GTIHY<8hm-Wq 
zi@3LoxwlJ@rI)F{tJL4ie~|tA?brUGDz1rYj@1yx{{8mpKgjIgw2Sx(dAB-fW2A%g z64wu8UgP-zZtPi*&9NNFUG}o_xGwUCxPG`!3Xp}ky~N>zCh$Wu1fT_4p$+)nQXS9< zT@ZwB=z(77gMQ$fObx;i48sVR_diP}6#?^GGckxm0?cpCXth?qo(>t12`+F$7GxV< zci5&0lYQKYnG1PPYOs%%+A);cN6YXnvwh2K-!j{`72h@(&}!p8PAap+F`I9gEyna1 zY+C`g0^3%Itw{2$GW{z#< z*=D|N7T9K?Z5G*Pv2B)M&WB8#^C1)4e8`+TA7-%k7m$S;r6Re0(9;g89qJfHCW`ZPTRq%eppWdOM8S zYhwFVx{3c+Ibmbgn$&;QoG@m<dtyNt^D`rhB#No%GBt+Vm@DSSd=mM=AFz zWfT8KP!h(h%f_?=5W*Z!u@9u!2U6?;qeDudPbi ziq}@#YpdxI~rRd6V+yIzDH=fc@zIe zn8Tax!r7dn&35R`c9Co*s9hYJ?Sk06MVswH*lZWV=B@15WuI#oz~*h*Y%9O{er>i@ zW{tyUTTxz^n{D-Yn{Bq0+DyiX4U@C%I865JRJLf#J)*oBec57XZA*%_*qPe0Sz9=| zRa>@c%l+E&fVMoSE!(w)Xz;WnGbWxU$}J{JJEvQ<^&U}QTQ_NIinean)-BqK=QeG< zUt1s0)(5q9ySCa9*cxp$r`lR=)3$p=y)p#9jpO&*=7X{o-DX1BHGRMCzeiNHt@i!K z+pg^g^uRr$x*xDJWf#Q*=Ad2g59+~tM8(?$^Pusz>wdeo-yNeSYUR^aXw4v--jpf1)ouen4M1sxRt`kL!z1=!;Ja&)wQ> z{^3#W-mBeDX!ketsJ^5xeO6z>Ox0t0>`^`T^+)vBGkWZm5v3~ivr7G(QXf$&5u_4p z>Jv(RQmHfT`m()L-jw zv|r!S-|FvlKnLzkyH^M9`wqOQ1Ft3YZGHQ|3;K?}bKm#jh`w{+IeiD`Q~FLC|C?cp zp3+mF)l&!b)N^_&jsMMn?@#m;C3#Q>pU}YJ(s5Ep4W5Ve_GG|SkF=TX(AMX(snBC38g)$v~MfzX{8-{PHFse*c>+jzK`=o zrBQM>&CPmV&+pXppV#wG==mr0{4;t!P0zom@9XctoI;t%yh9nnjA=@9=mhx8-;SU*0*{~;aKPsEQ(>zH0? z(kp(w63{CxdZkOR^nMOLFLWAnXe?c0PK{+~EK_4Hjkz_JrLk;{5hBX$}SVUt{jm0z;*Vw4W9U4#9xKra9 z8qd_YOXF^hXK6fJ<2f47)p(x9^EK|#c!9`Z@6>pg#)BH~)_9M`do|vt z@j;CbX?$4YBN`8DJfiWa#$y_fYdoQe3{7Ne!lenfCbBe^>{E3)&LUCKYTA){p z^r}~{`t)k2UhUDVy_)oBvOtrCnk>>}u_jA2S*po0O_po2LX(x6tkPt)CTlcVtI0Y| zHfXX@lU_~wG})v{zb0EW*{aDlO}1;YLzA7F?9yaVliix^(IiO>YH~=EAx(xg8PQ}^ zlQB)kHJQ-lsHPm6O4pQAQyH4d)Raq8ZcSxrDqB-In#$Ewo~H6O<d{oMrusD1uc-k|jcB?^)5V%D(R7)n%QaoC=~_+KX}VFa`5LmZR5l^;(`@%hzi~daYQm zmFTr{y;iT+8ueOOuf?7h-5dHgbd4}dHRIMywq|lPldGA0%@k;+P%}lEDb`GhW(ZKG znR3lkXr@v#Rhp^ROpRu0HB+aVdd)OwrcpCq&GC#M4Gu@i$(M+#q`ZUw8nE}lVYGz0?!tOhPlGI^ocXbe(YOM21de>V!)t+&Yn^6WKbEqZ7G0k*5>+I^ofY0-dPSiF%!M z=w!N1X6R&=PUh>RM<)w(vRWs@IvLlgT%F3-sUn>!)~QmRs?@1EovPQV2AvufpQD^k z*J-Ct7wL4dPM7F(sZLkubfr$$=ya`4*XeY3*Fa(CI;)9@6QMPKR|S@TOB|2G0bH(P@mJF@|*}f$>%m z|8I^MBV>#i##?oItC)Y|D1LKPZ#|Qnm`D)j#95bp2Es&dveaR+)UoJ`Unl?OyiOBar-`l8#MWtocAB7_CTOQQ zolspSZVcmRgz=rtTx|@y$Ovz%g<*$WOdi)_NOP945s9rMhV3*dqOFP52oqw_jtwok zP14=wEV~%&La>XVM`wHaXXM1>zVGb7Z1!v>7Te?Pe3K;^l?M1XabTEO5GMM8MZ2Ao z0h12KqQ4#afQfU!WDUbUdr>J|i=HSuV23vu<^mZsfiP^RiF?p!vnWgmlfa%;H{j*_p#`&NG5D!;5xo5&q4&5fc{UJwaIGG9gUZ z2)0=V_bx_(ZOj%MQD_^B_9A0918@l2DA8;>|3r@wG)A{=kj8BKO~b^SP6`;#7ePpjMK6;#37O;0%oSgb8j=%rGZ%Jd@`j zhTj+gV-QcKi3j7RFsI_OPcV^Z+Si~hh{(81R3=3XJMc}z4!oFI^Wd|uD^oEt$6Y2R za}yZDgfxOU5Q~TH1Z}-B7Q>lqp8$xnJ&P6-*Wy{`jyCqiAm(;kR2t0}?c~+Axi#xH zVKGdy2y==NgfN0L-HS);^sp};vGsJD)ZAzLjM0x_pG(B%+?(<9oWqX3?nPhX!;xju z%re0-ZrT@JoSJ14&Egrud*x3z7*C!-nJ9A10RrTj(=Zn8>~l??a!sCcO`dX1 zv>5LRbEe4?cds#sFxQU3#LkJ7Bxi;-8xbG9&L%WFs&fvV%h0(@opbA4md<7CT)xhE zbgn?>igm6^=X^TXtaB|o*RFFxo$JxLKAjuTxrok1buOW|9eO)mZ#(t2OS~OdOKgdCAq@iF4cwR@(ax|M8%@0ToiSS!nY`z7DYS3Iv1@yi=q#oL0xF^ zU1%YWmZn9~V~f^2U1-hMg|>(|@F$M}<1y!G(OgEI81II3a6$%Tf(zV`1=)}TxsV6>;DG`tgd!-0 z5-5Xmr~wmqJv4w9e9#1bXoYs@gf0j|H}pV13~SCszFeg5B0X0nR6#Wmmy5Vu#N{F` zS2F|veO$!rY6IeSbpY|Zh~Gv0F5-6)zpEGefOK2~FbG4MbECH#z1`^RMjto&xY5T= z{@tYKCO$WDxrxh79B$%p6Nj5P+{ED~jx6$<#rav}Ig9eiDh2YMMZU8ri>yZAye!V^ zqI|mu+eNu{5x$G?U8LC+g}CN|=pUqBg4D?%bu(B2#5G8{4ieWOIu8+cm^_CF8zO9o za3R8n&?RL2Nh3s_LZlO-4nw3Bq7Fl(7orYBq#2?PL)2jiokOG_BF`c84pEOG(hQMi zh_Vl%Lx_3})dG14k%tgv6GEpDbrPbCLexo!ItfuHA@UcZPD0d4s12x-5M>ymPC}Gp zh;j^3Cn4%2M0tiN7w(<85ak)7JVTUci0dFU0wD-P1kgDY1Ij3rfKknb(Lap-Ve}89 ze;ED4=pRP^F#3nlKaBoi^bezd82!WOA4dN$`iId!jQ(Nt52Jq={ln-VM*lGShtWTb z{$cbFqkkCv!{{GI|1kQ8(Lap-Ve}89e;ED4=+DRDb7AxkqkkCv!{{GI|1kQ8(Lap- 
zVe}89e;ED4=pRP^F#3nlKaBoi^bezd82!WOA4Y#R5t<95e;ED4=pRA<2>M6RKZ5=d z^pBu_1pOoEA3^^J`bW?|g8mWokDz}9{UhifLH`K)N6BCSqGuF6qv#o>o}<)rlzNU*&r#|*N zKZ^cQ^pB!{6#b*hi=kHxy<+GUL$4Tm#n3B;UNQ8Fp;rvOV(1k^ zuNZp8&?|;sG4zU|R}8&k=oLe+7<$FfD~4V%^opTZ%;-hE#;DgA^%|pIW7KPmdX1xR z9DU>H8%N(b`o_^Wj-GM!jH729J>%#ZN6$F=#nCU0esT1RqhB2T_@H4fPCdraF^+z5 zbc<7OarBF$UmX47=od%7IQp?k!dx8v;^-Ggzc~8E(JzjEarBF$UmX47=*Qc5j#u$q z9G&9m!{#Y-ar7|n(gb-=koN?6PmuQnc~4MR3F<0=4heKfphE&366laXhXi$%pso_= zl0cUPx+Ksgfi4MjNuWytT@ut;0(}zHSpuCB)L8<(5?qrBbW5OH0^JhmmO!@zx+Typ zfo=(OOQ2f<-4f`QK(_?CCD1K_ZV7Ztpj(2vOQ2tZx=WyA0v&0f%q7q>LA@o=HG!_9 z=s1dwqv$w_exv9&ihiT$Hi}-O=rW2fqv$e9{zu9GDES{H|D)u8l)R3T*HQ91Nv9{vBTP@3@J^T?4hyt#`6KPymH`C)){`kOeuA51g0HdD+d- z0-gNd6#s{9(b4@*Hj!oMt_!k(IC8sy^K&^rm-F+8GtURao9BlPAdh*3%OhMFYE=-XGEeWc5KoI+?{wC` zPAWq_U38>_6S5%(av=}$!2<&sKZ$sENG-Ds?fP+@zC}baIhS4${d%I>*whpa$xo0ld%z z0ceGG=!HHQfI%3B5eP#Bq8HOqI6Ve&U36w&bW#RRQedf#S!r|8Nz6{l%1O-5ri&TH z7c*iPGbk|Q=)9Owe)H(XjDd?8HTW(bAxtMWKQtRIX5^U=PV!?Mgr)4Axae%h z6C3$l2J8ixR??6Q$QRy{eooQj?biHyf0?>>~ymim0k2A&o|TSCRq1k zO5GQY{w7^)}O|@q@ApMS$DYuW9pcRITnjyXUdWr(%fVeFTz|>CePTY9;(IIxEQFUj9P zW+LK=lED)tgC`2lCzFL%XoGezNs}(wzFD6*jK4XP@-8F7MH>Z1RT*!Sovy_UB24*k zr!B^8io+;P$(R_ial2-4Q)X~0+G!6WxrZ`LjhOo3w#kTsNrl^^gwid+zXZQhimHsO zsg|ckEyo+UQM|(NpIoHRagnz0McNeeyzk~6RS?#Ex?A(k3e8ih^NdT((-4@?Zq3RCk^YoeL=`YXIU!JFr zJWv04p1$!s{a02UYMwsgJblA?`i87Ogfc)U`f~I1)8^@W&C}1Cr|&dR|ArNbP^@{{ z(DSr)=V{l@(~g~|{Wzb9!l)K#D=auVwZNNmf%oGA7ykmcbNV6K9s)@In@~Wi_YAM57(yXK2>TuVQ=Q?z3$kRe&0T9kh9=+t-OWFH~&qsOq z$X65bG*P!rq~)&y&h?YFzx#htbf0T+UTL>RIrNsSND?{c90Zu$?Y3Jdvqe~Q9>gBs zk9FB56z^RUP_@=G%9h(W^zn!Otd4yBBVYT-{c_}*k9@wL1;3A6(~-YhN2f1WGLGE6 z$5M`^AG^$Bcm1&sId=1>ZvNDlI&-;aKIPoo&rc?PU-)D2Bjoo_m-6#}6Th$AJy$ZW zyzknjUi+iG_TM-DXl~rbjnBPtc{gtJ#@&76kMB>H_NTk_Pq+D}xBcmDw|kcVzV%n= z)*s)k&$)FM+_@|4IRExP4Ez1)HXnVxN6C-g|M>UB?9^gZ-*%0D+qL;^N9MPknBR6_ ze%m?uZRg~--I3pRM}FH0`E3W|w_S?g|8=u=A%5F!_-#kwx1ECDo`wJR?EAN8*1tWo z{_Pp`Z_l28|Ks-jg!t{5?r+ave|zTo+cVJLo^}5A?D6+6nVuQ`{^jod^`D6s&mLa> zwKehb;c?>S;|FVL;>E+B7Y_hl{PKIT1O4*(ll6c9oVI4IJ?qT+YvSdL|Mqn5#ZQEn zFE`fR#Eb9#7rU%4U!NvkeD}Y6^SSPWmv5Wawe?_m-}fnN(b}GPnOwGRCtiG)zfAqG zynV{GO#N;BZ{lUzHBJ9B@iOCfW_;|7uRrtOiI>?wEY~x8KJoIyb^iElxt2Lia|aVI z^ZMq!Z~obGI}00@uea!9mt60XYw=z8vbgm$liKJD0KMa@V|T?dtb}dud&T@57f3aOU67e1Qw^yY$aXU;L-L z<)@CHF5=3^UrF^H{Bq@w^Xl1hH(dLI*ZvrET41Zc6`^p{ORBS ze3*FgOaJBebmHYs=-sc0mwR{6y^p7F!yDoqIU*~TpUKd>Ng7+<~O}s8@T=c%hlZn?QpR?p` zOS{&Q<+_)&FS*U7d&_;W+ZV^_s@prO<&V5CEUR}wY=Ro!t2%t%eDIj^}4Nh+sFAO^}6F@c76P=+xHj#b>IEF z@4h^kw&o{Z4}ILBe?N3>M;Dg+#!2V@{r!LSo96Xc>#^(d3M*KJ==gcTMNscJAZOU9VSde!cMd7w*$b*Lmr4Fa6p1 zzIna&F*iQ`#(jF@>;LKY{Kk2`b^W)l^UnQs=WX}?dGEUJ-52*deILAf(E0k{b01vu zgXRZ+{*Ugp!l`X6dU<(N~+GuQoSd{YrTKpFbCyd2cqG-v0I3nzk0K`-wLXzTa%yynVQ_{HwTt)pS>z(@zs`Gp==J z(emH3zTT{l`{DXLKzs85?d`{wb!zz<9+{pe@&+q3)o+1njJy?HqE=8)&jLB^ZIi#G=mZw?^dJY0FRnf_)I z{mlmWn?w0Go8IsLI<)>h@$TpEyKTbv4|mqz6Yn2?So7A>#Jio9cTfM{J)wR7|_%>HS;&drQ~$H_LVT<^JxM`}?+!-PXM8YxtS>zPo4n`hM=c z@BW&2_jl-h@3ZCod#-!W$L{@i;@yMwcfZ@;4}FdY>FP>^U~)#1b_c|Y`Kk}uG6^2`_+c!zi-_yx2qHH zes90uyYC;}-jn4VU#zdzH|x7KX-!$v){Hf4{jlb&dF%gf zYtdS=maP?Q)mpRGtqp6_+OoE-9c$NeO@HoN2iBo=WF1>4mTUd<%sRI&tV`>sb!A;! 
zI{y6Ay0z}Cd+Wh^w4N-Dp23Vi|6~1S{nz^2`p5d8^=o4I^BK4LZ}s2mztw-M-}m{r z)$eA{kQsW z_225h)$jM*xYh63)wtE~8P@O&YuxJh_;1|mztw-M|5pF4{#*T?fsI@JxB5L38~*P^ z!|%&+tN&L2t^Qm6enyR3{r>L>!|&2@tN&L2t^Qm6clz)2-|4^8f2aRW|DFCj{T_dg zJNA%x|r~gjB-_hev|DFCj{dfBB^xx^f(|@P`PXC>LKQqUj{yY75 z`u+SIclz)2-|4^8f2aRW|DAq6TgRP#KVQe4{yY75`tS7L>A%x|r~gj>o&G!hclz)2 z-|4^8f2ZGbj&Y~|PQS+!!{h4V+3WB#d-$0>?)Bg6zt?}S-_P*jXZX0+ZXQ^vjid;RzN@Acp7zt?}S|6c#S{(JrR`tS97 zTr=+V`&lvW^?QsnJjNOK`tSAM>%Z4;vu)h#zt?}S|6c#S{(JrR`tSAM>%Z6U@zJ=~ zf3N>u|Gj=chsM4Bd;K0ejR*Y(cE*GL2mKHFAM`)y_q%I6=zq}vp#MStgZ>Bo5Bfds z8V~v(^grl-(Ep&{^OIrtXFTY)J1`#f`yDt80}aDK!}FH$p#MStgMN?Q#)JL`{RV}` zgZ>Bo5BeYUKj?qZ|DgXt|AT&y^~Qt#2mKHFZITYpg~o&a2mKHFAM`)yf6)J+|3UwQ ze$SW2gZ@YTo;MAT8HXK=@u>e%|D*m#{g3(|^_!s^X6S|)y78$0QU9ZUJ09av|D*m# z{g3(|_1g#?kNO|=Kk9$f?=k6k)bD?>J|6Wy>VMSl0CPO*_jq-9j4&ScKkE0pVm#{i zoNjndH|(s8NBxib?XZkT{T@$@NBxibAN4=#f7Jh|-{XwoIpBEI|ES;a)vzlw9`!%! zf7Jh^|4ILo{wMuU`aNeHPx_zqKk0wc|D^v(|C9bF{ZIOz^gro;(*LCYN&l1nC;c|7 z$CLgi{ZIOz^gro;((ivWKc4hI>G$}2Jn4VZ|D^v(|C9bF{ZIOz^gro;(*LCYN&l1n zC;d3`C17iv7|_n2!u>GzyqJn4Vd|E&L6|FiyQ z{m=TJ^?S}Rp7lTLf7b7@+IZIgtp8d6v;JrO&-$PBKkI+i|E&L6|FiyQ{m=S6{~mV1 zhS|$u_HvlL9MAfn_1hU6cE-lD{%8Gm$c7!V@vQ$@|FeG2-N&>3XZ_FmpY?m(I6Q6~ zcGAYP{%8H3 z8x9(VLA>F4;4p|c4B`!gc*AqTVGwT^#2W_jhC#ex5N{a78wT;lfAkx~8wT-)LA+rQ zZy3ZI2JwbLykQV;7{nU}@rFUX;UHre#2W_jhC#gHP-7Ux8wT-)LA+rQZy3ZI2JwbL zykQV;7{nU}@rFUX@gMyL@rFUXVGwT^#2fZ*hC#ex5N{a78+J{HLA+thba+fT4B`!g zc*7vxFo-v7qYi_3!&d4rh&K%44UapA{hwhFZ#ZNb2JwbLyy38AIBXdP@y1{J4dM-h zc*7vxZ~!w5;thj%!yw);h&K%44TE^YAl`5|GYsMlgLuPZ+F=lH7{nWY>HkZ=y{7S( z{=f8lygMA^3kE@3PyJ3e z4+DC`fZi~mH|&QE1A4IHyh5@}{KyMh(8@9fO0li^BZy3-U2K0vK4&%T2|EvGM z`aO>ro=1%T>NmJI9GH#2^&8w9w#;>=zD$d&6^*@wa{he8T|WFu*q)S`Gtz!vNp-TmRqsJ#QHX`G!5l zVUKYb8J;Na1FvvFy@(qK0!*i!$kZ%~|8~^D4NB=+i z4fGAqsfK~RVW4jq=o|m&|3|+gg<;=lc)m3pEewNw!(iX|N58?o@sEDPed8bf2K>f9 z`VIMwfAkyl8xARlt=>JFmKl&Yz4#R-MFyJr@ zI1B?0`(DGL=`aj93Fa3rL zhatmZ$Z!}k9EJ>sA;V$FaM*Vr1`WqA{lE18(*H~UFa3rO$1nYc4~Ju_VE}O$KpX}T zha;zno6nzW{@X=}!swSHK0 z*1WY~Em}*~vbAEZT5Hz2wP9^qTh_L)v{>9<3+q+4{HjAL}pczt-Q@Ki2;&{U7vy(EmaI2mM~OCLi?M&&>z@ zAM}6FZ+|W3>tcj2AM_jJ%Ln~N`SL-(alU-eZ=^3D^c(BT2mMC-@{pozv?+7y=^?%g=QU6E%AN7CK|53j^ z>=-7Dee8VH|55)({q`91QNMl0eANF@zx~F1)Nc?aAN7CK?;I!w5c5(0NBtl58%E4W z{U7yz)c;ZcNBw3DW0)`>^?%g=QNJm{m=cVMz=q*O5VxTJK++xlxX4+!tDrVYZrY(QyH_4Vi^#7sXBwI|f`=o z74uSYWE|sAF}M@sP%-)xqfar65(7^$*c1a#F~t;9OfkKcKlB?}ib18=*Ns7?7*xt1 z`v1`XhkgS~F|ZT^OEIt%14}Wm6nnriw3I*e!&HH(0#n7=L}03X(hpMwri!ENz*PC9 zAEpXS6__e8RbZ;XRI#5Nm@1$2162X4@=5HnnvlYVC~G2|0NKKZQwv;NQe9oxqlWj^bNx#9>v_Q~^E|7ZP<^J5?=j`Z_c z|7ZQ5^?%m?S-(M{eAfS2KLi#CED%^e>;J6(v;NQeKkNUj-&jE4u)tw~!(zxNhKvG- z#h_7)48)*O?9<1vQQ)u`IEwSrz+r*I0*3_-3mg_WEO1yp>o=5?&-y>>H+GQE`akRc ztpBtA&-x85<%@oUOZlQ7DhpJWFZ#dehs*Ls{}=sV^ncNBz$r!;@C z4}cbfPcirugHQRQ-}ypJtH-o@ocZR9en>4}^ncM0tL2OSFZzu{vN7yV!Kf6@O{ z|5yEA^?%j>RsUE0&O!pm1&)g$t$fx0RloC*n6{5|>o_Ngv+Fo3$yfdGT)yi6s{gBg zkS@;o0@LNI{;&ECdIhYDagV@tG4PeI`oHRT9+UU-f_0|5d+HlQ?gTVZj(U zi*v_()BjDs;jZ^<3NgGG^8hjU7Bc}c6OeEEzv(v;lW+RJ z>HntxoBnV5oms`)K+Fx~n||k4`KI6CT+9*VoBnV5zv(wqknj5OhJ4rmT|efK@A?hc z#k@hj>qj2)UBA(reAjRAF5mTk*KZyn-}QgjZ!9O@^?%p@UH^Ce-}ReU$ano_7V=%c z(VcwP|6Tug{onPQWyp8^-}Qgj|6Tug{onO}*Z*Drcm3b>V;6B29B0A#t{=OIGvR#K zk6#48h%>tw42*N)I490`{onOF$IEy9-}QgjZ+0ShMke)7>c=#KX=GCWq<&l@h6^*P z-&u1e^-t=b)IX_zQor-(OzNN1KdIj=Mke)7>YvmRCGQ~Ia$PwAi1Kc#<4 z|CIhI{l>2{rGHAl^U_S|pVB|2e@g$9{we(iGBc$gdkOXu>?PQX7p1iHJG;-6{we)a z`f-@xFu`Gh!(>YTl>RCGQ~Ia$PwAi1Z^$!K`ls|y={N8h1D~1FZ!RWN`i+NWO8>Ne z!@-%>Kds-{ZVZOTU}%hwWm^BV{%QTw`lt0z>z~#?t$$kowEk)R)B30N8z{}RenX|1 z)<3P^aA~IXPwO|6kZJwX`lt0z>z~$dP9f9!r}a2JqX7n3$&5ZsT{RUoR;59S) 
zXY|kLH~bps-I>upqkl&KjQ$z@M(biQHfAegwju^(W7IA)`e*dd=r?p7L$jIDZ*Vp< z`e*dd=%3L)qkl%fxs1%{pV2>~e@4HNy%@C3jQ$z@Gx}%r&*-1gZvZzl`e*bT#LbL; zGf$b(Kcjy}|BQY!P%-+K8T|%zGo#Nm6-V}UUzm0A6>`prvaR==64%<4Bc zm0A6>`e*ge>Yvp=tAAGito~X3v-)TC&+4DmKdXOM|E&I5{j>UK_0Q@zhL~CXv-)TC z&+0dum0A6>`e*eUQ_QUXS^cy6&8}ot|E&I5{j>Vba%EQkto~X329PtWe^&pj{#pIA z`e*fHN_DL;nx`=5+EyzgeC9&~K1BKlK05|3m)|{Xg`Z)5s6~KlK05|3m)| z{Xg{2>7Ua-r+-fWoc=leX5uoZ-%xhU#$`@FFpxR@hO{%Me@_3L{yF_~`seh|={FOS zIsJ3`=k(9%pVL35--u}pcV|xjoc=lebNbERWlsN`{yF_~`seh|>7Ua-r+-fWoc=le zbNc7>&*`7jKc|0A|D66g{d4-~^cy0NA@a=WpVL35-&|AX^v~&^*FUd+UjMxQdHwVH z=k=Qf%)EXxP?^_1uYX?uy#9Ip^ZMuY&+9ibn|b~7`sek}>z~&@uYX?uy#9IpW(zZ~ ze_sE*{(1fL`sek}>z~&@uYX?uynaLPF^iaa{jf&n_0Q{{*FUd+UcWiT%R;5qsNei(7WFUcU(~;-e^LLU{zd(Z`WN+^G0md>Mg5EV7xgddU(~;- z-#lIx^)Ko-mzPETX7jSBe^EaikwyKB`WN*t>R;4vb}x(i7xgddU(&y%AFs`l{w4iO z`pvgyN&k}mCH+hKm-H{`U(&y%e@Xw6{w4iO`j_-C>4%?UJ}^uA&B$g+|C0VC{Y(0n z^e^c*JDVl_OZu1eFX=Z+n;_U(&y%-^^{6^qaelx!V{` z&yxNn{bq2pq<=~Ol792JS<=6x-&}5%^e^jQ*1xQOS^u*BW&O+gm-R2}U)H~@e_8*s z{$>5k`j_=D>tEKttbbYmvVIUHM&q-re_6lz-z@82*1xPDw#u^pW&O+g!K*CmU)H~@ z->h(!^)KsR*1xQOS^u(r=Sc{S<#OnWJUjq{uTWz`d9R?=wH>psvioR;8rs^2_#@QAGHU)8^=e^vjg{#E^}`d9U@>R;8rs()4gs{U2|tNK^vpujw}z8Z+-%({JuQ{72UGujyaYzovgp z|C;_a{cHNy^snh()4!&FO~2XFtm$9Vzovgp|C;_a{orEO^snh()4!&FP5+wyHT`S) z*YulH&bt0}{pRnpu76$sy8d-yLAgDF|pZ|GNHl{p8$Ht z*T1fRUH`iNb^Yu5*YyK5S=Y}IWL>|R+pOzf*T1gc{B8Jxtm|LbzoCCa|Azhz{Tuo> z^qbMmhJN$f+0ehCe?$L<{tf*b`Zx4%=!bQ(p?^dFhW-ux8~Qi&Z|L99zoCCaKh%>A z{Tuo>^l#|j(7&O7L;r^U4gDMXH}r4lH!q$I{Tuo>^l#|rBeJ1?L;r^U4gDMXfuU^Z zH(MOD#W7nPv&GrazoCCa|Av0BHGD-j^>6Cm)W4~JQ~#!Z=qQ`|H}!Ao-_*aUe^dXa z{!RUx`Zx7&>fhAAsee=drv6R+oBB8PZ|dLFzo~yy|EB&;{hRtX^>6Ao_Z)N2+0?(O ze^Wp25p&Sl)W4~JQ~##^P5m55HuZy1+0?(Oe^Wmjl1=?)r?aUan98R9P5qntH}!Ao z-_*aUAEpXRk}dsP`nU9N>EF`7r605kXq7GfTl%;3Z|OI?pDq1c`nU9N>EF`7rGHC5 zKp8kITl%;3Z|MiMvZa4ZKf4lkC0qL8t!(Kx=bbJ6Tl%;3n^g`UkS+aN`nU9N>EF`7 zrGHC5oS7~C=EJk4e@p+C{w@7m`nU9N>EF`7rGHERmi{gMTl%;4Z|mRIzpZ~;|F-^Z z{oDG@oM&7Aw*GDX+xoZlgJ#**zpdX)dbah0XW7=jt>27#w)Jo8-`2mae_Q{y{%!r{ z*R!pETmQCxv+UW{zpZ~;|F-^Z{oDG@xMy2GCy{Oa+xoZlo8!*5{%!r+`nUCO>)+PD zt$$npwtluE+xlTR|GN`Q|F-^Z{X6;r_3Y^1(Z8b~mcl7V*-_gILe@Fk0{vG{0`gip2=-<)5qkl*Lj{Y6}JNkF@b0yi)zoUOg|Bn70 z{k!^i_3!H6)xWEMSO2d5UH!ZIclGb;=M1u|e^);YoL&99`gis3>fhDBtAAJjuKr#9 zyZU$a@9Jk1va5er|E~UB{mepmm+)HI)xWEMSO2d5UH!ZIclGb;-_^gXe^>vm{$2gM z`gis3>gOH;C}&syuKr#9yZU$a@9N*xzpH;&|E~UB{d@X3iR|g$)4!*mnaH00J^g$7 z_w?`S=P0tLAAruD{yqJB`uFtj>EF}8r+-iXo_^3W+(q{E@9E#uzo&ms|DJxnCwuz$ z^zZ54)4!*GPye3&J^g$7xsB}U-_yURe^39OelMz!J^e6tVC?Ma-_yURe^39O{yqJB z`uFs+AKBBtr+-iXo_@Y4`}+6w@9W>!zpsB^|Gxfx{rmd&_3!Ivk+QF!tINLref|6T z+0pFl-`Bsde_#K;{(b%X`XTY`>)+SEub)B5zJ9JL`}+6w^G(^;zpsB^|Gxfx{rmd& z_3!K7*T1iSU;n=Tef|6T_x11V-`Bsde_#K;{(b%X`uFwk>)+SEuYX_vzWxLK%xezx zALu{O&%fqC|AGDk{RjFF^dIQw2y>vHkIjMp1N{g35A+}CKhS@m|3LqN{sa97`VaIU z=s(bZp#MPsf&K&i2l@~6ALu{Of1v+B|ABr^CkOfu^dIOy(0`!+K>vaM1N{g35A=ig zInaNg|3LqN{sa97`VaIU=s(bZp#MPsf&N4Nhx)m)9O^&Rf2jXZ|Dpav{fGLw$sFoG z)PJb|Q2(L+L;Z*P5A`4FKh%Gy|4=`pmP7rA`VaN8U@5Khf#^q2y9~b5` zj8hKvAL>8Uf299NzZWIQk^Uq7NBX@$L5}nv=|9qcr2k0&k^Uq7NBWQSGj}=Cf299N zKQA3GYLFxSNBWQSAL;iZ2RYL3g${D0|49Fl{v-WI`n~u;j`SbtKhl4s--{uH#|n=X zRy9ZZkMtkuKhl4s|49Fl{v-WI`j7M<=|9qcr2k0&k$%oKNBUXU9O*yOf299dKQkD{ zJIDHu^&jg$)_<)3SpTv9WBteakM$qxKh}S&pOuYQXUMVsWBp#GA;p#|i ztluj`p#|itp8a5v3@=>Om9x~bG|Hzn51FJDC&xC;CtH zpXg^PbE5x5KLZ>FI4Al~^q=TI(SM@Oa+gs{d5~ss2;_r~3KQ zoa#T-f2#jf|Ec~{{ipg*^`GiL)qkr0RR5{|Q~jsOa+gs{d5~ss2;_Tw+f3pXxu=f2#jf|Ec~{{ipg*^`GiL)qkp=W6Y_39yVwCy{rWLbN%P~&-I_{Ki7Y*|6KpM{&W53`p@;B>p$0juK!&Bx&Cwg=laj}pX)!@f3E*r z|GEBi{a(!_=laj}pX+B%bFTkf|GEBi{pb46^`GlM*MF{`wa>Z!bN%P~&-I_{=kIf_ 
z|6KpM{&W53`Y-ff=)cf^q5nevh5ifu7y2*sU+BNkf1&?E|Aqbw{TKQ#^k3+|(0`%- zLjQ&S3;h@RFZ5sNztDf7|3d$T{tNvV`Y-ff=)cf^q5nevh5ifu7y2*sU+BNkf1&?E z|Aqbw{TKQ#^k3+|(0`%-LjQ&S3;h@RFZ5sNztDf7|3d$T{tNvV`Y-ig>c7-~ssB>{ zrT$C(m-;XDU+TZqf2sdc|E2y*{g?VL^@ z{!9Ir`Y-ig>c7-~ssB>{rT$C(m-;XDU+TZqf2sdc|E2y*{g?VL^gngV7br+zQ`ouB%> z@OOUd_u}9AssE?`pZXc-{M7$b|4;ot_5alWQ@KlOY4hj`tT z{M7$b|4;oab$;sqsoyIZ<){9i`hV*GssE?`pZa<0{M7$bKX)B3NS2@af9n6KpK;Dl z{TzRO>i0tBVYB1K%H!3M@>Bnn{ww`g`mgj~>A%u{rTH(_xb_3($9nEO8=GqEB#mcuk>H(ztZnDV{)bcO8=F9 zuh^6;{a5<0^k3<}(toA@O8=F9zC2g@uk>H(ztVrD|4RRrey=@~EB#)7CRh5e^k3<} z(toA@O25}O$hH1!{nz@h^?Q}5T%Z22t^ZoT z|0hAN^jPkf35#o|F!;W{a)BE*ZRFCPOkM|>%Z22t^Zp8wf<}U*ZQyZU+cft zf35#o|FwRvrITy@*ZQyZU+eeUI=R+=t^Zp8wf<}U*ZRHYPQ2z$uJvE*zt(@P-|LR# zM*od|uZoo${Wto(7D;aO-{`;5f1}^)_2fqXjs6?`H~Me%-{`;5f1}@P`Q%3bjs6?` zUS%sc`fv1mji21;ztMlA|3?3f{u})_`fv2#=)ci_qyI+#js6?`H~Me%-{`;5?^U~U zqyI+#js6?`Ud1an`fv2#=)ckLRlRbf|3?3f{u})_`fv2#=;!~#|A+q%{~!K8{D1iW z@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2D~f1zvX=l{e1hyM@%AO1i5fB66K|Kb0` z|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+` zhyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=> z{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci> z5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q% z{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@% zAO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk z{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$j zKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8 z{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5 zfB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG z`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A z|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW z@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K z|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<# z;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e z|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe z!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0` z|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+` zhyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=> z{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci> z5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q% z{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@% zAO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk z{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$j zKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8 z{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5 zfB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG z`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A z|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW z@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K z|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<# z;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e z|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe z!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0` z|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+` zhyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=> z{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci> z5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|A+q% 
z{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk{y+SG`2X<#;s3+`hyM@% zAO1i5fB66K|Kb0`|A+q%{~!K8{D1iW@c-fe!~ci>5C0$jKm33A|M36e|HJ=>{}2Bk z{y+SG`2X<#;s3+`hyM@%AO1i5fB66K|Kb0`|Cj$S|6l&U{D1lX^8e-k%m0`EFaKZu zzx;ps|MLIk|I7cE|1bYv{=fWx`Tz3&<^Rk7m;W#SU;e-RfBFCN|KN&S=hC-qP2pVU98e^URX{z?6l`X}{I>YvmYvm7UX+rGHBQl>RCGQ~Ia$PwAi1Kc#<4|CIhI{Zsm<^iS!Z z(m$ntO8=DpDg9IWr}R(hpVB|2e@g$9{we)a`ls|y>7UX+rGHBQl>RCGQ~Ia$PwAi5 zKdpaS|Fr&T{nPrV^-t@c*3bW!|1baFY5mjsr}a|Rk!@?DJWOe*TiQTTM@3H@%{r~L$Xa7I@|Jnb~{(tuWv;Uv{|Lp%~|3CZx+5gY} zfA;^g|DXN;?Eh!~Kl}gL|Ihw^_W%2$|BHV6|Jnb~{(tuWv;Uv{|Lp%~|3CZx+5gY} zfA;^g|DXN;?Eh!~Kl}gL|Ihw^_W!g0pZ)*r|7ZU{`~TVh&;Eb*|Fi#}{r}$dzv;LC zpZ)*r|7ZU{`~TVh&;Eb*|Fi#}{r~L$Xa7I@|Jnb~{(tuWv;Uv{|Lp%~|3CZx+5gY} zfA;^g|DXN;?Eh!~Kl}gL|Ihw^_W!g0pZ)*r|7ZU{`~TVh&;Eb*|Fi#}{r~L$Xa7I@ z|Jnb~{(tuWv;Uv{|Lp(0{qEZm24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja5 z0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV% zfMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A` zU>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2! z7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_ zh5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~ zVE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g z7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh= z24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ z0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja5 z0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV% zfMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A` zU>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2! z7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_ zh5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~ zVE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g z7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh= z24EO~VE~2!7zSV%fMEcJ0T>2g7=U2__R#;(|Iq)?j{z74U=RHd{SW;Q{SW;Q{SW;Q z{SW;Q{SW;Q{SW;Q{SW;Q{SW;Q{TP5@0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_ zh5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~ zVE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g z7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSWZ{ZIW*{ZIWEfMEcJ0T>2gPyJ8*PyJ8* zPyJ8*PyJ8*PyJ8*PyJ8*PyJ8*PyJ8*m;OutrT@}@>A&<}`Y-*L{!9O*|I&Zyzw}@F zFa4MPOaG<+(tqi{^k4cf{g?hr|E2%Zf9b#UU-~com;OutrT@}@>A&<}`Y-*L{!2dw zU>Ja1`Y-*L{!2dwU>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5^`1|4aW% z|4aW%|4aW%|4aW%|4TmxU>JbC^uP4K^uP3D0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g z7=U2_h5;A`U>JbC^uP4K^uP4K^uP4K^uP4K^uP4K^uP4K^uP4K^uP4K^uP4K^uP4K z^uP4K^uP4K^uP4K^uP4K^uP4K^uP3D0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_ zh5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~ zVE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g z7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh= z24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ z0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja5 z0EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV% zfMEcJ0T>2g7=U2_h5;A`U>Ja50EPh=24EO~VE~2!7zSV%fMEcJ0T>2g7=U2_h5;A` zU>Ja50EPh=24EO~VE~2!7zSV%fMEc<{in6{XaJ)Dj0P|oz-R!Y0gMJP8o+1(qXCQt zFdD#U0HXnn1~3}HXaJ)Dj0P|oz-R!Y0gMJP8o+1(qXCQtFdD#U0HXnn1~3}HXaJ)D zj0P|oz-R!Y0gMJP8o+1(qXCQtFdD#U0HXnn1~3}HXaJ)Dj0P|oz-R!Y0gMJP8o+1( zqXCQtFdD#U0HXnn1~3}HXaJ)Dj0P|oz-R!Y0gMK)pY_uKMgtfPU^IZy07e5C4PZ2Y z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C 
z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU( z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpKmL8v07e5C4PZ2Y z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU( z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU( z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU( z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR z7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|n zMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y z(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifp zG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C z4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU( z0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy z07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=F zfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfP zU^IZy07e5C4PZ2Y(EvsR7!6=FfYAU(0~ifpG=R|nMgtfPU^IZy07e5C4PZ2Y(EvsR z7!6=FfYAU(0~ifpG=R|nMgtfPAR0h4fM@{G0HOgz1BeC?4ImmoG=OLT(Ey?WL<5Kh z5Dg$2Ks1180MP)V0Yn3c1`rJ(8bCCFXaLawq5(t$hz1Z1AR0h4fM@{G0HOgz1BeC? 
[GIT binary patch literal data omitted]

diff --git a/baselines/models/xlnet/squad_utils.py b/baselines/models/xlnet/squad_utils.py
new file mode 100644
index 0000000..3b4a406
--- /dev/null
+++ b/baselines/models/xlnet/squad_utils.py
@@ -0,0 +1,327 @@
+"""Official evaluation script for SQuAD version 2.0.
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question IDs to the model's predicted probability
+that a question is unanswerable.
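+
+For illustration only (the question ids below are hypothetical), such a file
+might contain: {"query-id-1": 0.92, "query-id-2": 0.03}.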
+""" +import argparse +import collections +import json +import numpy as np +import os +import re +import string +import sys + +OPTS = None + +def parse_args(): + parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') + parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') + parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') + parser.add_argument('--out-file', '-o', metavar='eval.json', + help='Write accuracy metrics to file (default is stdout).') + parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', + help='Model estimates of probability of no answer.') + parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, + help='Predict "" if no-answer probability exceeds this (default = 1.0).') + parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, + help='Save precision-recall curves to directory.') + parser.add_argument('--verbose', '-v', action='store_true') + if len(sys.argv) == 1: + parser.print_help() + sys.exit(1) + return parser.parse_args() + +def make_qid_to_has_ans(dataset): + qid_to_has_ans = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid_to_has_ans[qa['id']] = bool(qa['answers']) + return qid_to_has_ans + +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + def white_space_fix(text): + return ' '.join(text.split()) + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + +def get_tokens(s): + if not s: return [] + return normalize_answer(s).split() + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + +def get_raw_scores(dataset, preds): + exact_scores = {} + f1_scores = {} + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + qid = qa['id'] + gold_answers = [a['text'] for a in qa['answers'] + if normalize_answer(a['text'])] + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + if qid not in preds: + print('Missing prediction for %s' % qid) + continue + a_pred = preds[qid] + # Take max over all gold answers + exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) + f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) + return exact_scores, f1_scores + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + +def 
make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), + ('total', total), + ]) + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + +def plot_pr_curve(precisions, recalls, out_image, title): + plt.step(recalls, precisions, color='b', alpha=0.2, where='post') + plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') + plt.xlabel('Recall') + plt.ylabel('Precision') + plt.xlim([0.0, 1.05]) + plt.ylim([0.0, 1.05]) + plt.title(title) + plt.savefig(out_image) + plt.clf() + +def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, + out_image=None, title=None): + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + true_pos = 0.0 + cur_p = 1.0 + cur_r = 0.0 + precisions = [1.0] + recalls = [0.0] + avg_prec = 0.0 + for i, qid in enumerate(qid_list): + if qid_to_has_ans[qid]: + true_pos += scores[qid] + cur_p = true_pos / float(i+1) + cur_r = true_pos / float(num_true_pos) + if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: + # i.e., if we can put a threshold after this point + avg_prec += cur_p * (cur_r - recalls[-1]) + precisions.append(cur_p) + recalls.append(cur_r) + if out_image: + plot_pr_curve(precisions, recalls, out_image, title) + return {'ap': 100.0 * avg_prec} + +def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, + qid_to_has_ans, out_image_dir): + if out_image_dir and not os.path.exists(out_image_dir): + os.makedirs(out_image_dir) + num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) + if num_true_pos == 0: + return + pr_exact = make_precision_recall_eval( + exact_raw, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_exact.png'), + title='Precision-Recall curve for Exact Match score') + pr_f1 = make_precision_recall_eval( + f1_raw, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_f1.png'), + title='Precision-Recall curve for F1 score') + oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} + pr_oracle = make_precision_recall_eval( + oracle_scores, na_probs, num_true_pos, qid_to_has_ans, + out_image=os.path.join(out_image_dir, 'pr_oracle.png'), + title='Oracle Precision-Recall curve (binary task of HasAns vs. 
NoAns)') + merge_eval(main_eval, pr_exact, 'pr_exact') + merge_eval(main_eval, pr_f1, 'pr_f1') + merge_eval(main_eval, pr_oracle, 'pr_oracle') + +def histogram_na_prob(na_probs, qid_list, image_dir, name): + if not qid_list: + return + x = [na_probs[k] for k in qid_list] + weights = np.ones_like(x) / float(len(x)) + plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) + plt.xlabel('Model probability of no-answer') + plt.ylabel('Proportion of dataset') + plt.title('Histogram of no-answer probability: %s' % name) + plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) + plt.clf() + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: continue + has_ans_cnt += 1 + + if qid not in scores: continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + main_eval['has_ans_exact'] = has_ans_exact + main_eval['has_ans_f1'] = has_ans_f1 + +def main(): + with open(OPTS.data_file) as f: + dataset_json = json.load(f) + dataset = dataset_json['data'] + with open(OPTS.pred_file) as f: + preds = json.load(f) + + new_orig_data = [] + for article in dataset: + for p in article['paragraphs']: + for qa in p['qas']: + if qa['id'] in preds: + new_para = {'qas': [qa]} + new_article = {'paragraphs': [new_para]} + new_orig_data.append(new_article) + dataset = new_orig_data + + if OPTS.na_prob_file: + with open(OPTS.na_prob_file) as f: + na_probs = json.load(f) + else: + na_probs = {k: 0.0 for k in preds} + 
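+  # Note: when no na_prob file is supplied, every no-answer probability is
+  # 0.0, so apply_no_ans_threshold below leaves the raw scores untouched
+  # (0.0 never exceeds the default --na-prob-thresh of 1.0).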
qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
+  has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+  no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+  exact_raw, f1_raw = get_raw_scores(dataset, preds)
+  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
+                                        OPTS.na_prob_thresh)
+  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
+                                     OPTS.na_prob_thresh)
+  out_eval = make_eval_dict(exact_thresh, f1_thresh)
+  if has_ans_qids:
+    has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
+    merge_eval(out_eval, has_ans_eval, 'HasAns')
+  if no_ans_qids:
+    no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
+    merge_eval(out_eval, no_ans_eval, 'NoAns')
+  if OPTS.na_prob_file:
+    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+  if OPTS.na_prob_file and OPTS.out_image_dir:
+    run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
+                                  qid_to_has_ans, OPTS.out_image_dir)
+    histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
+    histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
+  if OPTS.out_file:
+    with open(OPTS.out_file, 'w') as f:
+      json.dump(out_eval, f)
+  else:
+    print(json.dumps(out_eval, indent=2))
+
+if __name__ == '__main__':
+  OPTS = parse_args()
+  if OPTS.out_image_dir:
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+  main()
diff --git a/baselines/models/xlnet/summary.py b/baselines/models/xlnet/summary.py
new file mode 100644
index 0000000..b0ed5b1
--- /dev/null
+++ b/baselines/models/xlnet/summary.py
@@ -0,0 +1,128 @@
+# -*- coding: utf-8 -*-
+'''
+Print a summary table of dev/test (and optional challenge) results.
+'''
+from __future__ import print_function
+import json
+import os
+import re
+import sys
+
+import numpy as np
+from prettytable import PrettyTable
+
+if sys.version_info[0] == 2:
+    # Python 2 only: default to UTF-8 so non-ASCII fields print cleanly.
+    reload(sys)  # noqa: F821
+    sys.setdefaultencoding('utf-8')
+
+def print_summary():
+    lscmd = os.popen('ls '+sys.argv[1]+'/result.*').read()
+    result_list = lscmd.split()
+    num_args = len(result_list)
+    assert num_args==2 or num_args==3
+
+    # Each result.* file holds one JSON object per line, e.g.
+    # {"AVERAGE": ..., "EM": ..., "F1": ..., "FILE": ...}.
+    dev_input_file = open(sys.argv[1]+'/result.dev', 'r')
+    test_input_file = open(sys.argv[1]+'/result.test', 'r')
+    if num_args==2:
+        print_table = PrettyTable(['#','DEV-AVG','DEV-EM','DEV-F1','TEST-AVG','TEST-EM','TEST-F1','FILE'])
+    elif num_args==3:
+        chl_input_file = open(sys.argv[1]+'/result.challenge', 'r')
+        print_table = PrettyTable(['#','DEV-AVG','DEV-EM','DEV-F1','TEST-AVG','TEST-EM','TEST-F1','CHL-AVG','CHL-EM','CHL-F1','FILE'])
+
+    # style set
+    print_table.align['FILE'] = 'l'
+    print_table.float_format = '2.3'
+
+    # data fill
+    dev_avg = []
+    dev_em = []
+    dev_f1 = []
+    dev_file = []
+    for dline in dev_input_file.readlines():
+        dline = dline.strip()
+        if re.search('^{', dline):
+            ddict = json.loads(dline)
+            dev_avg.append(float(ddict['AVERAGE']))
+            dev_em.append(float(ddict['EM']))
+            dev_f1.append(float(ddict['F1']))
+            dev_file.append(ddict['FILE'])
+
+    test_avg = []
+    test_em = []
+    test_f1 = []
+    test_file = []
+    for dline in test_input_file.readlines():
+        dline = dline.strip()
+        if re.search('^{', dline):
+            ddict = json.loads(dline)
+            test_avg.append(float(ddict['AVERAGE']))
+            test_em.append(float(ddict['EM']))
+            test_f1.append(float(ddict['F1']))
+            test_file.append(ddict['FILE'])
+
+    if num_args==3:
+        chl_avg = []
+        chl_em = []
+        chl_f1 = []
+        chl_file = []
+        for dline in 
chl_input_file.readlines(): + dline = dline.strip() + if re.search('^{', dline): + ddict = json.loads(dline) + chl_avg.append(float(ddict['AVERAGE'])) + chl_em.append(float(ddict['EM'])) + chl_f1.append(float(ddict['F1'])) + chl_file.append(ddict['FILE']) + + # print + if num_args == 2: + min_len = min(len(dev_avg),len(test_avg)) + for k in range(min_len): + print_table.add_row([k+1, dev_avg[k], dev_em[k], dev_f1[k], test_avg[k], test_em[k], test_f1[k], dev_file[k]]) + elif num_args == 3: + min_len = min(len(dev_avg),len(test_avg),len(chl_avg)) + for k in range(min_len): + print_table.add_row([k+1, dev_avg[k], dev_em[k], dev_f1[k], test_avg[k], test_em[k], test_f1[k], chl_avg[k], chl_em[k], chl_f1[k], dev_file[k]]) + + if len(sys.argv)==3: + sk = sys.argv[2].upper() + print('sort key detected: {}'.format(sk)) + print(print_table.get_string(sortby=sk, reversesort=True)) + else: + print(print_table) + + + if num_args == 2: + summary_table = PrettyTable(['#','DEV-AVG','DEV-EM','DEV-F1','TEST-AVG','TEST-EM','TEST-F1','FILE']) + summary_table.add_row(["M", np.max(dev_avg), np.max(dev_em), np.max(dev_f1), + np.max(test_avg), np.max(test_em), np.max(test_f1),"-"]) + summary_table.add_row(["A", np.mean(dev_avg), np.mean(dev_em), np.mean(dev_f1), + np.mean(test_avg), np.mean(test_em), np.mean(test_f1),"-"]) + summary_table.add_row(["D", np.std(dev_avg), np.std(dev_em), np.std(dev_f1), + np.std(test_avg), np.std(test_em), np.std(test_f1),"-"]) + elif num_args == 3: + summary_table = PrettyTable(['#','DEV-AVG','DEV-EM','DEV-F1','TEST-AVG','TEST-EM','TEST-F1','CHL-AVG','CHL-EM','CHL-F1','FILE']) + summary_table.add_row(["M", np.max(dev_avg), np.max(dev_em), np.max(dev_f1), + np.max(test_avg), np.max(test_em), np.max(test_f1), + np.max(chl_avg), np.max(chl_em), np.max(chl_f1), "-"]) + summary_table.add_row(["A", np.mean(dev_avg), np.mean(dev_em), np.mean(dev_f1), + np.mean(test_avg), np.mean(test_em), np.mean(test_f1), + np.mean(chl_avg), np.mean(chl_em), np.mean(chl_f1), "-"]) + summary_table.add_row(["D", np.std(dev_avg), np.std(dev_em), np.std(dev_f1), + np.std(test_avg), np.std(test_em), np.std(test_f1), + np.std(chl_avg), np.std(chl_em), np.std(chl_f1), "-"]) + # style set + summary_table.align['FILE'] = 'l' + summary_table.float_format = '2.3' + print(summary_table) + return 0 + + + + +if __name__ == '__main__': + print_summary() + diff --git a/baselines/models/xlnet/temp.sh b/baselines/models/xlnet/temp.sh new file mode 100644 index 0000000..a3af974 --- /dev/null +++ b/baselines/models/xlnet/temp.sh @@ -0,0 +1,2 @@ +a=`pwd` +echo $a diff --git a/baselines/models/xlnet/tpu/run_classifier_inews.sh b/baselines/models/xlnet/tpu/run_classifier_inews.sh new file mode 100755 index 0000000..74f7e0e --- /dev/null +++ b/baselines/models/xlnet/tpu/run_classifier_inews.sh @@ -0,0 +1,28 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="inews" +export XLNET_DIR=gs://models_zxw/prev_trained_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --spiece_model_file=${CURRENT_DIR}/../spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + 
--uncased=False \ + --data_dir=$DATA_DIR \ + --output_dir=${OUTPUT_DIR} \ + --model_dir=${OUTPUT_DIR} \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=8 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=True diff --git a/baselines/models/xlnet/tpu/run_classifier_lcqmc.sh b/baselines/models/xlnet/tpu/run_classifier_lcqmc.sh new file mode 100755 index 0000000..51e49e2 --- /dev/null +++ b/baselines/models/xlnet/tpu/run_classifier_lcqmc.sh @@ -0,0 +1,28 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="lcqmc" +export XLNET_DIR=gs://models_zxw/prev_trained_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --spiece_model_file=${CURRENT_DIR}/../spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$DATA_DIR \ + --output_dir=${OUTPUT_DIR} \ + --model_dir=${OUTPUT_DIR} \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=8 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=True diff --git a/baselines/models/xlnet/tpu/run_classifier_tnews.sh b/baselines/models/xlnet/tpu/run_classifier_tnews.sh new file mode 100755 index 0000000..864bd66 --- /dev/null +++ b/baselines/models/xlnet/tpu/run_classifier_tnews.sh @@ -0,0 +1,28 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="tnews" +export XLNET_DIR=gs://models_zxw/prev_trained_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/hard_${TASK_NAME}_1 +export OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --spiece_model_file=${CURRENT_DIR}/../spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=True \ + --uncased=False \ + --data_dir=$DATA_DIR \ + --output_dir=${OUTPUT_DIR} \ + --model_dir=${OUTPUT_DIR} \ + --train_batch_size=16 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=8 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=1e-5 \ + --save_steps=1000 \ + --use_tpu=True --tpu=grpc://192.168.0.2:8470 diff --git a/baselines/models/xlnet/tpu/run_classifier_xnli.sh b/baselines/models/xlnet/tpu/run_classifier_xnli.sh new file mode 100755 index 0000000..df48093 --- /dev/null +++ b/baselines/models/xlnet/tpu/run_classifier_xnli.sh @@ -0,0 +1,28 @@ +CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P) +CURRENT_TIME=$(date "+%Y%m%d-%H%M%S") +TASK_NAME="xnli" +export XLNET_DIR=gs://models_zxw/prev_trained_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12 +export DATA_DIR=gs://data_zxw/nlp/chineseGLUEdatasets.v0.0.1/$TASK_NAME +export 
OUTPUT_DIR=gs://models_zxw/fine_tuning_models/nlp/xlnet-base/chinese_xlnet_base_L-12_H-768_A-12/tpu/$TASK_NAME/$CURRENT_TIME + +python $CURRENT_DIR/../run_classifier.py \ + --spiece_model_file=${CURRENT_DIR}/../spiece.model \ + --model_config_path=${XLNET_DIR}/xlnet_config.json \ + --init_checkpoint=${XLNET_DIR}/xlnet_model.ckpt \ + --task_name=$TASK_NAME \ + --do_train=True \ + --do_eval=True \ + --eval_all_ckpt=False \ + --uncased=False \ + --data_dir=$DATA_DIR \ + --output_dir=${OUTPUT_DIR} \ + --model_dir=${OUTPUT_DIR} \ + --train_batch_size=32 \ + --eval_batch_size=8 \ + --num_hosts=1 \ + --num_core_per_host=8 \ + --num_train_epochs=3 \ + --max_seq_length=128 \ + --learning_rate=2e-5 \ + --save_steps=1000 \ + --use_tpu=True diff --git a/baselines/models/xlnet/tpu_estimator.py b/baselines/models/xlnet/tpu_estimator.py new file mode 100644 index 0000000..cc0f801 --- /dev/null +++ b/baselines/models/xlnet/tpu_estimator.py @@ -0,0 +1,3522 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# =================================================================== +"""TPUEstimator class.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import copy +import os +import signal +import sys +import threading +import time + +import numpy as np +import six +from six.moves import queue as Queue # pylint: disable=redefined-builtin +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.contrib.tpu.proto import compilation_result_pb2 as tpu_compilation_result +from tensorflow.contrib.tpu.python.tpu import tensor_tracer +from tensorflow.contrib.tpu.python.ops import tpu_ops +from tensorflow.contrib.tpu.python.tpu import error_handling +from tensorflow.contrib.tpu.python.tpu import session_support +from tensorflow.contrib.tpu.python.tpu import tpu +from tensorflow.contrib.tpu.python.tpu import tpu_config +from tensorflow.contrib.tpu.python.tpu import tpu_context +from tensorflow.contrib.tpu.python.tpu import tpu_feed +from tensorflow.contrib.tpu.python.tpu import training_loop +from tensorflow.contrib.tpu.python.tpu import util as util_lib +from tensorflow.contrib.training.python.training import hparam +from tensorflow.core.framework import variable_pb2 +from tensorflow.core.framework.summary_pb2 import Summary +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.client import session as tf_session +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import nest as data_nest +from tensorflow.python.estimator import estimator as estimator_lib +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.estimator.export import export_output as export_output_lib +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops 
+from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import summary_ops_v2 as contrib_summary +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import tf_logging as logging +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.summary import summary +from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import evaluation +from tensorflow.python.training import session_run_hook +from tensorflow.python.training import training +from tensorflow.python.training import training_util +from tensorflow.python.util import function_utils +from tensorflow.python.util import nest +from tensorflow.python.util import tf_inspect + +_INITIAL_LOSS = 1e7 +_ZERO_LOSS = 0. +_TPU_ESTIMATOR = 'custom_tpu_estimator' +_ITERATIONS_PER_LOOP_VAR = 'iterations_per_loop' +_BATCH_SIZE_KEY = 'batch_size' +_CTX_KEY = 'context' +_USE_TPU_KEY = 'use_tpu' +_CROSS_REPLICA_SUM_OP = 'CrossReplicaSum' +_ONE_GIGABYTE = 1024 * 1024 * 1024 +_TPU_ENQUEUE_OPS = '_tpu_enqueue_ops' +_TPU_TRAIN_OP = '_tpu_train_op' +_REWRITE_FOR_INFERENCE_MODE = '_rewrite_for_inference' + +# Ideally _USE_TPU_KEY should be reserved as well. However there are already +# models that make use of this key, thus it can not be reserved now to prevent +# breakage. In the long run, we would like to mitigate this by migrating models +# off of using _USE_TPU_KEY. +_RESERVED_PARAMS_KEYS = [_BATCH_SIZE_KEY, _CTX_KEY] + +# TODO(b/65703635): Flip the value and remove all dead code. Currently, this is +# only used for per-core based deployments. For per-host based pipelines, if a +# user returns a Dataset instance it will be automatically wrapped in a +# tf.while_loop (This can be disabled by returning features and labels +# explicitly). +_WRAP_INPUT_FN_INTO_WHILE_LOOP = False + +ops.register_proto_function( + '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR), + proto_type=variable_pb2.VariableDef, + to_proto=resource_variable_ops._to_proto_fn, # pylint: disable=protected-access + from_proto=resource_variable_ops._from_proto_fn) # pylint: disable=protected-access + + +def _is_iterable(obj): + """A Python 2 and 3 compatible util to check whether `obj` is iterable.""" + try: + iter(obj) + return True + except TypeError: + return False + + +def _create_global_step(graph): + graph = graph or ops.get_default_graph() + if training.get_global_step(graph) is not None: + raise ValueError('"global_step" already exists.') + # Create in proper graph and base name_scope. + with graph.as_default() as g, g.name_scope(None): + return variable_scope.get_variable( + ops.GraphKeys.GLOBAL_STEP, + shape=[], + dtype=dtypes.int64, + initializer=init_ops.zeros_initializer(), + trainable=False, + use_resource=True, + collections=[ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.GLOBAL_STEP]) + + +def _create_or_get_iterations_per_loop(): + """Creates or gets the iterations_per_loop variable. + + In TPUEstimator, the user provided computation, the model_fn, is wrapped + inside a tf.while_loop for peak performance. 
The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each TPU
+  program execution and before the next TPU execution.
+
+  The purpose of using a variable, rather than a constant, is to allow
+  TPUEstimator to adapt the TPU training iterations according to the final
+  steps specified by users. For example, if the user sets iterations_per_loop
+  as 4 in TPUConfig and steps as 10 in TPUEstimator.train(), the
+  iterations_per_loop variable will take the following values before each TPU
+  training loop.
+
+      - 1st TPU execution: iterations_per_loop = 4
+      - 2nd TPU execution: iterations_per_loop = 4
+      - 3rd TPU execution: iterations_per_loop = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multiple iterations_per_loop variables are found.
+  """
+  graph = ops.get_default_graph()
+  collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
+  iter_vars = graph.get_collection(collection_name)
+  if len(iter_vars) == 1:
+    return iter_vars[0]
+  elif len(iter_vars) > 1:
+    raise RuntimeError('Multiple iterations_per_loop_var in collection.')
+
+  with ops.colocate_with(training_util.get_global_step()):
+    with variable_scope.variable_scope(
+        _TPU_ESTIMATOR, reuse=variable_scope.AUTO_REUSE):
+      return variable_scope.get_variable(
+          _ITERATIONS_PER_LOOP_VAR,
+          initializer=init_ops.zeros_initializer(),
+          shape=[],
+          dtype=dtypes.int32,
+          trainable=False,
+          collections=[collection_name, ops.GraphKeys.LOCAL_VARIABLES],
+          use_resource=True)
+
+
+def _sync_variables_ops(ctx):
+  """Create variables synchronization ops.
+
+  Gets the variables back from TPU nodes. This means the variables updated
+  by TPU will now be *synced* to host memory.
+  In BROADCAST mode, we skip this sync since the variables are usually too
+  big to transmit via RPC.
+
+  Args:
+    ctx: A `_InternalTPUContext` instance with mode.
+
+  Returns:
+    A list of sync ops.
+  """
+
+  if not ctx.is_input_broadcast_with_iterators():
+    return [
+        array_ops.check_numerics(v.read_value(),
+                                 'Gradient for %s is NaN' % v.name).op
+        for v in variables.trainable_variables()
+    ]
+  else:
+    return [control_flow_ops.no_op()]
+
+
+def _increase_eval_step_op(iterations_per_loop):
+  """Returns an op to increase the eval step for TPU evaluation.
+
+  Args:
+    iterations_per_loop: Tensor. The number of eval steps running in the TPU
+      system before returning to the CPU host for each `Session.run`.
+
+  Returns:
+    An operation.
+  """
+  eval_step = evaluation._get_or_create_eval_step()  # pylint: disable=protected-access
+  # Estimator evaluation increases the eval step by 1 per run, so we add the
+  # remaining difference here.
+  return state_ops.assign_add(
+      eval_step,
+      math_ops.cast(iterations_per_loop - 1, dtype=eval_step.dtype),
+      use_locking=True)
+
+
+def _extract_key_names(tensor_or_dict):
+  if isinstance(tensor_or_dict, dict):
+    return sorted(tensor_or_dict.keys())
+  return []
+
+
+class _SIGNAL(object):
+  """Signal used to control the thread of infeed/outfeed.
+
+  All preserved signals must be negative numbers. Positive numbers are used to
+  indicate the number of iterations for the next training/evaluation loop.
+  """
+  NEXT_BATCH = -1
+  STOP = -2
+
+
+class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+  """Ops and objects returned from a `model_fn` and passed to `TPUEstimator`.
+
+  See `EstimatorSpec` for `mode`, `predictions`, `loss`, `train_op`, and
+  `export_outputs`.
+
+  For evaluation, `eval_metrics` is a tuple of `metric_fn` and `tensors`, where
+  `metric_fn` runs on CPU to generate metrics and `tensors` represents the
+  `Tensor`s transferred from TPU system to CPU host and passed to `metric_fn`.
+  To be precise, TPU evaluation expects a slightly different signature from the
+  `tf.estimator.Estimator`. While `EstimatorSpec.eval_metric_ops` expects a
+  dict, `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`.
+  The `tensors` could be a list of `Tensor`s or dict of names to `Tensor`s. The
+  `tensors` usually specify the model logits, which are transferred back from
+  TPU system to CPU host. All tensors must be batch-major, i.e., the batch
+  size is the first dimension. Once all tensors are available at CPU host from
+  all shards, they are concatenated (on CPU) and passed as positional arguments
+  to the `metric_fn` if `tensors` is a list, or as keyword arguments if
+  `tensors` is a dict. `metric_fn` takes the `tensors` and returns a dict from
+  metric string name to the result of calling a metric function, namely a
+  `(metric_tensor, update_op)` tuple. See `TPUEstimator` for an MNIST example
+  of how to specify `eval_metrics`.
+
+  `scaffold_fn` is a function running on CPU to generate the `Scaffold`. This
+  function should not capture any Tensors in `model_fn`.
+
+  `host_call` is a tuple of a `function` and a list or dictionary of `tensors`
+  to pass to that function; the function returns a list of Tensors.
+  `host_call` currently works for train() and evaluate(). The function is
+  executed on the CPU on every step, so there is communication overhead when
+  sending tensors from TPU to CPU. To reduce the overhead, try reducing the
+  size of the tensors. The `tensors` are concatenated along their major (batch)
+  dimension, and so must be >= rank 1. The `host_call` is useful for writing
+  summaries with `tf.contrib.summary.create_file_writer`.
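+
+  A minimal sketch of `eval_metrics` (the names `labels` and `logits` below
+  are illustrative, not part of this module):
+
+  ```
+  def metric_fn(labels, logits):
+    predictions = tf.argmax(logits, 1)
+    return {'accuracy': tf.metrics.accuracy(labels, predictions)}
+
+  spec = TPUEstimatorSpec(
+      mode=mode, loss=loss, train_op=train_op,
+      eval_metrics=(metric_fn, [labels, logits]))
+  ```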
+ """ + + def __new__(cls, + mode, + predictions=None, + loss=None, + train_op=None, + eval_metrics=None, + export_outputs=None, + scaffold_fn=None, + host_call=None, + training_hooks=None, + evaluation_hooks=None, + prediction_hooks=None): + """Creates a validated `TPUEstimatorSpec` instance.""" + host_calls = {} + if eval_metrics is not None: + host_calls['eval_metrics'] = eval_metrics + if host_call is not None: + host_calls['host_call'] = host_call + _OutfeedHostCall.validate(host_calls) + + training_hooks = tuple(training_hooks or []) + evaluation_hooks = tuple(evaluation_hooks or []) + prediction_hooks = tuple(prediction_hooks or []) + + for hook in training_hooks + evaluation_hooks + prediction_hooks: + if not isinstance(hook, session_run_hook.SessionRunHook): + raise TypeError('All hooks must be SessionRunHook instances, given: {}' + .format(hook)) + + return super(TPUEstimatorSpec, cls).__new__( + cls, + mode=mode, + predictions=predictions, + loss=loss, + train_op=train_op, + eval_metrics=eval_metrics, + export_outputs=export_outputs, + scaffold_fn=scaffold_fn, + host_call=host_call, + training_hooks=training_hooks, + evaluation_hooks=evaluation_hooks, + prediction_hooks=prediction_hooks) + + def as_estimator_spec(self): + """Creates an equivalent `EstimatorSpec` used by CPU train/eval.""" + host_calls = {} + if self.eval_metrics is not None: + host_calls['eval_metrics'] = self.eval_metrics + if self.host_call is not None: + host_calls['host_call'] = self.host_call + host_call_ret = _OutfeedHostCall.create_cpu_hostcall(host_calls) + eval_metric_ops = None + if self.eval_metrics is not None: + eval_metric_ops = host_call_ret['eval_metrics'] + hooks = None + if self.host_call is not None: + hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])] + if tensor_tracer.TensorTracer.is_enabled(): + tt = tensor_tracer.TensorTracer() + tracing_calls = tt.trace_cpu(ops.get_default_graph()) + tracing_call_ret = _OutfeedHostCall.create_cpu_hostcall(tracing_calls) + tracing_functions = tracing_call_ret.values() + if tracing_functions: + if hooks: + hooks.extend([_OutfeedHostCallHook(tracing_functions)]) + else: + hooks = [_OutfeedHostCallHook(tracing_functions)] + hooks = tuple(hooks or []) + scaffold = self.scaffold_fn() if self.scaffold_fn else None + return model_fn_lib.EstimatorSpec( + mode=self.mode, + predictions=self.predictions, + loss=self.loss, + train_op=self.train_op, + eval_metric_ops=eval_metric_ops, + export_outputs=self.export_outputs, + scaffold=scaffold, + training_hooks=self.training_hooks + hooks, + evaluation_hooks=self.evaluation_hooks + hooks, + prediction_hooks=self.prediction_hooks + hooks) + + +class _OpQueueContext(object): + """Manages work queue and thread for a infeed/outfeed thread.""" + + def __init__(self, name, target, args): + self._name = name + self._queue = Queue.Queue() + args = (self,) + args + self._thread = threading.Thread(name=name, target=target, args=args) + self._thread.daemon = True + self._thread.start() + + def stop(self): + self._queue.put(_SIGNAL.STOP) + + def send_next_batch_signal(self, iterations): + self._queue.put(iterations) + + def read_iteration_counts(self): + while True: + iterations = self._queue.get(block=True) + logging.debug('%s read iterations %s', self._name, iterations) + if iterations == _SIGNAL.STOP: + logging.info('%s received shutdown signal, stopping.', self._name) + return + yield iterations + + def join(self): + logging.info('Shutting down %s thread.', self._name) + self.stop() + self._thread.join() + + +class 
_OpSignalOnceQueueContext(_OpQueueContext): + """Manages work queue and thread for a infeed/outfeed thread. + + This subclass only signals once. + """ + + def __init__(self, name, target, args): + super(_OpSignalOnceQueueContext, self).__init__(name, target, args) + self._has_signaled = False + + def send_next_batch_signal(self, iterations): + if not self._has_signaled: + self._queue.put(iterations) + self._has_signaled = True + + +class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook): + """A Session hook setting up the TPU initialization, infeed, and outfeed. + + This hook does two major things: + 1. initialize and shutdown TPU system. + 2. launch and join the threads for infeed enqueue and (optional) outfeed + dequeue. + """ + + def __init__(self, + ctx, + enqueue_ops, + dequeue_ops, + tpu_compile_op, + run_infeed_loop_on_coordinator=True, + rendezvous=None, + master=None, + session_config=None): + self._master_job = ctx.master_job + self._enqueue_ops = enqueue_ops + self._dequeue_ops = dequeue_ops + self._rendezvous = rendezvous + self._master = master + self._session_config = session_config + self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator + self._initial_infeed_sleep_secs = ( + ctx.config.tpu_config.initial_infeed_sleep_secs) + + self._feed_error = None + self._finished = False + self._should_initialize_tpu = True + self._tpu_compile_op = tpu_compile_op + + def begin(self): + logging.info('TPU job name %s', self._master_job) + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + self._init_ops = [] + if self._should_initialize_tpu: + self._finalize_ops = [tpu.shutdown_system(job=self._master_job)] + else: + self._finalize_ops = [] + + summary_writer_init_ops = contrib_summary.summary_writer_initializer_op() + self._init_ops.extend(summary_writer_init_ops) + # Get all the writer resources from the initializer, so we know what to + # flush. 
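+    # (Each initializer op's first input is the writer resource; flushing it
+    # in the finalize ops makes sure buffered summaries reach disk.)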
+ for op in summary_writer_init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) + + def _run_infeed(self, queue_ctx, session): + logging.info('Starting infeed thread controller.') + if self._initial_infeed_sleep_secs: + logging.info('Infeed thread sleeping for %d seconds.', + self._initial_infeed_sleep_secs) + time.sleep(self._initial_infeed_sleep_secs) + logging.info('Infeed thread starting after sleep') + + with self._rendezvous.catch_errors(source='infeed', session=session): + if self._run_infeed_loop_on_coordinator: + for count, steps in enumerate(queue_ctx.read_iteration_counts()): + for i in xrange(steps): + logging.debug('Infeed enqueue for iteration (%d, %d)', count, i) + session.run(self._enqueue_ops) + else: + for _ in queue_ctx.read_iteration_counts(): + session.run(self._enqueue_ops) + logging.info('Infeed thread finished, shutting down.') + + def _run_outfeed(self, queue_ctx, session): + logging.info('Starting outfeed thread controller.') + with self._rendezvous.catch_errors(source='outfeed', session=session): + for count, steps in enumerate(queue_ctx.read_iteration_counts()): + for i in xrange(steps): + logging.debug('Outfeed dequeue for iteration (%d, %d)', count, i) + session.run(self._dequeue_ops) + logging.info('Outfeed thread finished, shutting down.') + + def _create_infeed_controller(self, name, target, args): + return _OpQueueContext(name=name, target=target, args=args) + + def _assertCompilationSucceeded(self, result, coord): + proto = tpu_compilation_result.CompilationResultProto() + proto.ParseFromString(result) + if proto.status_error_message: + logging.error('Compilation failed: {}'.format(proto.status_error_message)) + coord.request_stop() + else: + logging.info('Compilation succeeded') + + def after_create_session(self, session, coord): + if self._should_initialize_tpu: + logging.info('Init TPU system') + start = time.time() + with ops.Graph().as_default(): + with tf_session.Session( + self._master, config=self._session_config) as sess: + sess.run(tpu.initialize_system(job=self._master_job)) + logging.info('Initialized TPU in %d seconds', time.time() - start) + + session.run(self._init_ops, + options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000)) + + if os.environ.get('TPU_SPLIT_COMPILE_AND_EXECUTE', '') == '1': + logging.info('Compiling user program: this may take a while...') + self._assertCompilationSucceeded(session.run(self._tpu_compile_op), coord) + + self._infeed_controller = self._create_infeed_controller( + name='InfeedController', target=self._run_infeed, args=(session,)) + + self._outfeed_controller = _OpQueueContext( + name='OutfeedController', target=self._run_outfeed, args=(session,)) + + # Enable the worker watchdog to terminate workers on coordinator exit. 
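+    # (Controlled via the TF_TPU_WATCHDOG_TIMEOUT environment variable; the
+    # default of 0 leaves the watchdog disabled.)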
+    watchdog_timeout = int(os.environ.get('TF_TPU_WATCHDOG_TIMEOUT', '0'))
+    if watchdog_timeout > 0:
+      session_support.start_worker_watchdog(session,
+                                            shutdown_timeout=watchdog_timeout)
+
+  def before_run(self, run_context):
+    self._feed_error = None
+
+    iterations = run_context.session.run(self._iterations_per_loop_var)
+
+    logging.info('Enqueue next (%d) batch(es) of data to infeed.', iterations)
+    self._infeed_controller.send_next_batch_signal(iterations)
+
+    logging.info('Dequeue next (%d) batch(es) of data from outfeed.',
+                 iterations)
+    self._outfeed_controller.send_next_batch_signal(iterations)
+
+  def end(self, session):
+    self._finished = True
+    logging.info('Stop infeed thread controller')
+    self._infeed_controller.join()
+    self._rendezvous.record_done('infeed')
+
+    logging.info('Stop outfeed thread controller')
+    self._outfeed_controller.join()
+    self._rendezvous.record_done('outfeed')
+
+    logging.info('Shutdown TPU system.')
+    session.run(self._finalize_ops)
+
+
+class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
+
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, tpu_compile_op,
+               rendezvous=None, master=None, session_config=None):
+    super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
+        ctx,
+        enqueue_ops,
+        dequeue_ops,
+        tpu_compile_op=tpu_compile_op,
+        run_infeed_loop_on_coordinator=False,
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
+
+  def _create_infeed_controller(self, name, target, args):
+    return _OpSignalOnceQueueContext(name=name, target=target, args=args)
+
+
+class _TPUStopAtStepHook(session_run_hook.SessionRunHook):
+  """Hook that requests stop at a specified step.
+
+  This hook is similar to the `session_run_hook._StopAfterNEvalsHook` with
+  the following differences for TPU training:
+
+  1. This hook sets the variable for iterations_per_loop, which is used by
+     `TPUInfeedOutfeedSessionHook` to control the iterations for infeed/outfeed.
+     As the hook execution order is not guaranteed, the variable update is
+     handled in `after_create_session` and `after_run` as
+     `TPUInfeedOutfeedSessionHook` reads the variable value in `before_run`.
+
+  2. For each training loop (session.run), the global step could be increased
+     multiple times on TPU. The global step tensor value will be explicitly
+     read again in `after_run` to ensure the latest value is retrieved, to
+     avoid a race condition.
+  """
+
+  def __init__(self, iterations, num_steps=None, last_step=None):
+    """Initializes a `_TPUStopAtStepHook`.
+
+    Args:
+      iterations: The number of iterations to run the optimizer per training
+        loop.
+      num_steps: Number of steps to execute.
+      last_step: Step after which to stop.
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
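+
+    For example (illustrative numbers, mirroring the iterations_per_loop
+    docstring above): with `iterations=4` and `num_steps=10`, successive
+    training loops load 4, 4, and finally 2 into the iterations_per_loop
+    variable, so exactly 10 global steps run in total.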
+ """ + if num_steps is None and last_step is None: + raise ValueError('One of num_steps or last_step must be specified.') + if num_steps is not None and last_step is not None: + raise ValueError('Only one of num_steps or last_step can be specified.') + self._num_steps = num_steps + self._last_step = last_step + self._iterations = iterations + + def _next_iterations(self, global_step, last_step): + gap = last_step - global_step + return min(gap, self._iterations) + + def begin(self): + self._global_step_tensor = training_util.get_global_step() + if self._global_step_tensor is None: + raise RuntimeError('Global step should be created.') + + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + + def after_create_session(self, session, coord): + global_step = session.run(self._global_step_tensor) + if self._last_step is None: + self._last_step = global_step + self._num_steps + + iterations = self._next_iterations(global_step, self._last_step) + + self._iterations_per_loop_var.load(iterations, session=session) + + def after_run(self, run_context, run_values): + # Global step cannot be retrieved via SessionRunArgs and before_run due to + # race condition. + global_step = run_context.session.run(self._global_step_tensor) + if global_step >= self._last_step: + run_context.request_stop() + else: + iterations = self._next_iterations(global_step, self._last_step) + self._iterations_per_loop_var.load( + iterations, session=run_context.session) + + +class _SetEvalIterationsHook(session_run_hook.SessionRunHook): + """Hook that requests stop at a specified step.""" + + def __init__(self, num_steps): + """Initializes a `_SetEvalIterationsHook`. + + Args: + num_steps: Number of steps to execute. + """ + self._num_steps = num_steps + + def begin(self): + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + + def after_create_session(self, session, coord): + self._iterations_per_loop_var.load(self._num_steps, session=session) + + +class _StoppingPredictHook(session_run_hook.SessionRunHook): + """Hook that requests stop according to the stopping signal in prediction.""" + + def __init__(self, scalar_stopping_signal): + self._scalar_stopping_signal = scalar_stopping_signal + + def begin(self): + self._iterations_per_loop_var = _create_or_get_iterations_per_loop() + + def after_create_session(self, session, coord): + # This is not necessary as we do not run infeed enqueue and outfeed dequeue + # in side threads for prediction model. But it makes the + # TPUInfeedOutfeedSessionHook prints nice message. + self._iterations_per_loop_var.load(1, session=session) + + def before_run(self, run_context): + return session_run_hook.SessionRunArgs(self._scalar_stopping_signal) + + def after_run(self, run_context, run_values): + _ = run_context + scalar_stopping_signal = run_values.results + if _StopSignals.should_stop(scalar_stopping_signal): + # NOTE(xiejw): In prediction, stopping signals are inserted for each + # batch. And we append one more batch to signal the system it should stop. + # The data flow might look like + # + # batch 0: images, labels, stop = 0 (user provided) + # batch 1: images, labels, stop = 0 (user provided) + # ... + # batch 99: images, labels, stop = 0 (user provided) + # batch 100: images, labels, stop = 1 (TPUEstimator appended) + # + # where the final batch (id = 100) is appended by TPUEstimator, so we + # should drop it before returning the predictions to user. + # To achieve that, we throw the OutOfRangeError in after_run. 
+      # Monitored Session sees this error in SessionRunHook.after_run, the
+      # "current" prediction, i.e., the batch with id=100, will be discarded
+      # immediately.
+      raise errors.OutOfRangeError(None, None, 'Stopped by stopping signal.')
+
+
+def generate_per_core_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, host_device, host_id):
+  """Generates infeed enqueue ops for per-core input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A fn that returns enqueue_ops."""
+    num_cores_per_host = ctx.num_of_cores_per_host
+    per_host_sharded_inputs = []
+    for core_ordinal in range(num_cores_per_host):
+      with ops.name_scope('ordinal_%d' % (core_ordinal)):
+        user_context = tpu_context.TPUContext(
+            internal_ctx=ctx,
+            input_device=host_device,
+            invocation_index=host_id * ctx.num_of_cores_per_host + core_ordinal)
+        inputs = _Inputs.from_input_fn(input_fn(user_context))
+        if inputs.is_dataset:
+          raise TypeError(
+              '`input_fn` returning `Dataset` is not supported in '
+              'per-Core input pipeline deployment yet. Please set '
+              'TPUConfig.per_host_input_for_training to True or return '
+              '`features` and `labels` from `input_fn`')
+        features, labels = inputs.features_and_labels()
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels))
+        per_host_sharded_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+
+    per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+        per_host_sharded_inputs, tpu_ordinal_function=tpu_ordinal_function_impl)
+    return per_host_enqueue_ops
+
+  return enqueue_ops_fn, captured_infeed_queue
+
+
+def generate_per_host_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, batch_axis, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+      if batch_axis is not None:
+        raise TypeError('For mode PREDICT, batch_axis is not supported yet.')
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+
+  tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """A fn that returns the TPU infeed enqueue ops.
+
+    By providing it as a fn, it can be invoked inside the tf.while_loop such
+    that the input pipeline for multiple iterations can be executed by one
+    Session.run call.
+
+    Returns:
+      list of dict of ops.
+    """
+    with ops.device(device):
+      num_of_replicas_per_host = ctx.num_of_replicas_per_host
+      # Convert user input to features and labels. If the user returns a
+      # dataset, it is initialized and the features and labels are extracted
+      # via `dataset.iterator.get_next()`.
+      features, labels = inputs.features_and_labels()
+      signals = inputs.signals()
+
+      inputs_structure_recorder.validate_and_record_structure(features, labels)
+      unsharded_tensor_list = (
+          inputs_structure_recorder.flatten_features_and_labels(
+              features, labels, signals))
+
+      infeed_queue = tpu_feed.InfeedQueue(
+          tuple_types=[t.dtype for t in unsharded_tensor_list],
+          tuple_shapes=[t.shape for t in unsharded_tensor_list],
+          shard_dimensions=batch_axis)
+      captured_infeed_queue.capture(infeed_queue)
+      infeed_queue.set_number_of_shards(num_of_replicas_per_host)
+      per_host_enqueue_ops = (
+          infeed_queue.split_inputs_and_generate_enqueue_ops(
+              unsharded_tensor_list,
+              placement_function=lambda x: device,
+              tpu_ordinal_function=tpu_ordinal_function_impl))
+      if signals is None:
+        return per_host_enqueue_ops
+      else:
+        return {
+            'ops': per_host_enqueue_ops,
+            'signals': signals,
+        }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_per_host_v2_enqueue_ops_fn_for_host(
+    ctx, input_fn, inputs_structure_recorder, device, host_id):
+  """Generates infeed enqueue ops for per-host input_fn on a single host."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+
+  with ops.device(device):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device, invocation_index=host_id)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if not is_dataset:
+      raise TypeError('`input_fn` must return a `Dataset` for the PER_HOST_V2 '
+                      'input pipeline configuration.')
+
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True,
+          num_invocations_per_step=ctx.num_of_replicas_per_host)
+
+    dataset_initializer = inputs.dataset_initializer()
+    tpu_ordinal_function_impl = ctx.tpu_ordinal_function(host_id)
+
+  def enqueue_ops_fn():
+    """Generates the per_host enqueue ops."""
+    control_deps = []
+    per_host_sharded_inputs = []
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+    cached_signals = None
+    with ops.device(device):
+      if not inputs.is_dataset:
+        raise TypeError('`input_fn` must return a `Dataset` for this mode.')
+      for _ in range(num_replicas_per_host):
+        # Use control dependencies to ensure a deterministic ordering.
+        with ops.control_dependencies(control_deps):
+          features, labels = inputs.features_and_labels()  # Calls get_next()
+          signals = inputs.signals()
+
+          # All the replicas share the replica 0's stopping signal.
+          # This avoids inconsistent state among different model replicas.
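+          # (The first dequeued `signals` dict is cached below, and its
+          # 'stopping' entry is reused by every later replica on this host.)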
+          if cached_signals:
+            signals['stopping'] = cached_signals['stopping']
+          else:
+            cached_signals = signals
+
+        inputs_structure_recorder.validate_and_record_structure(
+            features, labels)
+        flattened_inputs = (
+            inputs_structure_recorder.flatten_features_and_labels(
+                features, labels, signals))
+        control_deps.extend(flattened_inputs)
+        per_host_sharded_inputs.append(flattened_inputs)
+
+      if inputs_structure_recorder.flattened_input_dims:
+        input_partition_dims = inputs_structure_recorder.flattened_input_dims
+        if signals:
+          input_partition_dims += [None] * len(signals)
+        # pylint: disable=protected-access
+        infeed_queue = tpu_feed._PartitionedInfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]),
+            host_id=host_id,
+            input_partition_dims=input_partition_dims,
+            device_assignment=ctx.device_assignment)
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs)
+      else:
+        infeed_queue = tpu_feed.InfeedQueue(
+            number_of_tuple_elements=len(per_host_sharded_inputs[0]))
+        per_host_enqueue_ops = infeed_queue.generate_enqueue_ops(
+            per_host_sharded_inputs,
+            tpu_ordinal_function=tpu_ordinal_function_impl)
+      captured_infeed_queue.capture(infeed_queue)
+
+    if signals is None:
+      return per_host_enqueue_ops
+    else:
+      return {
+          'ops': per_host_enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+def generate_broadcast_enqueue_ops_fn(ctx, input_fn, inputs_structure_recorder,
+                                      num_hosts):
+  """Generates infeed enqueue ops for one input_fn on all the hosts."""
+  captured_infeed_queue = _CapturedObject()
+  dataset_initializer = None
+  device_0 = ctx.tpu_host_placement_function(host_id=0)
+  with ops.device(device_0):
+    user_context = tpu_context.TPUContext(
+        internal_ctx=ctx, input_device=device_0, invocation_index=0)
+    inputs = _Inputs.from_input_fn(input_fn(user_context))
+
+    is_dataset = inputs.is_dataset
+    if ctx.mode == model_fn_lib.ModeKeys.PREDICT:
+      if not is_dataset:
+        raise TypeError(
+            'For mode PREDICT, `input_fn` must return `Dataset` instead of '
+            '`features` and `labels`.')
+
+      inputs = _InputsWithStoppingSignals(
+          dataset=inputs.dataset,
+          batch_size=ctx.batch_size_for_input_fn,
+          add_padding=True)
+
+    if is_dataset:
+      dataset_initializer = inputs.dataset_initializer()
+    num_replicas_per_host = ctx.num_of_replicas_per_host
+
+  def tpu_ordinal_function_impl(replica_id):
+    if ctx.device_assignment:
+      return ctx.device_assignment.tpu_ordinal(replica=replica_id)
+    else:
+      return replica_id % num_replicas_per_host
+
+  def device_function_impl(replica_id):
+    return ctx.tpu_host_placement_function(replica_id=replica_id)
+
+  def enqueue_ops_fn():
+    """Generates enqueue ops for all the hosts."""
+    broadcasted_inputs = []
+    flattened_inputs = None  # Cache result from input_fn.
+    signals = None
+    for host_id in xrange(num_hosts):
+      with ops.device(ctx.tpu_host_placement_function(host_id=host_id)):
+        for _ in xrange(ctx.num_of_replicas_per_host):
+          # Note: input_fn is only called once at host 0 for the first replica.
+          # The features and labels returned from that invocation are
+          # broadcasted to other replicas (including the replicas on other
+          # hosts).
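+          # (flattened_inputs doubles as the "already called" flag: it stays
+          # None until the single input_fn invocation below populates it.)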
+          if flattened_inputs is None:
+            features, labels = inputs.features_and_labels()  # Calls get_next()
+            signals = inputs.signals()
+
+            inputs_structure_recorder.validate_and_record_structure(
+                features, labels)
+            flattened_inputs = (
+                inputs_structure_recorder.flatten_features_and_labels(
+                    features, labels, signals))
+          broadcasted_inputs.append(flattened_inputs)
+
+    infeed_queue = tpu_feed.InfeedQueue(
+        number_of_tuple_elements=len(broadcasted_inputs[0]))
+    captured_infeed_queue.capture(infeed_queue)
+    enqueue_ops = infeed_queue.generate_enqueue_ops(
+        broadcasted_inputs,
+        tpu_ordinal_function=tpu_ordinal_function_impl,
+        placement_function=device_function_impl)
+
+    if signals is None:
+      return enqueue_ops
+    else:
+      return {
+          'ops': enqueue_ops,
+          'signals': signals,
+      }
+
+  return enqueue_ops_fn, captured_infeed_queue, dataset_initializer
+
+
+class _InputPipeline(object):
+  """`_InputPipeline` handles invoking `input_fn` and piping to infeed queue.
+
+  `_InputPipeline` abstracts the per-core/per-host `input_fn` invocation from
+  the call site. To be precise, based on the configuration in
+  `_InternalTPUContext`, it invokes `input_fn` for all cores (usually
+  multi-host TPU training) or for one host (usually for single-host TPU
+  evaluation), and sends all `features` and `labels` returned by `input_fn` to
+  TPU infeed. For per-core invocation, `features` and `labels` are piped to
+  infeed directly, one tuple for each core. For per-host invocation, `features`
+  and `labels` are split at host (with respect to `batch_axis`) and piped to all
+  cores accordingly.
+
+  In addition, flatten/unflatten are also handled by `_InputPipeline`. Model
+  inputs returned by the `input_fn` can have one of the following forms:
+  1. features
+  2. (features, labels)
+  3. ((arbitrarily nested structure of features), labels)
+
+  Internally, form 1 is reformed to `(features, None)` as features and labels
+  are passed separately to underlying methods. For TPU training, TPUEstimator
+  may expect multiple `features` and `labels` tuples, one for each core.
+
+  TPUEstimator allows various different structures for inputs (namely `features`
+  and `labels`). Both `features` and `labels` can be any nested structure
+  supported by TF nest (namely, dict, tuples, namedtuples or any nested
+  structure of such of Tensors). `labels` could be `None` as well.
+
+  These are flattened before they are passed to the infeed/outfeed library
+  as that expects flattened lists.
+  """
+
+  class InputsStructureRecorder(object):
+    """The recorder to record inputs structure."""
+
+    def __init__(self, input_partition_dims=None):
+      # Holds the structure of inputs
+      self._feature_structure = {}
+      self._flattened_input_dims = None
+
+      if input_partition_dims:
+        # This should have been validated in TPUConfig.
+        assert len(input_partition_dims) <= 2, 'must have 1 or 2 elements.'
+        if len(input_partition_dims) == 2:
+          self._feature_dims, self._label_dims = input_partition_dims
+        else:
+          self._feature_dims = input_partition_dims[0]
+          self._label_dims = None
+
+        assert self._feature_dims is not None, ('input_partition_dims[0] must '
+                                                'not be None')
+      else:
+        self._feature_dims = None
+        self._label_dims = None
+
+      # Internal state.
+      self._initialized = False
+
+    @property
+    def flattened_input_dims(self):
+      assert self._initialized, 'InputsStructureRecorder is not initialized.'
+ return self._flattened_input_dims + + def has_labels(self): + return 'labels' in self._feature_structure + + def _flatten_input_dims(self, feature_dims, feature_dims_names, label_dims, + label_dims_names, label_names, has_labels): + """Flatten input dims with the same order as flattened input tensors.""" + flattened_input_dims = [] + if feature_dims_names: + # We need a fixed ordering for matching the tensors in features. + flattened_input_dims.extend( + [feature_dims[name] for name in feature_dims_names]) + else: + flattened_input_dims.append(feature_dims) + + if label_dims_names: + # We need a fixed ordering for matching the tensors in labels. + flattened_input_dims.extend( + [label_dims[name] for name in label_dims_names]) + else: + if label_names: + num_tensors_in_label = len(label_names) + else: + num_tensors_in_label = int(has_labels) + # Setting `None` in input_partition_dims[1] will apply `None` to + # all the tensors in labels, regardless of internal structure. + flattened_input_dims.extend([label_dims] * num_tensors_in_label) + + return flattened_input_dims + + def validate_and_record_structure(self, features, labels): + """Validates and records the structure of `features` and `labels`.""" + # Extract structure. + has_labels = labels is not None + feature_names = _extract_key_names(features) + label_names = _extract_key_names(labels) + + if not self._initialized: + # Record structure. + self._initialized = True + if self._feature_dims is not None: + feature_dims_names = _extract_key_names(self._feature_dims) + if feature_dims_names != feature_names: + raise ValueError( + 'TPUConfig.input_partition_dims[0] mismatched feature' + ' keys. Expected {}, got {}'.format(feature_names, + feature_dims_names)) + + label_dims_names = _extract_key_names(self._label_dims) + if self._label_dims is not None and label_dims_names != label_names: + raise ValueError( + 'TPUConfig.input_partition_dims[1] mismatched label' + ' keys. Expected {}, got {}'.format(label_names, + label_dims_names)) + + self._flattened_input_dims = self._flatten_input_dims( + self._feature_dims, feature_dims_names, self._label_dims, + label_dims_names, label_names, has_labels) + + def flatten_features_and_labels(self, features, labels, signals=None): + """Flattens the `features` and `labels` to a single tensor list.""" + self._feature_structure['features'] = features + if labels is not None: + self._feature_structure['labels'] = labels + if signals is not None: + self._feature_structure['signals'] = signals + return data_nest.flatten(self._feature_structure) + + def unflatten_features_and_labels(self, flattened_inputs): + """Restores the flattened inputs to original features and labels form. + + Args: + flattened_inputs: Flattened inputs for each shard. + + Returns: + A tuple of (`features`, `labels`), where `labels` could be None. + Each one, if present, should have identical structure (single tensor vs + dict) as the one returned by input_fn. + + Raises: + ValueError: If the number of expected tensors from `flattened_inputs` + mismatches the recorded structure. + """ + + unflattened_inputs = data_nest.pack_sequence_as(self._feature_structure, + flattened_inputs) + return _Inputs( + unflattened_inputs['features'], + unflattened_inputs.get('labels'), + signals=unflattened_inputs.get('signals')) + + def __init__(self, input_fn, batch_axis, ctx): + """Constructor. + + Args: + input_fn: input fn for train or eval. 
+      batch_axis: A Python tuple of int values describing how each tensor
+        produced by the Estimator `input_fn` should be split across the TPU
+        compute shards.
+      ctx: A `_InternalTPUContext` instance with mode.
+
+    Raises:
+      ValueError: If both `sharded_features` and `num_cores` are `None`.
+    """
+    self._inputs_structure_recorder = _InputPipeline.InputsStructureRecorder(
+        ctx.input_partition_dims)
+
+    self._sharded_per_core = ctx.is_input_sharded_per_core()
+    self._input_fn = input_fn
+    self._infeed_queue = None
+    self._ctx = ctx
+    self._batch_axis = batch_axis
+
+  def generate_infeed_enqueue_ops_and_dequeue_fn(self):
+    """Generates infeed enqueue ops and dequeue_fn."""
+    # When tf.while_loop is called, the body function, which invokes
+    # `enqueue_fn` passed in, is called to construct the graph. So, the
+    # input_fn structure is recorded.
+    enqueue_ops, all_hooks, run_infeed_loop_on_coordinator = (
+        self._invoke_input_fn_and_record_structure())
+
+    self._validate_input_pipeline()
+
+    def dequeue_fn():
+      """dequeue_fn is used by TPU to retrieve the tensors."""
+      # In the model-parallel case, both the host-side and device-side
+      # computations must agree on the core on which infeed takes place. We
+      # choose to perform infeed on logical core 0 of each replica.
+      values = self._infeed_queue.generate_dequeue_op(tpu_device=0)
+      # The unflatten process uses the structure information recorded above.
+      return self._inputs_structure_recorder.unflatten_features_and_labels(
+          values)
+
+    return (enqueue_ops, dequeue_fn, all_hooks, run_infeed_loop_on_coordinator)
+
+  def _invoke_input_fn_and_record_structure(self):
+    """Deploys the input pipeline and records the input structure."""
+    enqueue_ops = []
+    infeed_queues = []
+    all_dataset_initializers = []
+    num_hosts = self._ctx.num_hosts
+    tpu_host_placement_fn = self._ctx.tpu_host_placement_function
+
+    run_infeed_loop_on_coordinator = True
+
+    if self._sharded_per_core:
+      # Per-Core input pipeline deployment.
+      # Invoke the input pipeline for each core and place it on the
+      # corresponding host.
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            enqueue_ops_fn, captured_infeed_queue = (
+                generate_per_core_enqueue_ops_fn_for_host(
+                    self._ctx, self._input_fn, self._inputs_structure_recorder,
+                    host_device, host_id))
+
+            if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+              run_infeed_loop_on_coordinator = False
+              enqueue_ops.append(
+                  _wrap_computation_in_while_loop(
+                      device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            # Infeed_queue_getter must be called after enqueue_ops_fn is called.
+            infeed_queues.append(captured_infeed_queue.get())
+
+    elif self._ctx.is_input_broadcast_with_iterators():
+      # Only calls input_fn in host 0.
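+      # (Broadcast mode: the single pipeline built on host 0 feeds every
+      # replica, so no per-host enqueue functions are generated here.)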
+      host_device = tpu_host_placement_fn(host_id=0)
+      enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+          generate_broadcast_enqueue_ops_fn(self._ctx, self._input_fn,
+                                            self._inputs_structure_recorder,
+                                            num_hosts))
+      if dataset_initializer:
+        all_dataset_initializers.append(dataset_initializer)
+        run_infeed_loop_on_coordinator = False
+        wrap_fn = (
+            _wrap_computation_in_while_loop
+            if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+            _wrap_computation_in_while_loop_with_stopping_signals)
+        enqueue_ops.append(wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+      else:
+        enqueue_ops.append(enqueue_ops_fn())
+      infeed_queues.append(captured_infeed_queue.get())
+    else:
+      for host_id in range(num_hosts):
+        host_device = tpu_host_placement_fn(host_id=host_id)
+        with ops.device(host_device):
+          with ops.name_scope('input_pipeline_task%d' % (host_id)):
+            if self._ctx.is_input_per_host_with_iterators():
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_v2_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, host_device, host_id))
+            else:
+              enqueue_ops_fn, captured_infeed_queue, dataset_initializer = (
+                  generate_per_host_enqueue_ops_fn_for_host(
+                      self._ctx, self._input_fn,
+                      self._inputs_structure_recorder, self._batch_axis,
+                      host_device, host_id))
+
+            # NOTE(xiejw): We dispatch here based on the return type of the
+            # user's `input_fn`.
+            #
+            # 1. If input_fn returns a Dataset instance, we initialize the
+            # iterator outside of tf.while_loop, and call the iterator.get_next
+            # inside tf.while_loop. This should be always safe.
+            #
+            # 2. If input_fn returns (features, labels), it is too late to wrap
+            # them inside tf.while_loop, as resource initialization cannot be
+            # handled in TF control flow properly. In this case, we will use a
+            # python loop to enqueue the data into the TPU system. This may be
+            # slow compared to the previous case.
+            if dataset_initializer:
+              all_dataset_initializers.append(dataset_initializer)
+              run_infeed_loop_on_coordinator = False
+              wrap_fn = (
+                  _wrap_computation_in_while_loop
+                  if self._ctx.mode != model_fn_lib.ModeKeys.PREDICT else
+                  _wrap_computation_in_while_loop_with_stopping_signals)
+              enqueue_ops.append(
+                  wrap_fn(device=host_device, op_fn=enqueue_ops_fn))
+            else:
+              enqueue_ops.append(enqueue_ops_fn())
+            infeed_queues.append(captured_infeed_queue.get())
+    # infeed_queue is used to generate dequeue ops. The only thing it uses for
+    # dequeue is dtypes and shapes. So, any one can be used. Here, grab the
+    # first one.
+    self._infeed_queue = infeed_queues[0]
+    return enqueue_ops, [
+        util_lib.MultiHostDatasetInitializerHook(all_dataset_initializers)
+    ], run_infeed_loop_on_coordinator
+
+  def _validate_input_pipeline(self):
+    """Validates the input pipeline.
+
+    Performs some sanity checks to log user friendly information. We should
+    error out to give users a better error message. But, if
+    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    user code, so, log a warning.
+
+    Raises:
+      RuntimeError: If the validation failed.
+    """
+    if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
+      err_msg = ('Input pipeline contains one or more QueueRunners. '
+                 'It could be slow and not scalable. Please consider '
+                 'converting your input pipeline to use `tf.data` instead (see '
+                 'https://www.tensorflow.org/guide/datasets for '
+                 'instructions).')
+      if _WRAP_INPUT_FN_INTO_WHILE_LOOP:
+        raise RuntimeError(err_msg)
+      else:
+        logging.warn(err_msg)
+
+
+class _ModelFnWrapper(object):
+  """A `model_fn` wrapper.
+
+  This makes calling model_fn on CPU and TPU easier and more consistent, and
+  performs the necessary checks and mutations required by TPU training and
+  evaluation.
+
+  In addition, this wrapper manages converting the `model_fn` to a single TPU
+  train and eval step.
+  """
+
+  def __init__(self, model_fn, train_cache_fn, eval_cache_fn, config, params,
+               ctx):
+    self._model_fn = model_fn
+    self._train_cache_fn = train_cache_fn
+    self._eval_cache_fn = eval_cache_fn
+    self._config = config
+    self._params = params
+    self._ctx = ctx
+
+  def call_without_tpu(self, features, labels, is_export_mode):
+    return self._call_model_fn(features, labels, is_export_mode=is_export_mode)
+
+  def convert_to_single_tpu_train_step(self, dequeue_fn):
+    """Converts the user-provided `model_fn` to a single train step on TPU.
+
+    The user-provided `model_fn` takes an input tuple (features, labels) and
+    produces the EstimatorSpec with train_op and loss for train `mode`. This
+    usually represents a single train computation on CPU.
+
+    For TPU training, a train (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides, the input should be taken from TPU infeed rather
+    than the input pipeline (input_fn) directly. To fit the TPU loop and
+    replicate pattern, the original train computation should be reformed, which
+    is the returned `train_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from
+        TPU infeed dequeue channel.
+
+    Returns:
+      A tuple of train_fn, host_calls, and captured scaffold_fn. The train_fn
+      represents the train step for TPU.
+    """
+
+    host_call = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_training_hooks = _CapturedObject()
+
+    def train_step(loss, *cache):
+      """Training step function for use inside a while loop."""
+      del loss  # unused; required in function signature.
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+
+      # Consume the current cache
+      estimator_spec = self._verify_estimator_spec(
+          self._call_model_fn(features, labels, cache=cache))
+
+      # Retrieve the newly returned cache; `cache` consists of a list of
+      # tensors, potentially empty (of length 0).
+      cache = estimator_spec.cache
+      loss, train_op = estimator_spec.loss, estimator_spec.train_op
+
+      if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        captured_scaffold_fn.capture(estimator_spec.scaffold_fn)
+      else:
+        captured_scaffold_fn.capture(None)
+
+      captured_training_hooks.capture(estimator_spec.training_hooks)
+
+      tracing_ops = []
+      if tensor_tracer.TensorTracer.is_enabled():
+        tt = tensor_tracer.TensorTracer()
+        loss, tracing_ops = tt.trace_tpu(ops.get_default_graph(), loss,
+                                         self._ctx.num_replicas)
+
+      # We must run train_op to update the variables prior to running the
+      # outfeed.
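+      # (The control dependency below enforces exactly that ordering: the
+      # host-call enqueue runs only after this step's variable updates.)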
+      with ops.control_dependencies([train_op] + tracing_ops):
+        host_call_outfeed_ops = []
+        if (isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)  # pylint: disable=protected-access
+            and estimator_spec.host_call is not None):
+          host_call.record({'host_call': estimator_spec.host_call})
+          host_call_outfeed_ops = host_call.create_enqueue_op()
+        with ops.control_dependencies(host_call_outfeed_ops):
+          return [array_ops.identity(loss)] + cache
+
+    return (train_step, host_call, captured_scaffold_fn,
+            captured_training_hooks)
+
+  def convert_to_single_tpu_eval_step(self, dequeue_fn):
+    """Converts the user-provided `model_fn` to a single eval step on TPU.
+
+    Similar to training, the user-provided `model_fn` takes an input tuple
+    (features, labels) and produces the TPUEstimatorSpec with eval_metrics for
+    eval `mode`. This usually represents a single evaluation computation on
+    CPU.
+
+    For TPU evaluation, an eval (computation) step is first wrapped in a
+    tf.while_loop control flow to repeat for many times and then replicated to
+    all TPU shards. Besides, the input and output are slightly different.
+    Input, features and labels, should be taken from TPU infeed rather than
+    the input pipeline (input_fn) directly. Output is managed in two stages.
+    First, the model outputs as the result of evaluation computation, usually
+    model logits, should be transferred from TPU system to CPU. Then, all model
+    outputs are concatenated first on CPU and sent to the metric_fn for metrics
+    computation. To fit the TPU evaluation pattern, the original eval
+    computation should be reformed, which is the returned `eval_step`.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from
+        TPU infeed dequeue channel.
+
+    Returns:
+      A tuple of eval_fn, host_calls, and captured scaffold_fn. The eval_fn
+      represents the eval step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_eval_hooks = _CapturedObject()
+
+    def eval_step(total_loss, *cache):
+      """Evaluation step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+
+      # Consume the current cache
+      tpu_estimator_spec = self._call_model_fn(features, labels, cache=cache)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU evaluation must have type '
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      # Retrieve the newly returned cache
+      cache = tpu_estimator_spec.cache
+      loss = tpu_estimator_spec.loss
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_eval_hooks.capture(tpu_estimator_spec.evaluation_hooks)
+
+      to_record = {}
+      if tpu_estimator_spec.eval_metrics:
+        to_record['eval_metrics'] = tpu_estimator_spec.eval_metrics
+      if tpu_estimator_spec.host_call is not None:
+        # We assume that evaluate won't update the global step, so we don't
+        # wrap this host_call.
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return [math_ops.add(total_loss, loss)] + cache
+
+    return eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks
+
+  def convert_to_single_tpu_predict_step(self, dequeue_fn):
+    """Converts the user-provided `model_fn` to a single predict step on TPU.
+
+    Args:
+      dequeue_fn: The function to retrieve inputs, features and labels, from
+        TPU infeed dequeue channel.
+
+    Returns:
+      A tuple of predict_fn, host_calls, and captured scaffold_fn. The
+      predict_fn represents the predict step for TPU.
+    """
+    host_calls = _OutfeedHostCall(self._ctx)
+    captured_scaffold_fn = _CapturedObject()
+    captured_predict_hooks = _CapturedObject()
+
+    def predict_step(unused_scalar_stopping_signal):
+      """Prediction step function for use inside a while loop."""
+      inputs = dequeue_fn()
+      features, labels = inputs.features_and_labels()
+      stopping_signals = inputs.signals()
+
+      assert stopping_signals is not None, (
+          'Internal Error: `signals` is missing.')
+
+      tpu_estimator_spec = self._call_model_fn(
+          features, labels, is_export_mode=False)
+      if not isinstance(tpu_estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+        raise RuntimeError(
+            'estimator_spec used by TPU prediction must have type '
+            '`TPUEstimatorSpec`. Got {}'.format(type(tpu_estimator_spec)))
+
+      self._verify_tpu_spec_predictions(tpu_estimator_spec.predictions)
+
+      captured_scaffold_fn.capture(tpu_estimator_spec.scaffold_fn)
+      captured_predict_hooks.capture(tpu_estimator_spec.prediction_hooks)
+      to_record = {}
+      identity_fn = lambda **kwargs: kwargs
+      to_record['predictions'] = [identity_fn, tpu_estimator_spec.predictions]
+      to_record['signals'] = [identity_fn, stopping_signals]
+      if tpu_estimator_spec.host_call is not None:
+        to_record['host_call'] = tpu_estimator_spec.host_call
+      host_calls.record(to_record)
+
+      with ops.control_dependencies(host_calls.create_enqueue_op()):
+        return _StopSignals.as_scalar_stopping_signal(stopping_signals)
+
+    return (predict_step, host_calls, captured_scaffold_fn,
+            captured_predict_hooks)
+
+  def _verify_tpu_spec_predictions(self, predictions):
+    """Validates TPUEstimatorSpec.predictions dict."""
+    # TODO(xiejw): Add validation for the prediction dictionary.
+    # TODO(xiejw): Add support for a single tensor as predictions.
+    if not isinstance(predictions, dict):
+      raise TypeError('TPUEstimatorSpec.predictions must be dict of Tensors.')
+
+    for (key, tensor) in predictions.items():
+      if tensor.shape.dims[0].value is None:
+        raise ValueError(
+            'The tensor with key ({}) in TPUEstimatorSpec.predictions has '
+            'dynamic shape (should be static). Tensor: {}'.format(key, tensor))
+    return predictions
+
+  def _validate_model_features_and_labels(self, features, labels,
+                                          is_export_mode):
+    """Validates that the features and labels for the model function are valid.
+
+    A valid features/labels object is the one with:
+    - Type: A tensor or any nested structure of tensors supported by TF nest,
+      namely nested dictionary, tuple, namedtuple, or sequence of tensors.
+    - Static shape if is_export_mode is False.
+
+    Args:
+      features: the features that would be input to the model function.
+      labels: the labels that would be input to the model function.
+      is_export_mode: boolean value specifying if in export mode.
+
+    Raises:
+      TypeError: If features/labels are not of the correct type.
+      ValueError: If features/labels have dynamic shape.
+    """
+
+    def validate(obj, obj_name):
+      """Helper validate function."""
+      if is_export_mode or self._ctx.is_running_on_cpu(is_export_mode):
+        return
+      if isinstance(obj, ops.Tensor):
+        if not obj.get_shape().is_fully_defined():
+          raise ValueError(
+              'The {} to the model returned by input_fn must have static shape.'
+              ' Tensor: {}'.format(obj_name, obj))
+      else:
+        for tensor in data_nest.flatten(obj):
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                ('The {} to the model returned by input_fn must have static '
+                 'shape. Tensor: {}').format(obj_name, tensor))
+
+    validate(features, 'features')
+    if labels is not None:
+      validate(labels, 'labels')
+
+  def _call_model_fn(self, features, labels, cache=None, is_export_mode=False):
+    """Calls the model_fn with required parameters."""
+    self._validate_model_features_and_labels(features, labels, is_export_mode)
+    model_fn_args = function_utils.fn_args(self._model_fn)
+    kwargs = {}
+
+    # Makes deep copies of `config` and `params` in case the user mutates them.
+    config = copy.deepcopy(self._config)
+    params = copy.deepcopy(self._params)
+
+    if 'labels' in model_fn_args:
+      kwargs['labels'] = labels
+    elif labels is not None:
+      raise ValueError(
+          'model_fn does not take labels, but input_fn returns labels.')
+    if 'mode' in model_fn_args:
+      kwargs['mode'] = self._ctx.mode
+    if 'config' in model_fn_args:
+      kwargs['config'] = config
+    if 'params' in model_fn_args:
+      kwargs['params'] = params
+
+    if cache is not None:
+      params['cache'] = cache
+
+    if 'params' not in model_fn_args:
+      raise ValueError('model_fn ({}) does not include params argument, '
+                       'required by TPUEstimator to pass batch size as '
+                       'params[\'batch_size\']'.format(self._model_fn))
+
+    if is_export_mode:
+      batch_size_for_model_fn = None
+    else:
+      batch_size_for_model_fn = self._ctx.batch_size_for_model_fn
+
+    if batch_size_for_model_fn is not None:
+      _add_item_to_params(params, _BATCH_SIZE_KEY, batch_size_for_model_fn)
+
+    running_on_cpu = self._ctx.is_running_on_cpu(is_export_mode)
+    _add_item_to_params(params, _USE_TPU_KEY, not running_on_cpu)
+
+    if not running_on_cpu:
+      user_context = tpu_context.TPUContext(
+          internal_ctx=self._ctx, call_from_input_fn=False)
+      _add_item_to_params(params, _CTX_KEY, user_context)
+
+    estimator_spec = self._model_fn(features=features, **kwargs)
+    if (running_on_cpu and
+        isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec)):  # pylint: disable=protected-access
+      # The estimator_spec will be passed to `Estimator` directly, which
+      # expects type `EstimatorSpec`.
+      return estimator_spec.as_estimator_spec()
+    else:
+      return estimator_spec
+
+  def _verify_estimator_spec(self, estimator_spec):
+    """Validates the estimator_spec."""
+    if isinstance(estimator_spec, model_fn_lib._TPUEstimatorSpec):  # pylint: disable=protected-access
+      return estimator_spec
+
+    err_msg = '{} returned by EstimatorSpec is not supported in TPUEstimator.'
+    if estimator_spec.training_chief_hooks:
+      raise ValueError(
+          err_msg.format('training_chief_hooks') + ' If you want'
+          ' to pass training hooks, please pass via training_hooks.')
+
+    if estimator_spec.scaffold:
+      logging.warning('EstimatorSpec.Scaffold is ignored by TPU train/eval. '
+                      'Please use TPUEstimatorSpec.')
+    return estimator_spec
+
+
+class _OutfeedHostCall(object):
+  """Support for `eval_metrics` and `host_call` in TPUEstimatorSpec."""
+
+  def __init__(self, ctx):
+    self._ctx = ctx
+    self._names = []
+    # All of these are dictionaries of lists keyed on the name.
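+    # (For example, after record() is called for 'eval_metrics',
+    # self._tensors['eval_metrics'] holds that call's flattened tensor list.)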
+    self._host_fns = {}
+    self._tensor_keys = collections.defaultdict(list)
+    self._tensors = collections.defaultdict(list)
+    self._tensor_dtypes = collections.defaultdict(list)
+    self._tensor_shapes = collections.defaultdict(list)
+
+  @staticmethod
+  def validate(host_calls):
+    """Validates the `eval_metrics` and `host_call` in `TPUEstimatorSpec`."""
+
+    for name, host_call in host_calls.items():
+      if not isinstance(host_call, (tuple, list)):
+        raise ValueError('{} should be tuple or list'.format(name))
+      if len(host_call) != 2:
+        raise ValueError('{} should have two elements.'.format(name))
+      if not callable(host_call[0]):
+        raise TypeError('{}[0] should be callable.'.format(name))
+      if not isinstance(host_call[1], (tuple, list, dict)):
+        raise ValueError('{}[1] should be a tuple, list, or dict.'.format(name))
+
+      if isinstance(host_call[1], (tuple, list)):
+        fullargspec = tf_inspect.getfullargspec(host_call[0])
+        fn_args = function_utils.fn_args(host_call[0])
+        # wrapped_hostcall_with_global_step uses varargs, so we allow that.
+        if fullargspec.varargs is None and len(host_call[1]) != len(fn_args):
+          raise RuntimeError(
+              'In TPUEstimatorSpec.{}, length of tensors {} does not match '
+              'method args of the function, which takes {}.'.format(
+                  name, len(host_call[1]), len(fn_args)))
+
+  @staticmethod
+  def create_cpu_hostcall(host_calls):
+    """Runs the host_call on CPU instead of TPU when use_tpu=False."""
+
+    _OutfeedHostCall.validate(host_calls)
+    ret = {}
+    for name, host_call in host_calls.items():
+      host_fn, tensors = host_call
+      if isinstance(tensors, (tuple, list)):
+        ret[name] = host_fn(*tensors)
+      else:
+        # Must be dict.
+        try:
+          ret[name] = host_fn(**tensors)
+        except TypeError as e:
+          logging.warning(
+              'Exception while calling %s: %s. It is likely the tensors '
+              '(%s[1]) do not match the '
+              'function\'s arguments', name, e, name)
+          raise e
+    return ret
+
+  def record(self, host_calls):
+    """Records the host_call structure."""
+
+    for name, host_call in host_calls.items():
+      host_fn, tensor_list_or_dict = host_call
+      self._names.append(name)
+      self._host_fns[name] = host_fn
+
+      if isinstance(tensor_list_or_dict, dict):
+        for (key, tensor) in six.iteritems(tensor_list_or_dict):
+          self._tensor_keys[name].append(key)
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+      else:
+        # List or tuple.
+        self._tensor_keys[name] = None
+        for tensor in tensor_list_or_dict:
+          self._tensors[name].append(tensor)
+          self._tensor_dtypes[name].append(tensor.dtype)
+          self._tensor_shapes[name].append(tensor.shape)
+
+  def create_enqueue_op(self):
+    """Create the op to enqueue the recorded host_calls.
+
+    Returns:
+      A list of enqueue ops, which is empty if there are no host calls.
+    """
+    if not self._names:
+      return []
+
+    tensors = []
+    # TODO(jhseu): Consider deduping tensors.
+    for name in self._names:
+      tensors.extend(self._tensors[name])
+
+    with ops.device(tpu.core(0)):
+      return [tpu_ops.outfeed_enqueue_tuple(tensors)]
+
+  def create_tpu_hostcall(self):
+    """Sends the tensors through outfeed and runs the host_fn on CPU.
+
+    The tensors are concatenated along dimension 0 to form a global tensor
+    across all shards. The concatenated tensors are passed to the host_fn,
+    which is executed on the first host.
+
+    Returns:
+      A dictionary mapping name to the return type of the host_call by that
+      name.
+
+    Raises:
+      RuntimeError: If outfeed tensor is scalar.
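+
+    For example (illustrative shapes): with 8 replicas each outfeeding a
+    [16, 10] logits tensor, the host_fn receives one [128, 10] tensor after
+    the per-replica dequeues are concatenated along dimension 0.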
+ """ + if not self._names: + return {} + + ret = {} + # For each i, dequeue_ops[i] is a list containing the tensors from all + # shards. This list is concatenated later. + dequeue_ops = [] + tensor_dtypes = [] + tensor_shapes = [] + for name in self._names: + for _ in self._tensors[name]: + dequeue_ops.append([]) + for dtype in self._tensor_dtypes[name]: + tensor_dtypes.append(dtype) + for shape in self._tensor_shapes[name]: + tensor_shapes.append(shape) + + # Outfeed ops execute on each replica's first logical core. Note: we must + # constraint it such that we have at most one outfeed dequeue and enqueue + # per replica. + for i in xrange(self._ctx.num_replicas): + host_device, ordinal_id = self._ctx.device_for_replica(i) + with ops.device(host_device): + outfeed_tensors = tpu_ops.outfeed_dequeue_tuple( + dtypes=tensor_dtypes, + shapes=tensor_shapes, + device_ordinal=ordinal_id) + for j, item in enumerate(outfeed_tensors): + dequeue_ops[j].append(item) + + # Deconstruct dequeue ops. + dequeue_ops_by_name = {} + pos = 0 + for name in self._names: + dequeue_ops_by_name[name] = dequeue_ops[pos:pos + + len(self._tensors[name])] + pos += len(self._tensors[name]) + + # It is assumed evaluation always happens on single host TPU system. So, + # place all ops on tpu host if possible. + # + # TODO(jhseu): Evaluate whether this is right for summaries. + with ops.device(self._ctx.tpu_host_placement_function(replica_id=0)): + for name in self._names: + dequeue_ops = dequeue_ops_by_name[name] + for i, item in enumerate(dequeue_ops): + if dequeue_ops[i][0].shape.ndims == 0: + raise RuntimeError( + 'All tensors outfed from TPU should preserve batch size ' + 'dimension, but got scalar {}'.format(dequeue_ops[i][0])) + # TODO(xiejw): Allow users to specify the axis for batch size + # dimension. + dequeue_ops[i] = array_ops.concat(dequeue_ops[i], axis=0) + + if self._tensor_keys[name] is not None: + # The user-provided eval_metrics[1] is a dict. + dequeue_ops = dict(zip(self._tensor_keys[name], dequeue_ops)) + try: + ret[name] = self._host_fns[name](**dequeue_ops) + except TypeError as e: + logging.warning( + 'Exception while calling %s: %s. It is likely the tensors ' + '(%s[1]) do not match the ' + 'function\'s arguments', name, e, name) + raise e + else: + ret[name] = self._host_fns[name](*dequeue_ops) + + return ret + + +class _OutfeedHostCallHook(session_run_hook.SessionRunHook): + """Hook to run host calls when use_tpu=False.""" + + def __init__(self, tensors): + self._tensors = tensors + + def begin(self): + # We duplicate this code from the TPUInfeedOutfeedSessionHook rather than + # create a separate hook to guarantee execution order, because summaries + # need to be initialized before the outfeed thread starts. + # TODO(jhseu): Make a wrapper hook instead? + self._init_ops = contrib_summary.summary_writer_initializer_op() + # Get all the writer resources from the initializer, so we know what to + # flush. 
+ self._finalize_ops = [] + for op in self._init_ops: + self._finalize_ops.append(contrib_summary.flush(writer=op.inputs[0])) + + def after_create_session(self, session, coord): + session.run(self._init_ops) + + def before_run(self, run_context): + return basic_session_run_hooks.SessionRunArgs(self._tensors) + + def end(self, session): + session.run(self._finalize_ops) + + +class ExamplesPerSecondHook(basic_session_run_hooks.StepCounterHook): + """Calculate and report global_step/sec and examples/sec during runtime.""" + + def __init__(self, + batch_size, + every_n_steps=100, + every_n_secs=None, + output_dir=None, + summary_writer=None): + self._batch_size = batch_size + super(ExamplesPerSecondHook, self).__init__( + every_n_steps=every_n_steps, + every_n_secs=every_n_secs, + output_dir=output_dir, + summary_writer=summary_writer) + + def _log_and_record(self, elapsed_steps, elapsed_time, global_step): + global_step_per_sec = elapsed_steps / elapsed_time + examples_per_sec = self._batch_size * global_step_per_sec + if self._summary_writer is not None: + global_step_summary = Summary(value=[ + Summary.Value(tag='global_step/sec', simple_value=global_step_per_sec) + ]) + example_summary = Summary(value=[ + Summary.Value(tag='examples/sec', simple_value=examples_per_sec) + ]) + self._summary_writer.add_summary(global_step_summary, global_step) + self._summary_writer.add_summary(example_summary, global_step) + logging.info('global_step/sec: %g', global_step_per_sec) + logging.info('examples/sec: %g', examples_per_sec) + + +class InstallSignalHandlerHook(session_run_hook.SessionRunHook): + """Change SIGINT (CTRL^C) handler to force quit the process. + + The default behavior often results in hanging processes. + The original handler is restored after training/evaluation. + """ + + def __init__(self): + self._signal_fn = signal.getsignal(signal.SIGINT) + + def before_run(self, run_context): + signal.signal(signal.SIGINT, signal.SIG_DFL) + + def end(self, session): + signal.signal(signal.SIGINT, self._signal_fn) + + +class TPUEstimator(estimator_lib.Estimator): + """Estimator with TPU support. + + TPUEstimator also supports training on CPU and GPU. You don't need to define + a separate `tf.estimator.Estimator`. + + TPUEstimator handles many of the details of running on TPU devices, such as + replicating inputs and models for each core, and returning to host + periodically to run hooks. + + TPUEstimator transforms a global batch size in params to a per-shard batch + size when calling the `input_fn` and `model_fn`. Users should specify + global batch size in constructor, and then get the batch size for each shard + in `input_fn` and `model_fn` by `params['batch_size']`. + + - For training, `model_fn` gets per-core batch size; `input_fn` may get + per-core or per-host batch size depending on `per_host_input_for_training` + in `TPUConfig` (See docstring for TPUConfig for details). + + - For evaluation and prediction, `model_fn` gets per-core batch size and + `input_fn` get per-host batch size. + + Evaluation + ========== + + `model_fn` should return `TPUEstimatorSpec`, which expects the `eval_metrics` + for TPU evaluation. However, if eval_on_tpu is False, `model_fn` must return + `EstimatorSpec` and the evaluation will execute on CPU or GPU; in this case + the following discussion on TPU evaluation does not apply. + + `TPUEstimatorSpec.eval_metrics` is a tuple of `metric_fn` and `tensors`, where + `tensors` could be a list of any nested structure of `Tensor`s (See + `TPUEstimatorSpec` for details). 
`metric_fn` takes the `tensors` and returns
+  a dict from metric string name to the result of calling a metric function,
+  namely a `(metric_tensor, update_op)` tuple.
+
+  One can set `use_tpu` to `False` for testing. All training, evaluation, and
+  predict will be executed on CPU. `input_fn` and `model_fn` will receive
+  `train_batch_size` or `eval_batch_size` unmodified as `params['batch_size']`.
+
+  Current limitations:
+  --------------------
+
+  1. TPU evaluation only works on a single host (one TPU worker) except
+     BROADCAST mode.
+
+  2. `input_fn` for evaluation should **NOT** raise an end-of-input exception
+     (`OutOfRangeError` or `StopIteration`). And all evaluation steps and all
+     batches should have the same size.
+
+  Example (MNIST):
+  ----------------
+
+  ```
+  # The metric fn which runs on CPU.
+  def metric_fn(labels, logits):
+    predictions = tf.argmax(logits, 1)
+    return {
+      'accuracy': tf.metrics.accuracy(
+          labels=labels, predictions=predictions),
+    }
+
+  # Your model fn which runs on TPU (eval_metrics is a list in this example).
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, [labels, logits]))
+
+  # Or specify the eval_metrics tensors as a dict.
+  def model_fn(features, labels, mode, config, params):
+    ...
+    final_layer_output = ...
+
+    if mode == tf.estimator.ModeKeys.EVAL:
+      return tpu_estimator.TPUEstimatorSpec(
+          mode=mode,
+          loss=loss,
+          eval_metrics=(metric_fn, {
+              'labels': labels,
+              'logits': final_layer_output,
+          }))
+  ```
+
+  Prediction
+  ==========
+
+  Prediction on TPU is an experimental feature to support large batch
+  inference. It is not designed for latency-critical systems. In addition,
+  due to some usability issues, for prediction with a small dataset, CPU
+  `.predict`, i.e., creating a new `TPUEstimator` instance with
+  `use_tpu=False`, might be more convenient.
+
+  Note: In contrast to TPU training/evaluation, the `input_fn` for prediction
+  *should* raise an end-of-input exception (`OutOfRangeError` or
+  `StopIteration`), which serves as the stopping signal to `TPUEstimator`. To
+  be precise, the ops created by `input_fn` produce one batch of data.
+  The `predict()` API processes one batch at a time. When reaching the end of
+  the data source, an end-of-input exception should be raised by one of these
+  operations. The user usually does not need to do this manually. As long as
+  the dataset is not repeated forever, the `tf.data` API will raise an
+  end-of-input exception automatically after the last batch has been produced.
+
+  Note: Estimator.predict returns a Python generator. Please consume all the
+  data from the generator so that TPUEstimator can shut down the TPU system
+  properly for the user.
+
+  Current limitations:
+  --------------------
+  1. TPU prediction only works on a single host (one TPU worker).
+
+  2. `input_fn` must return a `Dataset` instance rather than `features`. In
+  fact, .train() and .evaluate() also support Dataset as a return value.
+
+  Example (MNIST):
+  ----------------
+  ```
+  height = 32
+  width = 32
+  total_examples = 100
+
+  def predict_input_fn(params):
+    batch_size = params['batch_size']
+
+    images = tf.random_uniform(
+        [total_examples, height, width, 3], minval=-1, maxval=1)
+
+    dataset = tf.data.Dataset.from_tensor_slices(images)
+    dataset = dataset.map(lambda images: {'image': images})
+
+    dataset = dataset.batch(batch_size)
+    return dataset
+
+  def model_fn(features, labels, params, mode):
+    # Generate predictions, called 'output', from features['image']
+
+    if mode == tf.estimator.ModeKeys.PREDICT:
+      return tf.contrib.tpu.TPUEstimatorSpec(
+          mode=mode,
+          predictions={
+              'predictions': output,
+              'is_padding': features['is_padding']
+          })
+
+  tpu_est = TPUEstimator(
+      model_fn=model_fn,
+      ...,
+      predict_batch_size=16)
+
+  # Fully consume the generator so that TPUEstimator can shut down the TPU
+  # system.
+  for item in tpu_est.predict(input_fn=predict_input_fn):
+    # Filter out item if the `is_padding` is 1.
+    # Process the 'predictions'
+  ```
+
+  Exporting
+  =========
+
+  `export_savedmodel` exports two metagraphs: one with `tag_constants.SERVING`,
+  and another with both `tag_constants.SERVING` and `tag_constants.TPU`.
+  At serving time, these tags are used to select the metagraph to load.
+
+  Before running the graph on TPU, the TPU system needs to be initialized. If
+  the TensorFlow Serving model-server is used, this is done automatically. If
+  not, please call `session.run(tpu.initialize_system())`.
+
+  `tpu.outside_compilation` can be used to wrap TPU-incompatible ops in
+  `model_fn`.
+
+  Example:
+  ----------------
+
+  ```
+  def model_fn(features, labels, mode, config, params):
+    ...
+    logits = ...
+    export_outputs = {
+        'logits': export_output_lib.PredictOutput(
+            {'logits': logits})
+    }
+
+    def host_call(logits):
+      class_ids = math_ops.argmax(logits)
+      classes = string_ops.as_string(class_ids)
+      export_outputs['classes'] = (
+          export_output_lib.ClassificationOutput(classes=classes))
+
+    tpu.outside_compilation(host_call, logits)
+
+    ...
+  ```
+
+  """
+
+  def __init__(self,
+               model_fn=None,
+               train_cache_fn=None,
+               eval_cache_fn=None,
+               model_dir=None,
+               config=None,
+               params=None,
+               use_tpu=True,
+               train_batch_size=None,
+               eval_batch_size=None,
+               predict_batch_size=None,
+               batch_axis=None,
+               eval_on_tpu=True,
+               export_to_tpu=True,
+               warm_start_from=None):
+    """Constructs a `TPUEstimator` instance.
+
+    Args:
+      model_fn: Model function as required by `Estimator` which returns
+        EstimatorSpec or TPUEstimatorSpec. `training_hooks`,
+        `evaluation_hooks`, and `prediction_hooks` must not capture any TPU
+        Tensor inside the model_fn.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model. If `None`, the
+        model_dir in `config` will be used if set. If both are set, they must
+        be the same. If both are `None`, a temporary directory will be used.
+      config: A `tpu_config.RunConfig` configuration object. Cannot be `None`.
+      params: An optional `dict` of hyper parameters that will be passed into
+        `input_fn` and `model_fn`. Keys are names of parameters, values are
+        basic python types. There are reserved keys for `TPUEstimator`,
+        including 'batch_size'.
+      use_tpu: A bool indicating whether TPU support is enabled. Currently,
+        - TPU training and evaluation respect this bit, but eval_on_tpu can
+          override execution of eval. See below.
+        - Predict still happens on CPU.
+ train_batch_size: An int representing the global training batch size. + TPUEstimator transforms this global batch size to a per-shard batch + size, as params['batch_size'], when calling `input_fn` and `model_fn`. + Cannot be `None` if `use_tpu` is `True`. Must be divisible by total + number of replicas. + eval_batch_size: An int representing evaluation batch size. Must be + divisible by total number of replicas. + predict_batch_size: An int representing the prediction batch size. Must be + divisible by total number of replicas. + batch_axis: A python tuple of int values describing how each tensor + produced by the Estimator `input_fn` should be split across the TPU + compute shards. For example, if your input_fn produced (images, labels) + where the images tensor is in `HWCN` format, your shard dimensions would + be [3, 0], where 3 corresponds to the `N` dimension of your images + Tensor, and 0 corresponds to the dimension along which to split the + labels to match up with the corresponding images. If None is supplied, + and per_host_input_for_training is True, batches will be sharded based + on the major dimension. If tpu_config.per_host_input_for_training is + False or `PER_HOST_V2`, batch_axis is ignored. + eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the + model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`. + export_to_tpu: If True, `export_savedmodel()` exports a metagraph for + serving on TPU besides the one on CPU. + warm_start_from: Optional string filepath to a checkpoint or SavedModel to + warm-start from, or a `tf.estimator.WarmStartSettings` object to fully + configure warm-starting. If the string filepath is provided instead of + a `WarmStartSettings`, then all variables are warm-started, and it is + assumed that vocabularies and Tensor names are unchanged. + + Raises: + ValueError: `params` has reserved keys already. + """ + if config is None or not isinstance(config, tpu_config.RunConfig): + raise ValueError( + '`config` must be provided with type `tpu_config.RunConfig`') + + if params is not None and any(k in params for k in _RESERVED_PARAMS_KEYS): + raise ValueError('{} are reserved keys but existed in params {}.'.format( + _RESERVED_PARAMS_KEYS, params)) + + if use_tpu: + # Perform some very basic validations. More validations will be found in + # _InternalTPUContext. + if train_batch_size is None: + raise ValueError('`train_batch_size` cannot be `None`') + util_lib.check_positive_integer(train_batch_size, 'train_batch_size') + + if (config.tpu_config.per_host_input_for_training is + tpu_config.InputPipelineConfig.PER_SHARD_V1 and + config.tpu_config.num_cores_per_replica): + raise ValueError( + 'Model parallelism only supports per host input for training. ' + 'Please adjust TPURunconfig.per_host_input_for_training.') + + if eval_batch_size is not None: + util_lib.check_positive_integer(eval_batch_size, 'eval_batch_size') + + if predict_batch_size is not None: + util_lib.check_positive_integer(predict_batch_size, + 'predict_batch_size') + + # Verifies the model_fn signature according to Estimator framework. + estimator_lib._verify_model_fn_args(model_fn, params) # pylint: disable=protected-access + # We cannot store config and params in this constructor as parent + # constructor might change them, such as assigning a temp dir for + # config.model_dir. 
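+    # Note: `_augment_model_fn` (defined below) wraps the user `model_fn` so
+    # that on TPU the inputs flow through infeed/outfeed queues and the
+    # per-core computation is built via `tpu.split_compile_and_shard`; on CPU
+    # it simply calls the user `model_fn` directly.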
+ model_function = self._augment_model_fn( + model_fn, + train_cache_fn, + eval_cache_fn, + batch_axis) + + # Overwrite log_step_count_steps to disable TensorLoggingHook and + # StepCounterHook from being created in Estimator. TPUEstimator already + # added equivalent hooks in _augment_model_fn above. + self._log_every_n_steps = config.log_step_count_steps + config = config.replace(log_step_count_steps=None) + + # Passing non-None params as wrapped model_fn has it. + params = params or {} + super(TPUEstimator, self).__init__( + model_fn=model_function, + model_dir=model_dir, + config=config, + params=params, + warm_start_from=warm_start_from) + self._iterations_per_training_loop = ( + self._config.tpu_config.iterations_per_loop) + + # All properties passed to _InternalTPUContext are immutable. + # pylint: disable=protected-access + self._ctx = tpu_context._get_tpu_context( + self._config, train_batch_size, eval_batch_size, predict_batch_size, + use_tpu, eval_on_tpu) + + self._export_to_tpu = export_to_tpu + + self._is_input_fn_invoked = None + self._rendezvous = {} + + def _add_meta_graph_for_mode(self, + builder, + input_receiver_fn_map, + checkpoint_path, + save_variables=True, + mode=model_fn_lib.ModeKeys.PREDICT, + export_tags=None, + check_variables=True): + if self._export_to_tpu and mode != model_fn_lib.ModeKeys.PREDICT: + raise NotImplementedError( + 'TPUEstimator only handles mode PREDICT for exporting ' + 'when `export_to_tpu` is `True`; ' + 'got {}.'.format(mode)) + + (super(TPUEstimator, self)._add_meta_graph_for_mode( + builder, + input_receiver_fn_map, + checkpoint_path, + save_variables, + mode=mode, + export_tags=export_tags, + check_variables=check_variables)) + + if self._export_to_tpu: + input_receiver_fn_map = { + _REWRITE_FOR_INFERENCE_MODE: input_receiver_fn_map[mode] + } + export_tags = [tag_constants.SERVING, tag_constants.TPU] + mode = _REWRITE_FOR_INFERENCE_MODE + # See b/110052256 for why `check_variables` is `False`. + (super(TPUEstimator, self)._add_meta_graph_for_mode( + builder, + input_receiver_fn_map, + checkpoint_path, + save_variables=False, + mode=mode, + export_tags=export_tags, + check_variables=False)) + + def _call_model_fn(self, features, labels, mode, config): + if mode == _REWRITE_FOR_INFERENCE_MODE: + return self._call_model_fn_for_inference(features, labels, mode, config) + else: + return super(TPUEstimator, self)._call_model_fn(features, labels, mode, + config) + + def _call_model_fn_for_inference(self, features, labels, mode, config): + """Wraps `_call_model_fn` for `export_savedmodel`.""" + if mode != _REWRITE_FOR_INFERENCE_MODE: + raise ValueError('mode must be {}; ' + 'got {}.'.format(_REWRITE_FOR_INFERENCE_MODE, mode)) + + capture = _CapturedObject() + + def computation(): + """Compute tpu tensors used in export_outputs. + + Passed to rewrite_for_inference so that model_fn will be called under + the rewriting contexts. Only tpu tensors are returned, but export_outputs + and scaffold are captured. + + Returns: + A list of Tensors used in export_outputs and not marked for + outside_compilation. + """ + # We should only call model fn once and it should be inside `computation` + # so that building the graph will happen under `rewrite_for_inference`. + mode = model_fn_lib.ModeKeys.PREDICT + estimator_spec = self._call_model_fn(features, labels, mode, config) + + # We pick the TPU tensors out from `export_output` and later return them + # from `computation` for rewriting. 
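+      # For example, a `PredictOutput({'logits': logits})` entry contributes
+      # `[logits]` to `tensors_dict`, so `tpu_tensors` below collects exactly
+      # the Tensors the rewritten TPU computation must produce. (Illustration
+      # only; the actual keys depend on the user's export_outputs.)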
+      tensors_dict = collections.OrderedDict(
+          (k, _export_output_to_tensors(v))
+          for k, v in six.iteritems(estimator_spec.export_outputs))
+      tensors = nest.flatten(tensors_dict)
+      tpu_tensors = [t for t in tensors if t is not None]
+
+      # We cannot return anything other than `tpu_tensors` here so we capture
+      # the rest for later use.
+      capture.capture((estimator_spec, tensors_dict, tensors))
+      return tpu_tensors
+
+    tpu_tensors_on_cpu = tpu.rewrite_for_inference(computation)
+    estimator_spec, tensors_dict, tensors = capture.get()
+
+    # Reconstruct `tensors`, but with `tpu_tensors` replaced with
+    # `tpu_tensors_on_cpu`.
+    new_tensors = []
+    for t in tensors:
+      if t is None:
+        new_tensors.append(None)
+      else:
+        new_tensors.append(tpu_tensors_on_cpu.pop(0))
+
+    # Reconstruct `tensors_dict`.
+    new_tensors_dict = nest.pack_sequence_as(tensors_dict, new_tensors)
+    # Reconstruct `export_outputs`.
+    export_outputs = estimator_spec.export_outputs
+    new_export_outputs = collections.OrderedDict(
+        (k, _clone_export_output_with_tensors(export_outputs[k], v))
+        for k, v in six.iteritems(new_tensors_dict))
+
+    return estimator_spec._replace(export_outputs=new_export_outputs)
+
+  def _create_global_step(self, graph):
+    """Creates a global step suitable for TPUs.
+
+    Args:
+      graph: The graph in which to create the global step.
+
+    Returns:
+      A global step `Tensor`.
+
+    Raises:
+      ValueError: if the global step tensor is already defined.
+    """
+    return _create_global_step(graph)
+
+  def _convert_train_steps_to_hooks(self, steps, max_steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.TRAIN) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_train_steps_to_hooks(
+            steps, max_steps)
+
+    # On TPU.
+    if steps is None and max_steps is None:
+      raise ValueError(
+          'For TPU training, one of `steps` or `max_steps` must be set. '
+          'Cannot be both `None`.')
+
+    # Estimator.train has an explicit positiveness check.
+    if steps is not None:
+      util_lib.check_positive_integer(steps, 'Train steps')
+    if max_steps is not None:
+      util_lib.check_positive_integer(max_steps, 'Train max_steps')
+
+    return [
+        _TPUStopAtStepHook(self._iterations_per_training_loop, steps,
+                           max_steps)
+    ]
+
+  def _convert_eval_steps_to_hooks(self, steps):
+    with self._ctx.with_mode(model_fn_lib.ModeKeys.EVAL) as ctx:
+      if ctx.is_running_on_cpu():
+        return super(TPUEstimator, self)._convert_eval_steps_to_hooks(steps)
+
+    if steps is None:
+      raise ValueError(
+          'Evaluate `steps` must be set on TPU. Cannot be `None`.')
+
+    util_lib.check_positive_integer(steps, 'Eval steps')
+
+    return [
+        evaluation._StopAfterNEvalsHook(  # pylint: disable=protected-access
+            num_evals=steps),
+        _SetEvalIterationsHook(steps)
+    ]
+
+  def _call_input_fn(self, input_fn, mode):
+    """Calls the input function.
+
+    Args:
+      input_fn: The input function.
+      mode: ModeKeys
+
+    Returns:
+      In TPU mode, returns an input_fn to be called later in model_fn.
+      Otherwise, calls the input_fn and returns either features or
+      (features, labels).
+
+    Raises:
+      ValueError: if input_fn takes invalid arguments or does not have
+        `params`.
+    """
+    input_fn_args = function_utils.fn_args(input_fn)
+    config = self.config  # a deep copy.
+    kwargs = {}
+    if 'params' in input_fn_args:
+      kwargs['params'] = self.params  # a deep copy.
+ else: + raise ValueError('input_fn ({}) does not include params argument, ' + 'required by TPUEstimator to pass batch size as ' + 'params["batch_size"]'.format(input_fn)) + if 'config' in input_fn_args: + kwargs['config'] = config + + if 'mode' in input_fn_args: + kwargs['mode'] = mode + + # Records the fact input_fn has been invoked. + self._is_input_fn_invoked = True + + with self._ctx.with_mode(mode) as ctx: + # Setting the batch size in params first. This helps user to have same + # input_fn for use_tpu=True/False. + batch_size_for_input_fn = ctx.batch_size_for_input_fn + if batch_size_for_input_fn is not None: + _add_item_to_params(kwargs['params'], _BATCH_SIZE_KEY, + batch_size_for_input_fn) + + # For export_savedmodel, input_fn is never passed to Estimator. So, + # `is_export_mode` must be False. + if ctx.is_running_on_cpu(is_export_mode=False): + with ops.device('/device:CPU:0'): + return input_fn(**kwargs) + + # For TPU computation, input_fn should be invoked in a tf.while_loop for + # performance. While constructing the tf.while_loop, the structure of + # inputs returned by the `input_fn` needs to be recorded. The structure + # includes whether features or labels is dict or single Tensor, dict keys, + # tensor shapes, and dtypes. The recorded structure is used to create the + # infeed dequeue ops, which must be wrapped and passed as a Fn, called + # inside the TPU computation, as the TPU computation is wrapped inside a + # tf.while_loop also. So, we either pass input_fn to model_fn or pass + # dequeue_fn to model_fn. Here, `input_fn` is passed directly as + # `features` in `model_fn` signature. + def _input_fn(ctx): + _add_item_to_params(kwargs['params'], _CTX_KEY, ctx) + return input_fn(**kwargs) + + return _input_fn + + def _validate_features_in_predict_input(self, result): + """Skip the validation. + + For TPUEstimator, we do not need to check the result type. `_InputPipeline` + has stronger check. Parent class's check generates confusing warning msg. + + Args: + result: `features` returned by input_fn. 
+ """ + pass + + def train(self, + input_fn, + hooks=None, + steps=None, + max_steps=None, + saving_listeners=None): + rendezvous = error_handling.ErrorRendezvous(num_sources=3) + self._rendezvous[model_fn_lib.ModeKeys.TRAIN] = rendezvous + try: + return super(TPUEstimator, self).train( + input_fn=input_fn, + hooks=hooks, + steps=steps, + max_steps=max_steps, + saving_listeners=saving_listeners) + except Exception: # pylint: disable=broad-except + rendezvous.record_error('training_loop', sys.exc_info()) + finally: + rendezvous.record_done('training_loop') + rendezvous.raise_errors() + + def evaluate(self, + input_fn, + steps=None, + hooks=None, + checkpoint_path=None, + name=None): + rendezvous = error_handling.ErrorRendezvous(num_sources=3) + self._rendezvous[model_fn_lib.ModeKeys.EVAL] = rendezvous + try: + return super(TPUEstimator, self).evaluate( + input_fn, + steps=steps, + hooks=hooks, + checkpoint_path=checkpoint_path, + name=name) + except Exception: # pylint: disable=broad-except + rendezvous.record_error('evaluation_loop', sys.exc_info()) + finally: + rendezvous.record_done('evaluation_loop') + rendezvous.raise_errors() + + def predict(self, + input_fn, + predict_keys=None, + hooks=None, + checkpoint_path=None, + yield_single_examples=True): + rendezvous = error_handling.ErrorRendezvous(num_sources=3) + self._rendezvous[model_fn_lib.ModeKeys.PREDICT] = rendezvous + try: + for result in super(TPUEstimator, self).predict( + input_fn=input_fn, + predict_keys=predict_keys, + hooks=hooks, + checkpoint_path=checkpoint_path, + yield_single_examples=yield_single_examples): + yield result + except Exception: # pylint: disable=broad-except + rendezvous.record_error('prediction_loop', sys.exc_info()) + finally: + rendezvous.record_done('prediction_loop') + rendezvous.raise_errors() + + rendezvous.record_done('prediction_loop') + rendezvous.raise_errors() + + def _augment_model_fn(self, model_fn, train_cache_fn, eval_cache_fn, batch_axis): + """Returns a new model_fn, which wraps the TPU support.""" + + def _model_fn(features, labels, mode, config, params): + """A Estimator `model_fn` for TPUEstimator.""" + with self._ctx.with_mode(mode) as ctx: + model_fn_wrapper = _ModelFnWrapper(model_fn, train_cache_fn, + eval_cache_fn, config, params, ctx) + + # `input_fn` is called in `train()`, `evaluate()`, and `predict()`, + # but not in `export_savedmodel()`. + if self._is_input_fn_invoked: + is_export_mode = False + else: + is_export_mode = True + + # Clear the bit. + self._is_input_fn_invoked = None + + # examples_hook is added to training_hooks for both CPU and TPU + # execution. + if self._log_every_n_steps is not None: + examples_hook = ExamplesPerSecondHook( + ctx.global_batch_size, + output_dir=self.model_dir, + every_n_steps=self._log_every_n_steps) + + if ctx.is_running_on_cpu(is_export_mode=is_export_mode): + logging.info('Running %s on CPU', mode) + estimator_spec = model_fn_wrapper.call_without_tpu( + features, labels, is_export_mode=is_export_mode) + if self._log_every_n_steps is not None: + estimator_spec = estimator_spec._replace( + training_hooks=estimator_spec.training_hooks + (examples_hook,)) + return estimator_spec + + assert labels is None, '`labels` passed to `model_fn` must be `None`.' + # TPUEstimator._call_input_fn passes `input_fn` as features to here. + assert callable(features), '`input_fn` is not callable.' 
+ input_fn = features + + input_holders = _InputPipeline(input_fn, batch_axis, ctx) + enqueue_ops, dequeue_fn, input_hooks, run_infeed_loop_on_coordinator = ( + input_holders.generate_infeed_enqueue_ops_and_dequeue_fn()) + + graph = ops.get_default_graph() + for enqueue_op in enqueue_ops: + if isinstance(enqueue_op, list): + graph.get_collection_ref(_TPU_ENQUEUE_OPS).extend(enqueue_op) + else: + graph.add_to_collection(_TPU_ENQUEUE_OPS, enqueue_op) + + if mode == model_fn_lib.ModeKeys.TRAIN: + compile_op, loss, host_call, scaffold, training_hooks = ( + _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) + host_ops = host_call.create_tpu_hostcall() + if host_ops is None: + host_ops = [] + + shutdown_hooks = [] + shutdown_mode = os.environ.get('TF_TPU_GRACEFUL_SHUTDOWN_MODE', + 'shutdown_worker') + if shutdown_mode: + if shutdown_mode == 'shutdown_worker': + finalizer_hooks = [ + session_support.ShutdownLameWorkers(timeout_ms=60 * 1000), + ] + elif shutdown_mode == 'shutdown_computation': + finalizer_hooks = [ + session_support.RestartComputation(timeout_ms=60 * 1000), + ] + else: + raise ValueError( + 'Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' % shutdown_mode) + + shutdown_hooks.append( + session_support.GracefulShutdownHook( + checkpoint_prefix=self.model_dir + '/model.ckpt', + on_shutdown_hooks=finalizer_hooks)) + + with ops.control_dependencies([loss]): + global_step = array_ops.identity(training.get_global_step()) + hooks = input_hooks + shutdown_hooks + hooks.extend([ + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + host_ops, + tpu_compile_op=compile_op, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator), + rendezvous=self._rendezvous[mode], + master=self._config.master, + session_config=self._session_config, + ), + InstallSignalHandlerHook() + ]) + if self._log_every_n_steps is not None: + logging_hook_frequency = ( # Divide and round up + (self._log_every_n_steps + + self._config.tpu_config.iterations_per_loop - 1) // + self._config.tpu_config.iterations_per_loop) + hooks.append( + training.LoggingTensorHook({ + 'loss': array_ops.identity(loss), + 'step': global_step, + }, + every_n_iter=logging_hook_frequency)) + examples_hook._set_steps_per_run( # pylint: disable=protected-access + self._config.tpu_config.iterations_per_loop) + hooks.append(examples_hook) + + if training_hooks: + hooks.extend(training_hooks) + + chief_hooks = [] + if (self._config.save_checkpoints_secs or + self._config.save_checkpoints_steps): + checkpoint_hook = training.CheckpointSaverHook( + self.model_dir, + save_secs=self._config.save_checkpoints_secs, + save_steps=self._config.save_checkpoints_steps, + scaffold=scaffold) + checkpoint_hook._set_steps_per_run( # pylint: disable=protected-access + self._config.tpu_config.iterations_per_loop) + chief_hooks.append(checkpoint_hook) + + summary.scalar(model_fn_lib.LOSS_METRIC_KEY, loss) + with ops.control_dependencies([loss]): + update_ops = _sync_variables_ops(ctx) + + # Validate the TPU training graph to catch basic errors + _validate_tpu_training_graph() + + train_op = control_flow_ops.group(*update_ops) + graph.add_to_collection(_TPU_TRAIN_OP, train_op) + + return model_fn_lib.EstimatorSpec( + mode, + loss=loss, + training_chief_hooks=chief_hooks, + training_hooks=hooks, + train_op=train_op, + scaffold=scaffold) + + if mode == model_fn_lib.ModeKeys.EVAL: + compile_op, total_loss, host_calls, scaffold, eval_hooks = ( + _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn)) + iterations_per_loop_var = 
_create_or_get_iterations_per_loop() + mean_loss = math_ops.div( + total_loss, + math_ops.cast(iterations_per_loop_var, dtype=total_loss.dtype)) + + with ops.control_dependencies([mean_loss]): + # After TPU evaluation computation is done (the mean_loss tensor), + # reads all variables back from TPU and updates the eval step + # counter properly + internal_ops_to_run = _sync_variables_ops(ctx) + internal_ops_to_run.append( + _increase_eval_step_op(iterations_per_loop_var)) + + host_call_ret = host_calls.create_tpu_hostcall() + eval_metric_ops = {} + eval_update_ops = [] + + eval_metrics = host_call_ret.get('eval_metrics', {}) + if eval_metrics: + # Creates a dummy metric update_op for all metrics. Estimator + # expects all metrics in `eval_metric_ops` have update_op and calls + # them one by one. The real metric update_ops are invoked in a + # separated thread. So, here give Estimator the dummy op for all + # metrics. + with ops.control_dependencies(internal_ops_to_run): + dummy_update_op = control_flow_ops.no_op() + + for k, v in eval_metrics.items(): + eval_metric_ops[k] = (v[0], dummy_update_op) + eval_update_ops.append(v[1]) + else: + # If no eval metrics are passed, create an identity node for the + # loss and add `internal_ops_to_run` to its dependencies. So + # `internal_ops_to_run` can be executed. + with ops.control_dependencies(internal_ops_to_run): + mean_loss = array_ops.identity(mean_loss) + + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + hooks = [ + TPUInfeedOutfeedSessionHook( + ctx, + enqueue_ops, + eval_update_ops + host_ops, + tpu_compile_op=compile_op, + run_infeed_loop_on_coordinator=( + run_infeed_loop_on_coordinator), + rendezvous=self._rendezvous[mode], + master=self._config.evaluation_master, + session_config=self._session_config, + )] + input_hooks + + if eval_hooks: + hooks.extend(eval_hooks) + + return model_fn_lib.EstimatorSpec( + mode, + loss=mean_loss, + evaluation_hooks=hooks, + eval_metric_ops=eval_metric_ops, + scaffold=scaffold) + + # Predict + assert mode == model_fn_lib.ModeKeys.PREDICT + + (compile_op, dummy_predict_op, host_calls, + scaffold, prediction_hooks) = _predict_on_tpu_system( + ctx, model_fn_wrapper, dequeue_fn) + with ops.control_dependencies([dummy_predict_op]): + internal_ops_to_run = _sync_variables_ops(ctx) + with ops.control_dependencies(internal_ops_to_run): + dummy_predict_op = control_flow_ops.no_op() + + # In train and evaluation, the main TPU program is passed to monitored + # training session to run. Infeed enqueue and outfeed dequeue are + # executed in side threads. This is not the configuration for + # prediction mode. + # + # For prediction, the Estimator executes the EstimatorSpec.predictions + # directly and yield the element (via generator) to call site. So, the + # outfeed based prediction must be passed to MonitoredSession directly. + # Other parts of the TPU execution are organized as follows. + # + # 1. All outfeed based Tensors must be grouped with predictions Tensors + # to form a single invocation. This avoid the issue we might trigger + # multiple outfeeds incorrectly. To achieve this, `host_call` is + # placed in control_dependencies of `stopping_signals`, and + # `stopping_signals` is passed into _StoppingPredictHook, which sets + # the `stopping_signals` as SessionRunArgs. MonitoredSession merges + # all SessionRunArgs with the fetch in session.run together. + # + # 2. 
The TPU program (dummy_predict_op) and enqueue_ops (infeed Enqueue) + # are grouped together. They will be launched once and only once in + # side threads and they quit naturally according to the SAME stopping + # condition. + enqueue_ops.append(dummy_predict_op) + + host_call_ret = host_calls.create_tpu_hostcall() + if 'host_call' not in host_call_ret: + host_ops = [] + else: + host_ops = host_call_ret['host_call'] + + predictions = host_call_ret['predictions'] + _verify_cross_hosts_transfer_size( + predictions, + message=( + 'The estimated size for TPUEstimatorSpec.predictions is too ' + 'large.')) + signals = host_call_ret['signals'] + + with ops.control_dependencies(host_ops): + host_ops = [] # Empty, we do do not need it anymore. + scalar_stopping_signal = _StopSignals.as_scalar_stopping_signal( + signals) + predictions = _PaddingSignals.slice_tensor_or_dict( + predictions, signals) + + hooks = [ + _StoppingPredictHook(scalar_stopping_signal), + TPUInfeedOutfeedSessionHookForPrediction( + ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode], + tpu_compile_op=compile_op, + master=self._config.master, + session_config=self._session_config), + ] + input_hooks + + if prediction_hooks: + hooks.extend(prediction_hooks) + + return model_fn_lib.EstimatorSpec( + mode, + prediction_hooks=hooks, + predictions=predictions, + scaffold=scaffold) + + return _model_fn + + +def _export_output_to_tensors(export_output): + """Get a list of `Tensors` used in `export_output`. + + Args: + export_output: an `ExportOutput` object such as `ClassificationOutput`, + `RegressionOutput`, or `PredictOutput`. + + Returns: + a list of tensors used in export_output. + + Raises: + ValueError: if `export_output` is not one of `ClassificationOutput`, + `RegressionOutput`, or `PredictOutput`. + """ + if isinstance(export_output, export_output_lib.ClassificationOutput): + return [export_output.scores, export_output.classes] + elif isinstance(export_output, export_output_lib.RegressionOutput): + return [export_output.value] + elif isinstance(export_output, export_output_lib.PredictOutput): + return list(export_output.outputs.values()) + else: + raise ValueError( + '`export_output` must be have type `ClassificationOutput`, ' + '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output)) + + +def _clone_export_output_with_tensors(export_output, tensors): + """Clones `export_output` but with new `tensors`. + + Args: + export_output: an `ExportOutput` object such as `ClassificationOutput`, + `RegressionOutput`, or `PredictOutput`. + tensors: a list of `Tensors` used to construct a new `export_output`. + + Returns: + A dict similar to `export_output` but with `tensors`. + + Raises: + ValueError: if `export_output` is not one of `ClassificationOutput`, + `RegressionOutput`, or `PredictOutput`. 
+ """ + if isinstance(export_output, export_output_lib.ClassificationOutput): + if len(tensors) != 2: + raise ValueError('tensors must be of length 2; ' + 'got {}.'.format(len(tensors))) + return export_output_lib.ClassificationOutput(*tensors) + elif isinstance(export_output, export_output_lib.RegressionOutput): + if len(tensors) != 1: + raise ValueError('tensors must be of length 1; ' + 'got {}'.format(len(tensors))) + return export_output_lib.RegressionOutput(*tensors) + elif isinstance(export_output, export_output_lib.PredictOutput): + return export_output_lib.PredictOutput( + dict(zip(export_output.outputs.keys(), tensors))) + else: + raise ValueError( + '`export_output` must be have type `ClassificationOutput`, ' + '`RegressionOutput`, or `PredictOutput`; got {}.'.format(export_output)) + + +def _eval_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): + """Executes `model_fn_wrapper` multiple times on all TPU shards.""" + iterations_per_loop_var = _create_or_get_iterations_per_loop() + + (single_tpu_eval_step, host_calls, captured_scaffold_fn, captured_eval_hooks + ) = model_fn_wrapper.convert_to_single_tpu_eval_step(dequeue_fn) + + def multi_tpu_eval_steps_on_single_shard(): + loop_vars = [_ZERO_LOSS] + if model_fn_wrapper._eval_cache_fn is not None: + batch_size = ctx.global_batch_size + num_shards = ctx._config._tpu_config.num_shards + loop_vars += model_fn_wrapper._eval_cache_fn(batch_size // num_shards) + + return training_loop.repeat( + iterations_per_loop_var, + single_tpu_eval_step, + loop_vars) + + compile_op, ret = tpu.split_compile_and_shard( + multi_tpu_eval_steps_on_single_shard, + inputs=[], + num_shards=ctx.num_replicas, + outputs_from_all_shards=False, + device_assignment=ctx.device_assignment) + + loss = ret[0] + scaffold = _get_scaffold(captured_scaffold_fn) + return compile_op, loss, host_calls, scaffold, captured_eval_hooks.get() + + +def _train_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): + """Executes `model_fn_wrapper` multiple times on all TPU shards.""" + iterations_per_loop_var = _create_or_get_iterations_per_loop() + + (single_tpu_train_step, host_call, captured_scaffold_fn, + captured_training_hooks) = ( + model_fn_wrapper.convert_to_single_tpu_train_step(dequeue_fn)) + + def multi_tpu_train_steps_on_single_shard(): + loop_vars = [_INITIAL_LOSS] + if model_fn_wrapper._train_cache_fn is not None: + batch_size = ctx.global_batch_size + num_shards = ctx._config._tpu_config.num_shards + loop_vars += model_fn_wrapper._train_cache_fn(batch_size // num_shards) + + return training_loop.repeat( + iterations_per_loop_var, + single_tpu_train_step, + loop_vars) + + compile_op, ret = tpu.split_compile_and_shard( + multi_tpu_train_steps_on_single_shard, + inputs=[], + num_shards=ctx.num_replicas, + outputs_from_all_shards=False, + device_assignment=ctx.device_assignment) + + loss = ret[0] + scaffold = _get_scaffold(captured_scaffold_fn) + return compile_op, loss, host_call, scaffold, captured_training_hooks.get() + + +def _predict_on_tpu_system(ctx, model_fn_wrapper, dequeue_fn): + """Executes `model_fn_wrapper` multiple times on all TPU shards.""" + (single_tpu_predict_step, host_calls, captured_scaffold_fn, + captured_predict_hooks + ) = model_fn_wrapper.convert_to_single_tpu_predict_step(dequeue_fn) + + def multi_tpu_predict_steps_on_single_shard(): + + def cond(scalar_stopping_signal): + return math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + inputs = [_StopSignals.NON_STOPPING_SIGNAL] + outputs = training_loop.while_loop( + cond, 
single_tpu_predict_step, inputs=inputs, name=b'loop') + return outputs + + (compile_op, dummy_predict_op,) = tpu.split_compile_and_shard( + multi_tpu_predict_steps_on_single_shard, + inputs=[], + num_shards=ctx.num_replicas, + outputs_from_all_shards=False, + device_assignment=ctx.device_assignment) + + dummy_predict_op = dummy_predict_op[0] + scaffold = _get_scaffold(captured_scaffold_fn) + return (compile_op, dummy_predict_op, host_calls, scaffold, + captured_predict_hooks.get()) + + +def _wrap_computation_in_while_loop(device, op_fn): + """Wraps the ops generated by `op_fn` in tf.while_loop.""" + + def computation(i): + with ops.control_dependencies(op_fn()): + return i + 1 + + iterations_per_loop_var = _create_or_get_iterations_per_loop() + # By setting parallel_iterations=1, the parallel execution in while_loop is + # basically turned off. + with ops.device(device): + iterations = array_ops.identity(iterations_per_loop_var) + return control_flow_ops.while_loop( + lambda i: i < iterations, + computation, [constant_op.constant(0)], + parallel_iterations=1) + + +def _wrap_computation_in_while_loop_with_stopping_signals(device, op_fn): + """Wraps the ops generated by `op_fn` in tf.while_loop.""" + + def cond(scalar_stopping_signal): + return math_ops.logical_not( + _StopSignals.should_stop(scalar_stopping_signal)) + + def computation(unused_scalar_stopping_signal): + return_value = op_fn() + execute_ops = return_value['ops'] + signals = return_value['signals'] + with ops.control_dependencies(execute_ops): + return _StopSignals.as_scalar_stopping_signal(signals) + + # By setting parallel_iterations=1, the parallel execution in while_loop is + # basically turned off. + with ops.device(device): + return control_flow_ops.while_loop( + cond, + computation, [_StopSignals.NON_STOPPING_SIGNAL], + parallel_iterations=1) + + +def _validate_tpu_training_graph(): + """Validate graph before running distributed training. + + Raises: + ValueError: If the graph seems invalid for running on device + """ + operations = ops.get_default_graph().get_operations() + + # Check if there is atleast one CrossReplicaSum operation in the graph + # This should be introduced by using the CrossShardOptimizer wrapper + cross_replica_sum_ops = [ + o for o in operations if o.type == _CROSS_REPLICA_SUM_OP + ] + if not cross_replica_sum_ops: + raise ValueError( + 'CrossShardOptimizer must be used for model training on TPUs.') + + +class _CapturedObject(object): + """A placeholder to capture an object. + + This is useful when we need to capture a Python object in the Tensorflow + control flow body function and use it outside the control flow. + """ + + def __init__(self): + self._object = None + self._captured = False + + def capture(self, o): + if self._captured: + raise RuntimeError( + 'InternalError: Object can capture only once. Please file bug.') + + self._captured = True + self._object = o + + def get(self): + if not self._captured: + raise RuntimeError( + 'InternalError: Object is not captured properly before `get`. 
' + 'Please file bug.') + return self._object + + +def _get_scaffold(captured_scaffold_fn): + """Retrieves the Scaffold from `captured_scaffold_fn`.""" + with _CapturingContext(message='Inside scaffold_fn'): + scaffold_fn = captured_scaffold_fn.get() + if scaffold_fn: + scaffold = scaffold_fn() + if scaffold is None: + raise ValueError( + 'TPUEstimatorSpec.scaffold_fn returns None, which is not allowed') + else: + scaffold = None + + if scaffold: + wrapped_finalize = scaffold.finalize + + def _finalize(): + with _CapturingContext('Inside Scaffold.finalize'): + wrapped_finalize() + + scaffold.finalize = _finalize + return scaffold + + +class _CapturingContext(control_flow_ops.ControlFlowContext): + """Tracks references to Tensors defined in TPU replication.""" + + def __init__(self, message): + control_flow_ops.ControlFlowContext.__init__(self) + self._message = message + + def to_control_flow_context_def(self, context_def, export_scope=None): + # pylint: disable=useless-super-delegation + # NOTE(slebedev): the method is required by `ControlFlowContext`. + super(_CapturingContext, self).to_control_flow_context_def( + context_def, export_scope) + + def AddOp(self, op): # pylint: disable=invalid-name + for c in op.inputs: + if tpu._TPU_REPLICATE_ATTR in c.op.node_def.attr: # pylint: disable=protected-access + raise ValueError('{}: Op {} depends on TPU computation {}, ' + 'which is not allowed.'.format(self._message, op, c)) + + def __enter__(self): + # pylint: disable=protected-access + self._g = ops.get_default_graph() + self._old = self._g._get_control_flow_context() + self._g._set_control_flow_context(self) + # pylint: enable=protected-access + + def __exit__(self, _, __, ___): # pylint: disable=invalid-name + self._g._set_control_flow_context(self._old) # pylint: disable=protected-access + + +class _Inputs(object): + """A data structure representing the input_fn returned values. + + This also supports the returned value from input_fn as `Dataset`. + """ + + def __init__(self, features=None, labels=None, dataset=None, signals=None): + if dataset is not None and (features is not None or labels is not None or + signals is not None): + raise RuntimeError('Internal Error: Either (features and labels) or ' + 'dataset should be provided, not both. Please file ' + 'bug') + + self._features = features + self._labels = labels + self._signals = signals + + self._dataset = dataset + self._iterator = None + + @staticmethod + def from_input_fn(return_values): + """Returns an `_Inputs` instance according to `input_fn` return value.""" + if isinstance(return_values, dataset_ops.DatasetV2): + dataset = return_values + return _Inputs(dataset=dataset) + + features, labels = _Inputs._parse_inputs(return_values) + return _Inputs(features, labels) + + @staticmethod + def _parse_inputs(return_values): + if isinstance(return_values, tuple): + features, labels = return_values + else: + features, labels = return_values, None + return features, labels + + @property + def is_dataset(self): + """Returns True if the return value from input_fn is Dataset.""" + return self._dataset is not None + + def dataset_initializer(self): + """Returns the dataset's initializer. + + The initializer must be run before calling `features_and_labels`. 
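+
+    A minimal usage sketch (names are illustrative):
+
+      inputs = _Inputs.from_input_fn(input_fn(params))
+      if inputs.is_dataset:
+        init_op = inputs.dataset_initializer()  # run this op first
+      features, labels = inputs.features_and_labels()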
+ """ + self._iterator = dataset_ops.make_initializable_iterator(self._dataset) + return self._iterator.initializer + + def features_and_labels(self): + """Gets `features` and `labels`.""" + if self.is_dataset: + if self._iterator is None: + raise RuntimeError('Internal error: Must run dataset_initializer ' + 'before calling features_and_labels(). Please file ' + 'a bug!') + return _Inputs._parse_inputs(self._iterator.get_next()) + + return (self._features, self._labels) + + def signals(self): + return self._signals + + @property + def dataset(self): + return self._dataset + + +class _InputsWithStoppingSignals(_Inputs): + """Inputs with `_StopSignals` inserted into the dataset.""" + + def __init__(self, + dataset, + batch_size, + add_padding=False, + num_invocations_per_step=1): + + assert dataset is not None + user_provided_dataset = dataset.map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=False, batch_size=batch_size, add_padding=add_padding)) + if num_invocations_per_step == 1: + final_batch_dataset = dataset.take(1).map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=True, batch_size=batch_size, add_padding=add_padding)) + else: + # We append (2 * num_invocations_per_step - 1) batches for exhausting the + # user_provided_dataset and stop properly. + # For example, if num_invocations_per_step is 2, we append 3 additional + # padding batches: b1, b2, b3. + # If user_provided_dataset contains two batches: a1, a2 + # Step 1: [a1, a2] + # Step 2: [b1, b2] -> STOP + # If user_provided_dataset contains three batches: a1, a2, a3. + # The training loops: + # Step 1: [a1, a2] + # Step 2: [a3, b1] + # Step 3: [b2, b3] -> STOP. + final_batch_dataset = dataset.take(1).map( + _InputsWithStoppingSignals.insert_stopping_signal( + stop=True, batch_size=batch_size, add_padding=add_padding)) + final_batch_dataset = final_batch_dataset.repeat( + 2 * num_invocations_per_step - 1) + + def _set_mask(data_dict): + signals = data_dict['signals'] + signals['padding_mask'] = array_ops.ones_like(signals['padding_mask']) + data_dict['signals'] = signals + return data_dict + + # Mask out the extra batch. + final_batch_dataset = final_batch_dataset.map(_set_mask) + + dataset = user_provided_dataset.concatenate(final_batch_dataset).prefetch(2) + + super(_InputsWithStoppingSignals, self).__init__(dataset=dataset) + self._current_inputs = None + + def features_and_labels(self): + if self._current_inputs is not None: + raise RuntimeError( + 'Internal Error: The previous inputs have not been properly ' + 'consumed. First call features_and_labels, then call signals.') + + inputs_with_signals = self._iterator.get_next() + features = inputs_with_signals['features'] + labels = inputs_with_signals.get('labels') + + self._current_inputs = inputs_with_signals + return features, labels + + def signals(self): + """Returns the `Signals` from `_Inputs`.""" + if self._current_inputs is None: + raise RuntimeError( + 'Internal Error: The current inputs have not been properly ' + 'generated. First call features_and_labels, then call signals.') + signals = self._current_inputs['signals'] + self._current_inputs = None + return signals + + @staticmethod + def insert_stopping_signal(stop, batch_size, add_padding=False): + """Inserts stopping_signal into dataset via _map_fn. + + Here we change the data structure in the dataset, such that the return value + is a dictionary now and `features`, `labels`, and `signals` are three + distinguished keys in that dict. 
This provides a better structure, which + eases the process to decompose the inputs (see `features_and_labels`). + + Args: + stop: bool, state of current stopping signals. + batch_size: int, batch size. + add_padding: bool, whether to pad the tensor to full batch size. + + Returns: + A map_fn passed to dataset.map API. + """ + + def _map_fn(*args): + """The map fn to insert signals.""" + if len(args) == 1: + # Unpack the single Tensor/dict argument as features. This is required + # for the input_fn returns no labels. + args = args[0] + features, labels = _Inputs._parse_inputs(args) + new_input_dict = {} + + if add_padding: + padding_mask, features, labels = ( + _PaddingSignals.pad_features_and_labels(features, labels, + batch_size)) + + new_input_dict['features'] = features + if labels is not None: + new_input_dict['labels'] = labels + + else: + new_input_dict['features'] = features + if labels is not None: + new_input_dict['labels'] = labels + padding_mask = None + + new_input_dict['signals'] = _StopSignals( + stop=stop, batch_size=batch_size, + padding_mask=padding_mask).as_dict() + + return new_input_dict + + return _map_fn + + +class _StopSignals(object): + """Signals class holding all logic to handle TPU stopping condition.""" + + NON_STOPPING_SIGNAL = False + STOPPING_SIGNAL = True + + def __init__(self, stop, batch_size, padding_mask=None): + self._stop = stop + self._batch_size = batch_size + self._padding_mask = padding_mask + + def as_dict(self): + """Returns the signals as Python dict.""" + shape = [self._batch_size, 1] + dtype = dtypes.bool + + if self._stop: + stopping = array_ops.ones(shape=shape, dtype=dtype) + else: + stopping = array_ops.zeros(shape=shape, dtype=dtype) + + signals = {'stopping': stopping} + if self._padding_mask is not None: + signals['padding_mask'] = self._padding_mask + return signals + + @staticmethod + def as_scalar_stopping_signal(signals): + return array_ops.identity(signals['stopping'][0][0]) + + @staticmethod + def should_stop(scalar_stopping_signal): + """Detects whether scalar_stopping_signal indicates stopping.""" + if isinstance(scalar_stopping_signal, ops.Tensor): + # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF + # way to express the bool check whether scalar_stopping_signal is True. + return math_ops.logical_and(scalar_stopping_signal, + _StopSignals.STOPPING_SIGNAL) + else: + # For non Tensor case, it is used in SessionRunHook. So, we cannot modify + # the graph anymore. Here, we use pure Python. 
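+      # (In the SessionRunHook path the signal has already been fetched by
+      # session.run, so it arrives here as a concrete value such as a numpy
+      # bool rather than a graph Tensor.)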
+ return bool(scalar_stopping_signal) + + +class _PaddingSignals(object): + """Signals class holding all logic to handle padding.""" + + @staticmethod + def pad_features_and_labels(features, labels, batch_size): + """Pads out the batch dimension of features and labels.""" + real_batch_size = array_ops.shape( + _PaddingSignals._find_any_tensor(features))[0] + + batch_size_tensor = constant_op.constant(batch_size, dtypes.int32) + + check_greater = check_ops.assert_greater_equal( + batch_size_tensor, + real_batch_size, + data=(batch_size_tensor, real_batch_size), + message='The real batch size should not be greater than batch_size.') + + with ops.control_dependencies([check_greater]): + missing_count = batch_size_tensor - real_batch_size + + def pad_single_tensor(tensor): + """Pads out the batch dimension of a tensor to the complete batch_size.""" + rank = len(tensor.shape) + assert rank > 0 + padding = array_ops.stack([[0, missing_count]] + [[0, 0]] * (rank - 1)) + padded_shape = (batch_size,) + tuple(tensor.shape[1:]) + padded_tensor = array_ops.pad(tensor, padding) + padded_tensor.set_shape(padded_shape) + return padded_tensor + + def nest_pad(tensor_or_dict): + return nest.map_structure(pad_single_tensor, tensor_or_dict) + + features = nest_pad(features) + if labels is not None: + labels = nest_pad(labels) + + padding_mask = _PaddingSignals._padding_mask(real_batch_size, missing_count, + batch_size) + + return padding_mask, features, labels + + @staticmethod + def slice_tensor_or_dict(tensor_or_dict, signals): + """Slice the real Tensors according to padding mask in signals.""" + + padding_mask = signals['padding_mask'] + batch_size = array_ops.shape(padding_mask)[0] + + def verify_batch_size(tensor): + check_batch_size = math_ops.equal(batch_size, tensor.shape[0]) + with ops.control_dependencies([check_batch_size]): + return array_ops.identity(tensor) + + def slice_single_tensor(tensor): + rank = len(tensor.shape) + assert rank > 0 + real_batch_size = batch_size - math_ops.reduce_sum(padding_mask) + return verify_batch_size(tensor)[0:real_batch_size] + + # As we split the Tensors to all TPU cores and concat them back, it is + # important to ensure the real data is placed before padded ones, i.e., + # order is preserved. By that, the sliced padding mask should have all 0's. + # If this assertion failed, # the slice logic here would not hold. + sliced_padding_mask = slice_single_tensor(padding_mask) + assert_padding_mask = math_ops.equal( + math_ops.reduce_sum(sliced_padding_mask), 0) + + with ops.control_dependencies([assert_padding_mask]): + should_stop = _StopSignals.should_stop( + _StopSignals.as_scalar_stopping_signal(signals)) + + is_full_batch = math_ops.equal(math_ops.reduce_sum(padding_mask), 0) + + def slice_fn(tensor): + # If the current batch is full batch or part of stopping signals, we do + # not need to slice to save performance. 
+ return control_flow_ops.cond( + math_ops.logical_or(should_stop, is_full_batch), + (lambda: verify_batch_size(tensor)), + (lambda: slice_single_tensor(tensor))) + + return nest.map_structure(slice_fn, tensor_or_dict) + + @staticmethod + def _find_any_tensor(batch_features): + tensors = [ + x for x in nest.flatten(batch_features) if isinstance(x, ops.Tensor) + ] + if not tensors: + raise ValueError('Cannot find any Tensor in features dict.') + return tensors[0] + + @staticmethod + def _padding_mask(real_batch_size, missing_count, batch_size): + padding_mask = array_ops.concat([ + array_ops.zeros((real_batch_size,), dtype=dtypes.int32), + array_ops.ones((missing_count,), dtype=dtypes.int32) + ], + axis=0) + padding_mask.set_shape((batch_size,)) + return padding_mask + + +def _verify_cross_hosts_transfer_size(tensor_dict, message): + total_size = 0 + tensor_structure = {} + for key, tensor in tensor_dict.items(): + shape = tensor.shape + size = np.product(shape) * tensor.dtype.size + tensor_structure[key] = shape + total_size += size + if total_size >= _ONE_GIGABYTE: + raise ValueError( + '{} The transfer size is larger than the protobuf limit. Please ' + 'consider to use Tensors with smaller shapes or reduce batch ' + 'size. Given:\n' + '{}'.format( + message, '\n'.join([ + ' -- Key: {}, Shape: {}'.format(k, v) + for k, v in tensor_structure.items() + ]))) + + +def _add_item_to_params(params, key, value): + """Adds a new item into `params`.""" + if isinstance(params, hparam.HParams): + # For HParams, we need to use special API. + if key in params: + params.set_hparam(key, value) + else: + params.add_hparam(key, value) + else: + # Now params is Python dict. + params[key] = value + + +def export_estimator_savedmodel(estimator, + export_dir_base, + serving_input_receiver_fn, + assets_extra=None, + as_text=False, + checkpoint_path=None, + strip_default_attrs=False): + """Export `Estimator` trained model for TPU inference. + + Args: + estimator: `Estimator` with which model has been trained. + export_dir_base: A string containing a directory in which to create + timestamped subdirectories containing exported SavedModels. + serving_input_receiver_fn: A function that takes no argument and returns a + `ServingInputReceiver` or `TensorServingInputReceiver`. + assets_extra: A dict specifying how to populate the assets.extra directory + within the exported SavedModel, or `None` if no extra assets are needed. + as_text: whether to write the SavedModel proto in text format. + checkpoint_path: The checkpoint path to export. If `None` (the default), + the most recent checkpoint found within the model directory is chosen. + strip_default_attrs: Boolean. If `True`, default-valued attributes will be + removed from the NodeDefs. + + Returns: + The string path to the exported directory. + """ + # `TPUEstimator` requires `tpu_config.RunConfig`, so we cannot use + # `estimator.config`. + config = tpu_config.RunConfig(model_dir=estimator.model_dir) + est = TPUEstimator( + estimator._model_fn, # pylint: disable=protected-access + config=config, + params=estimator.params, + use_tpu=True, + train_batch_size=2048, # Does not matter. + eval_batch_size=2048, # Does not matter. 
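+      # The two batch sizes above are required by the TPUEstimator
+      # constructor but are irrelevant here: export only builds the serving
+      # graph from serving_input_receiver_fn, and the train/eval input
+      # pipelines never run.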
+ ) + return est.export_savedmodel(export_dir_base, serving_input_receiver_fn, + assets_extra, as_text, checkpoint_path, + strip_default_attrs) diff --git a/baselines/models/xlnet/xlnet.py b/baselines/models/xlnet/xlnet.py new file mode 100644 index 0000000..4341e24 --- /dev/null +++ b/baselines/models/xlnet/xlnet.py @@ -0,0 +1,292 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import os +import tensorflow as tf +import modeling + + +def _get_initializer(FLAGS): + """Get variable intializer.""" + if FLAGS.init == "uniform": + initializer = tf.initializers.random_uniform( + minval=-FLAGS.init_range, + maxval=FLAGS.init_range, + seed=None) + elif FLAGS.init == "normal": + initializer = tf.initializers.random_normal( + stddev=FLAGS.init_std, + seed=None) + else: + raise ValueError("Initializer {} not supported".format(FLAGS.init)) + return initializer + + +class XLNetConfig(object): + """XLNetConfig contains hyperparameters that are specific to a model checkpoint; + i.e., these hyperparameters should be the same between + pretraining and finetuning. + + The following hyperparameters are defined: + n_layer: int, the number of layers. + d_model: int, the hidden size. + n_head: int, the number of attention heads. + d_head: int, the dimension size of each attention head. + d_inner: int, the hidden size in feed-forward layers. + ff_activation: str, "relu" or "gelu". + untie_r: bool, whether to untie the biases in attention. + n_token: int, the vocab size. + """ + + def __init__(self, FLAGS=None, json_path=None): + """Constructing an XLNetConfig. + One of FLAGS or json_path should be provided.""" + + assert FLAGS is not None or json_path is not None + + self.keys = ["n_layer", "d_model", "n_head", "d_head", "d_inner", + "ff_activation", "untie_r", "n_token"] + + if FLAGS is not None: + self.init_from_flags(FLAGS) + + if json_path is not None: + self.init_from_json(json_path) + + def init_from_flags(self, FLAGS): + for key in self.keys: + setattr(self, key, getattr(FLAGS, key)) + + def init_from_json(self, json_path): + with tf.gfile.Open(json_path) as f: + json_data = json.load(f) + for key in self.keys: + setattr(self, key, json_data[key]) + + def to_json(self, json_path): + """Save XLNetConfig to a json file.""" + json_data = {} + for key in self.keys: + json_data[key] = getattr(self, key) + + json_dir = os.path.dirname(json_path) + if not tf.gfile.Exists(json_dir): + tf.gfile.MakeDirs(json_dir) + with tf.gfile.Open(json_path, "w") as f: + json.dump(json_data, f, indent=4, sort_keys=True) + + +def create_run_config(is_training, is_finetune, FLAGS): + kwargs = dict( + is_training=is_training, + use_tpu=FLAGS.use_tpu, + use_bfloat16=FLAGS.use_bfloat16, + dropout=FLAGS.dropout, + dropatt=FLAGS.dropatt, + init=FLAGS.init, + init_range=FLAGS.init_range, + init_std=FLAGS.init_std, + clamp_len=FLAGS.clamp_len) + + if not is_finetune: + kwargs.update(dict( + mem_len=FLAGS.mem_len, + reuse_len=FLAGS.reuse_len, + bi_data=FLAGS.bi_data, + clamp_len=FLAGS.clamp_len, + same_length=FLAGS.same_length)) + + return RunConfig(**kwargs) + + +class RunConfig(object): + """RunConfig contains hyperparameters that could be different + between pretraining and finetuning. + These hyperparameters can also be changed from run to run. + We store them separately from XLNetConfig for flexibility. 
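+
+  A minimal sketch (values here are illustrative, not prescriptive):
+
+    # pretraining: enable memory and the bidirectional data pipeline
+    pretrain_conf = RunConfig(is_training=True, use_tpu=False,
+                              use_bfloat16=False, dropout=0.1, dropatt=0.1,
+                              mem_len=384, reuse_len=256, bi_data=True)
+    # finetuning: same checkpoint hyperparameters, no memory settings
+    finetune_conf = RunConfig(is_training=True, use_tpu=False,
+                              use_bfloat16=False, dropout=0.1, dropatt=0.1)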
+ """ + + def __init__(self, is_training, use_tpu, use_bfloat16, dropout, dropatt, + init="normal", init_range=0.1, init_std=0.02, mem_len=None, + reuse_len=None, bi_data=False, clamp_len=-1, same_length=False): + """ + Args: + is_training: bool, whether in training mode. + use_tpu: bool, whether TPUs are used. + use_bfloat16: bool, use bfloat16 instead of float32. + dropout: float, dropout rate. + dropatt: float, dropout rate on attention probabilities. + init: str, the initialization scheme, either "normal" or "uniform". + init_range: float, initialize the parameters with a uniform distribution + in [-init_range, init_range]. Only effective when init="uniform". + init_std: float, initialize the parameters with a normal distribution + with mean 0 and stddev init_std. Only effective when init="normal". + mem_len: int, the number of tokens to cache. + reuse_len: int, the number of tokens in the currect batch to be cached + and reused in the future. + bi_data: bool, whether to use bidirectional input pipeline. + Usually set to True during pretraining and False during finetuning. + clamp_len: int, clamp all relative distances larger than clamp_len. + -1 means no clamping. + same_length: bool, whether to use the same attention length for each token. + """ + + self.init = init + self.init_range = init_range + self.init_std = init_std + self.is_training = is_training + self.dropout = dropout + self.dropatt = dropatt + self.use_tpu = use_tpu + self.use_bfloat16 = use_bfloat16 + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length + + +class XLNetModel(object): + """A wrapper of the XLNet model used during both pretraining and finetuning.""" + + def __init__(self, xlnet_config, run_config, input_ids, seg_ids, input_mask, + mems=None, perm_mask=None, target_mapping=None, inp_q=None, + **kwargs): + """ + Args: + xlnet_config: XLNetConfig, + run_config: RunConfig, + input_ids: int32 Tensor in shape [len, bsz], the input token IDs. + seg_ids: int32 Tensor in shape [len, bsz], the input segment IDs. + input_mask: float32 Tensor in shape [len, bsz], the input mask. + 0 for real tokens and 1 for padding. + mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory + from previous batches. The length of the list equals n_layer. + If None, no memory is used. + perm_mask: float32 Tensor in shape [len, len, bsz]. + If perm_mask[i, j, k] = 0, i attend to j in batch k; + if perm_mask[i, j, k] = 1, i does not attend to j in batch k. + If None, each position attends to all the others. + target_mapping: float32 Tensor in shape [num_predict, len, bsz]. + If target_mapping[i, j, k] = 1, the i-th predict in batch k is + on the j-th token. + Only used during pretraining for partial prediction. + Set to None during finetuning. + inp_q: float32 Tensor in shape [len, bsz]. + 1 for tokens with losses and 0 for tokens without losses. + Only used during pretraining for two-stream attention. + Set to None during finetuning. 
+ """ + + initializer = _get_initializer(run_config) + + tfm_args = dict( + n_token=xlnet_config.n_token, + initializer=initializer, + attn_type="bi", + n_layer=xlnet_config.n_layer, + d_model=xlnet_config.d_model, + n_head=xlnet_config.n_head, + d_head=xlnet_config.d_head, + d_inner=xlnet_config.d_inner, + ff_activation=xlnet_config.ff_activation, + untie_r=xlnet_config.untie_r, + + is_training=run_config.is_training, + use_bfloat16=run_config.use_bfloat16, + use_tpu=run_config.use_tpu, + dropout=run_config.dropout, + dropatt=run_config.dropatt, + + mem_len=run_config.mem_len, + reuse_len=run_config.reuse_len, + bi_data=run_config.bi_data, + clamp_len=run_config.clamp_len, + same_length=run_config.same_length + ) + + input_args = dict( + inp_k=input_ids, + seg_id=seg_ids, + input_mask=input_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + inp_q=inp_q) + tfm_args.update(input_args) + + with tf.variable_scope("model", reuse=tf.AUTO_REUSE): + (self.output, self.new_mems, self.lookup_table + ) = modeling.transformer_xl(**tfm_args) + + self.input_mask = input_mask + self.initializer = initializer + self.xlnet_config = xlnet_config + self.run_config = run_config + + def get_pooled_out(self, summary_type, use_summ_proj=True): + """ + Args: + summary_type: str, "last", "first", "mean", or "attn". The method + to pool the input to get a vector representation. + use_summ_proj: bool, whether to use a linear projection during pooling. + + Returns: + float32 Tensor in shape [bsz, d_model], the pooled representation. + """ + + xlnet_config = self.xlnet_config + run_config = self.run_config + + with tf.variable_scope("model", reuse=tf.AUTO_REUSE): + summary = modeling.summarize_sequence( + summary_type=summary_type, + hidden=self.output, + d_model=xlnet_config.d_model, + n_head=xlnet_config.n_head, + d_head=xlnet_config.d_head, + dropout=run_config.dropout, + dropatt=run_config.dropatt, + is_training=run_config.is_training, + input_mask=self.input_mask, + initializer=self.initializer, + use_proj=use_summ_proj) + + return summary + + def get_sequence_output(self): + """ + Returns: + float32 Tensor in shape [len, bsz, d_model]. The last layer hidden + representation of XLNet. + """ + + return self.output + + def get_new_memory(self): + """ + Returns: + list of float32 Tensors in shape [mem_len, bsz, d_model], the new + memory that concatenates the previous memory with the current input + representations. + The length of the list equals n_layer. + """ + return self.new_mems + + def get_embedding_table(self): + """ + Returns: + float32 Tensor in shape [n_token, d_model]. The embedding lookup table. + Used for tying embeddings between input and output layers. + """ + return self.lookup_table + + def get_initializer(self): + """ + Returns: + A tf initializer. Used to initialize variables in layers on top of XLNet. 
+ """ + return self.initializer + diff --git a/baselines/models_pytorch/classifier_pytorch/.gitignore b/baselines/models_pytorch/classifier_pytorch/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/README.md b/baselines/models_pytorch/classifier_pytorch/README.md new file mode 100644 index 0000000..3ceb4b3 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/README.md @@ -0,0 +1,118 @@ +# chineseGLUE_pytorch + +**详细信息见于https://github.com/chineseGLUE/chineseGLUE** + +## 代码目录说明 + +```text +├── chineseGLUEdatasets # 存放数据 +| └── inews    +| └── lcqmc  +| └── ... +├── metrics         # metric计算 +| └── glue_compute_metrics.py    +├── outputs # 模型输出保存 +| └── inews_output +| └── lcqmc_output  +| └── ... +├── prev_trained_model # 预训练模型 +| └── albert_base +| └── bert-wwm +| └── ... +├── processors     # 数据处理 +| └── glue.py +| └── ... +├── tools        # 通用脚本 +| └── progressbar.py +| └── ... +├── transformers   # 模型 +| └── modeling_albert.py +| └── modeling_bert.py +| └── ... +├── convert_albert_original_tf_checkpoint_to_pytorch.py # 模型文件转换 +├── run_classifier.py # 主程序 +├── run_classifier_inews.sh # 任务运行脚本 +``` +### 依赖模块 + +- pytorch=1.1.0 +- boto3=1.9 +- regex +- sacremoses +- sentencepiece + +### 运行 + +1. 若下载对应tf模型权重,则运行转换脚本,比如转换`albert_base_tf`: +```python +python convert_albert_original_tf_checkpoint_to_pytorch.py \ + --tf_checkpoint_path=./prev_trained_model/albert_base_tf \ + --bert_config_file=./prev_trained_model/albert_base_tf/albert_config_base.json \ + --pytorch_dump_path=./prev_trained_model/albert_base/pytorch_model.bin +``` +**注意**: 当转换完模型之后,需要在对应的文件夹内存放`config.json`和`vocab.txt`文件 + +2. 
直接运行对应任务sh脚本,如: + +```shell +sh run_classifier_inews.sh +``` +### 模型列表 + +``` +MODEL_CLASSES = { + ## bert ernie bert_wwm bert_wwwm_ext + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), + # xlnet_base xlnet_mid xlnet_large + 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + # roberta_base roberta_wwm roberta_wwm_ext roberta_wwm_large_ext + 'roberta': (BertConfig, BertForSequenceClassification, BertTokenizer), + # albert_tiny albert_base albert_large albert_xlarge + 'albert': (BertConfig, AlbertForSequenceClassification, BertTokenizer) +} +``` +**注意**: bert ernie bert_wwm bert_wwwm_ext等模型只是权重不一样,而模型本身主体一样,因此参数`model_type=bert`其余同理。 + +## 基线结果 + +**说明**: + +1. 目前结果大体上跟tf差不多,但是有+-0.4%上下波动,可能时由于参数不同等原因造成 + +2. 增加collate_fn,对每一个batch进行动态长度padding + +### Tnews文本分类任务 + +| 模型 | 开发集(Dev) | 测试集(Test) | 训练参数 | +| :------- | :---------: | :---------: | :---------: | +| albert_tiny | 86.89 | 87.02 | epoch=5,length=128,lr=1e-4 | +| albert_base | 88.42 | 88.26 | epoch=5,length=128,lr=1e-4 | +| bert_base | 89.8 | 89.77 | epoch=4,length=128,lr=2e-5 | +| ernie_base | 89.99 | 89.90 | epoch=4,length=128,lr=2e-5 | +| xlnet_base | 89.44 | 89.59 | epoch=4,length=128,lr=2e-5 | +| bert_wwm_ext | 89.83 | 89.80 | epoch=4,length=128,lr=2e-5 | + +### Lcqmc语义相似度匹配 + +| 模型 | 开发集(Dev) | 测试集(Test) | 训练参数 | +| :------- | :---------: | :---------: | :---------: | +| albert_base | 87.8 | 86.6 | epoch=5,length=128,lr=1e-4 | +| bert_base | 89.4 | 86.9 | epoch=4,length=128,lr=2e-5 | +| ernie_base | 89.8 | 87.1 | epoch=4,length=128,lr=2e-5 | +| bert_wwm | 89.0 | 87.2 | epoch=4,length=128,lr=2e-5 | +| bert_wwm_ext | 89.3 | 87.1 | epoch=4,length=128,lr=2e-5 | + +### Inews 互联网情感分析 + +| 模型 | 开发集(Dev) | 测试集(Test) | 训练参数 | +| :------- | :---------: | :---------: | :---------: | +| bert_base | 85.1 | 84.5 | epoch=4,length=512,lr=2e-5 | +| ernie_base | 85.9 | 84.7 | epoch=4,length=512,lr=2e-5 | +| xlnet_base | 85.1 | 84.5 | epoch=4,length=512,lr=2e-5 | +| bert_wwm | 85.7 | 85.1 | epoch=4,length=512,lr=2e-5 | +| bert_wwm_ext | 85.4 | 85.8 | epoch=4,length=512,lr=2e-5 | +| robertta_wwm_ext | 84.5 | 84.9 | epoch=4,length=512,lr=2e-5 | + + + + diff --git a/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/inews/.gitignore b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/inews/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/inews/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/lcqmc/.gitignore b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/lcqmc/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/lcqmc/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/tnews/.gitignore b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/tnews/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/tnews/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/xnli/.gitignore b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/xnli/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/chineseGLUEdatasets/xnli/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/convert_albert_original_tf_checkpoint_to_pytorch.py b/baselines/models_pytorch/classifier_pytorch/convert_albert_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000..866456d --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,72 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
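+#
+# The script below builds an ALBERT model from the TF config, copies every
+# checkpoint variable into the matching PyTorch parameter via
+# load_tf_weights_in_albert, and serializes the resulting state_dict.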
+"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import torch + +from transformers.modeling_albert import BertConfig, AlbertForPreTraining, load_tf_weights_in_albert + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = AlbertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_albert(model, config, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained ALBERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) + +''' +python convert_albert_original_tf_checkpoint_to_pytorch.py \ + --tf_checkpoint_path=/home/lwt/NewDisk/chineseGLUE_pytorch/prev_trained_model/albert_tiny_tf \ + --bert_config_file=/home/lwt/NewDisk/chineseGLUE_pytorch/prev_trained_model/albert_tiny_tf/config.json \ + --pytorch_dump_path=/home/lwt/NewDisk/chineseGLUE_pytorch/prev_trained_model/albert_tiny/pytorch_model.bin +''' \ No newline at end of file diff --git a/baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py b/baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000..7580881 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import torch + +from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_bert(model, config, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path) diff --git a/baselines/models_pytorch/classifier_pytorch/convert_ernie_original_pad_checkpoint_to_pytorch.py b/baselines/models_pytorch/classifier_pytorch/convert_ernie_original_pad_checkpoint_to_pytorch.py new file mode 100644 index 0000000..915735c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/convert_ernie_original_pad_checkpoint_to_pytorch.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python +# encoding: utf-8 +import collections +import os +import sys +import numpy as np +import argparse +import paddle.fluid as fluid +import torch +import json + +if not os.path.exists('ERNIE'): + os.system('git clone https://github.com/PaddlePaddle/ERNIE.git') +sys.path = ['./ERNIE'] + sys.path +try: + from model.ernie_v1 import ErnieConfig, ErnieModel +except: + raise Exception('Place clone ERNIE first') + + +def create_model(args, pyreader_name, ernie_config, is_prediction=False): + pyreader = fluid.layers.py_reader( + capacity=50, + shapes=[[-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], + [-1, args.max_seq_len, 1], [-1, args.max_seq_len, 1], [-1, 1], + [-1, 1], + [3, 1], [3]], + dtypes=['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64', 'int64'], + lod_levels=[0, 0, 0, 0, 0, 0, 0, 0], + name=pyreader_name, + use_double_buffer=True) + + (src_ids, sent_ids, pos_ids, input_mask, labels, qids, + mlm_mask_label, mlm_mask_pos) = fluid.layers.read_file(pyreader) + ernie = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + input_mask=input_mask, + config=ernie_config, + use_fp16=args.use_fp16 + ) + cls_feats = ernie.get_pooled_output() + cls_feats = fluid.layers.dropout( + x=cls_feats, + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + logits = fluid.layers.fc( + input=cls_feats, + size=args.num_labels, + param_attr=fluid.ParamAttr( + name="cls_out_w", + initializer=fluid.initializer.TruncatedNormal(scale=0.02)), + bias_attr=fluid.ParamAttr( + name="cls_out_b", 
initializer=fluid.initializer.Constant(0.))) + + ernie.get_pretraining_output( + mask_label=mlm_mask_label, + mask_pos=mlm_mask_pos, + labels=labels + ) + + if is_prediction: + probs = fluid.layers.softmax(logits) + feed_targets_name = [ + src_ids.name, pos_ids.name, sent_ids.name, input_mask.name + ] + return pyreader, probs, feed_targets_name + + ce_loss, probs = fluid.layers.softmax_with_cross_entropy( + logits=logits, label=labels, return_softmax=True) + loss = fluid.layers.mean(x=ce_loss) + + if args.use_fp16 and args.loss_scaling > 1.0: + loss *= args.loss_scaling + + num_seqs = fluid.layers.create_tensor(dtype='int64') + accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) + + graph_vars = { + "loss": loss, + "probs": probs, + "accuracy": accuracy, + "labels": labels, + "num_seqs": num_seqs, + "qids": qids + } + + for k, v in graph_vars.items(): + v.persistable = True + + return pyreader, graph_vars + + +def if_exist(var): + return os.path.exists(os.path.join(args.init_pretraining_params, var.name)) + + +def build_weight_map(): + weight_map = collections.OrderedDict({ + 'word_embedding': 'bert.embeddings.word_embeddings.weight', + 'pos_embedding': 'bert.embeddings.position_embeddings.weight', + 'sent_embedding': 'bert.embeddings.token_type_embeddings.weight', + 'pre_encoder_layer_norm_scale': 'bert.embeddings.LayerNorm.gamma', + 'pre_encoder_layer_norm_bias': 'bert.embeddings.LayerNorm.beta', + }) + + def add_w_and_b(ernie_pre, pytroch_pre): + weight_map[ernie_pre + ".w_0"] = pytroch_pre + ".weight" + weight_map[ernie_pre + ".b_0"] = pytroch_pre + ".bias" + + def add_one_encoder_layer(layer_number): + # attention + add_w_and_b(f"encoder_layer_{layer_number}_multi_head_att_query_fc", + f"bert.encoder.layer.{layer_number}.attention.self.query") + add_w_and_b(f"encoder_layer_{layer_number}_multi_head_att_key_fc", + f"bert.encoder.layer.{layer_number}.attention.self.key") + add_w_and_b(f"encoder_layer_{layer_number}_multi_head_att_value_fc", + f"bert.encoder.layer.{layer_number}.attention.self.value") + add_w_and_b(f"encoder_layer_{layer_number}_multi_head_att_output_fc", + f"bert.encoder.layer.{layer_number}.attention.output.dense") + weight_map[f"encoder_layer_{layer_number}_post_att_layer_norm_bias"] = \ + f"bert.encoder.layer.{layer_number}.attention.output.LayerNorm.bias" + weight_map[f"encoder_layer_{layer_number}_post_att_layer_norm_scale"] = \ + f"bert.encoder.layer.{layer_number}.attention.output.LayerNorm.weight" + # intermediate + add_w_and_b(f"encoder_layer_{layer_number}_ffn_fc_0", f"bert.encoder.layer.{layer_number}.intermediate.dense") + # output + add_w_and_b(f"encoder_layer_{layer_number}_ffn_fc_1", f"bert.encoder.layer.{layer_number}.output.dense") + weight_map[f"encoder_layer_{layer_number}_post_ffn_layer_norm_bias"] = \ + f"bert.encoder.layer.{layer_number}.output.LayerNorm.bias" + weight_map[f"encoder_layer_{layer_number}_post_ffn_layer_norm_scale"] = \ + f"bert.encoder.layer.{layer_number}.output.LayerNorm.weight" + + for i in range(12): + add_one_encoder_layer(i) + add_w_and_b('pooled_fc', 'bert.pooler.dense') + + weight_map.update({ + 'mask_lm_trans_fc.b_0': 'cls.predictions.transform.dense.bias', + 'mask_lm_trans_fc.w_0': 'cls.predictions.transform.dense.weight', + 'mask_lm_trans_layer_norm_scale': 'cls.predictions.transform.LayerNorm.weight', + 'mask_lm_trans_layer_norm_bias': 'cls.predictions.transform.LayerNorm.bias', + 'mask_lm_out_fc.b_0': 'cls.predictions.bias' + }) + + return weight_map + +def extract_weights(args): + # add ERNIR to 
environment + print('extract weights start'.center(60, '=')) + startup_prog = fluid.Program() + test_prog = fluid.Program() + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + args.max_seq_len = 512 + args.use_fp16 = False + args.num_labels = 2 + args.loss_scaling = 1.0 + ernie_config = ErnieConfig(args.ernie_config_path) + ernie_config.print_config() + with fluid.program_guard(test_prog, startup_prog): + with fluid.unique_name.guard(): + create_model( + args, + pyreader_name='train', + ernie_config=ernie_config) + fluid.io.load_vars(exe, args.init_pretraining_params, main_program=test_prog, predicate=if_exist) + state_dict = collections.OrderedDict() + weight_map = build_weight_map() + for ernie_name, pytorch_name in weight_map.items(): + fluid_tensor = fluid.global_scope().find_var(ernie_name).get_tensor() + fluid_array = np.array(fluid_tensor, dtype=np.float32) + if 'w_0' in ernie_name: + fluid_array = fluid_array.transpose() + state_dict[pytorch_name] = fluid_array + print(f'{ernie_name} -> {pytorch_name} {fluid_array.shape}') + print('extract weights done!'.center(60, '=')) + return state_dict + + +def save_model(state_dict, dump_path): + print('save model start'.center(60, '=')) + if not os.path.exists(dump_path): + os.makedirs(dump_path) + # save model + for key in state_dict: + state_dict[key] = torch.FloatTensor(state_dict[key]) + torch.save(state_dict, os.path.join(dump_path, "pytorch_model.bin")) + print('finish save model') + # save config + ernie_config = ErnieConfig(args.ernie_config_path)._config_dict + # set layer_norm_eps, more detail see: https://github.com/PaddlePaddle/LARK/issues/75 + ernie_config['layer_norm_eps'] = 1e-5 + with open(os.path.join(dump_path, "config.json"), 'wt', encoding='utf-8') as f: + json.dump(ernie_config, f, indent=4) + print('finish save config') + # save vocab.txt + vocab_f = open(os.path.join(dump_path, "vocab.txt"), "wt", encoding='utf-8') + with open("./ERNIE/config/vocab.txt", "rt", encoding='utf-8') as f: + for line in f: + data = line.strip().split("\t") + vocab_f.writelines(data[0] + "\n") + vocab_f.close() + print('finish save vocab') + print('save model done!'.center(60, '=')) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--init_pretraining_params", default='./ERNIE_1.0_max-len-512/params', type=str, help=".") + parser.add_argument("--ernie_config_path", default='./ERNIE_1.0_max-len-512/ernie_config.json', type=str, help=".") + parser.add_argument("--output_dir", default='./ERNIE-converted', type=str, help=".") + args = parser.parse_args() + state_dict = extract_weights(args) + save_model(state_dict, args.output_dir) \ No newline at end of file diff --git a/baselines/models_pytorch/classifier_pytorch/convert_xlnet_original_tf_checkpoint_to_pytorch.py b/baselines/models_pytorch/classifier_pytorch/convert_xlnet_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000..3669d99 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/convert_xlnet_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,104 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BERT checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import argparse +import torch + +from transformers import (CONFIG_NAME, WEIGHTS_NAME, + XLNetConfig, + XLNetLMHeadModel, XLNetForQuestionAnswering, + XLNetForSequenceClassification, + load_tf_weights_in_xlnet) + +GLUE_TASKS_NUM_LABELS = { + "cola": 2, + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "wnli": 2, +} + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None): + # Initialise PyTorch model + config = XLNetConfig.from_json_file(bert_config_file) + + finetuning_task = finetuning_task.lower() if finetuning_task is not None else "" + if finetuning_task in GLUE_TASKS_NUM_LABELS: + print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config))) + config.finetuning_task = finetuning_task + config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task] + model = XLNetForSequenceClassification(config) + elif 'squad' in finetuning_task: + config.finetuning_task = finetuning_task + model = XLNetForQuestionAnswering(config) + else: + model = XLNetLMHeadModel(config) + + # Load weights from tf checkpoint + load_tf_weights_in_xlnet(model, config, tf_checkpoint_path) + + # Save pytorch-model + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME) + print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path))) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path))) + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--xlnet_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained XLNet model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the folder to store the PyTorch model or dataset/vocab.") + parser.add_argument("--finetuning_task", + default = None, + type = str, + help = "Name of a task on which the XLNet TensorFloaw model was fine-tuned") + args = parser.parse_args() + print(args) + + convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.xlnet_config_file, + args.pytorch_dump_folder_path, + args.finetuning_task) diff --git a/baselines/models_pytorch/classifier_pytorch/metrics/__init__.py b/baselines/models_pytorch/classifier_pytorch/metrics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/baselines/models_pytorch/classifier_pytorch/metrics/glue_compute_metrics.py b/baselines/models_pytorch/classifier_pytorch/metrics/glue_compute_metrics.py new file mode 100644 index 0000000..d3eee56 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/metrics/glue_compute_metrics.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import csv +import sys +import logging + +logger = logging.getLogger(__name__) + +try: + from scipy.stats import pearsonr, spearmanr + from sklearn.metrics import matthews_corrcoef, f1_score + _has_sklearn = True +except (AttributeError, ImportError) as e: + logger.warning("To use data.metrics please install scikit-learn. 
See https://scikit-learn.org/stable/index.html") + _has_sklearn = False + + + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "lcqmc": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "tnews": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "xnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "inews": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) diff --git a/baselines/models_pytorch/classifier_pytorch/outputs/inews_output/.gitignore b/baselines/models_pytorch/classifier_pytorch/outputs/inews_output/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/outputs/inews_output/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/outputs/lcqmc_output/.gitignore b/baselines/models_pytorch/classifier_pytorch/outputs/lcqmc_output/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/outputs/lcqmc_output/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore b/baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/outputs/tnews_output/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/outputs/xnli_output/.gitignore b/baselines/models_pytorch/classifier_pytorch/outputs/xnli_output/.gitignore new file mode 100644 index 0000000..894a44c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/outputs/xnli_output/.gitignore @@ -0,0 +1,104 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ diff --git a/baselines/models_pytorch/classifier_pytorch/processors/__init__.py b/baselines/models_pytorch/classifier_pytorch/processors/__init__.py new file mode 100644 index 0000000..969950f --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/processors/__init__.py @@ -0,0 +1,4 @@ +from .utils import InputExample, InputFeatures, DataProcessor +from .glue import (glue_output_modes, glue_processors, glue_tasks_num_labels, + glue_convert_examples_to_features,collate_fn,xlnet_collate_fn) + diff --git a/baselines/models_pytorch/classifier_pytorch/processors/glue.py b/baselines/models_pytorch/classifier_pytorch/processors/glue.py new file mode 100644 index 0000000..4eb0fdf --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/processors/glue.py @@ -0,0 +1,697 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" GLUE processors and helpers """ + +import logging +import os +import torch +from .utils import DataProcessor, InputExample, InputFeatures + +logger = logging.getLogger(__name__) + + +def collate_fn(batch): + """ + batch should be a list of (sequence, target, length) tuples... + Returns a padded tensor of sequences sorted from longest to shortest, + """ + all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch)) + max_len = max(all_lens).item() + all_input_ids = all_input_ids[:, :max_len] + all_attention_mask = all_attention_mask[:, :max_len] + all_token_type_ids = all_token_type_ids[:, :max_len] + return all_input_ids, all_attention_mask, all_token_type_ids, all_labels + +def xlnet_collate_fn(batch): + """ + batch should be a list of (sequence, target, length) tuples... + Returns a padded tensor of sequences sorted from longest to shortest, + """ + all_input_ids, all_attention_mask, all_token_type_ids, all_lens, all_labels = map(torch.stack, zip(*batch)) + max_len = max(all_lens).item() + all_input_ids = all_input_ids[:, -max_len:] + all_attention_mask = all_attention_mask[:, -max_len:] + all_token_type_ids = all_token_type_ids[:, -max_len:] + return all_input_ids, all_attention_mask, all_token_type_ids, all_labels + +def glue_convert_examples_to_features(examples, tokenizer, + max_length=512, + task=None, + label_list=None, + output_mode=None, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): + """ + Loads a data file into a list of ``InputFeatures`` + Args: + examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) + + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. 
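+
+        Example (an illustrative sketch assuming a BERT-style tokenizer and
+        the `lcqmc` task registered in ``glue_processors``):
+
+            features = glue_convert_examples_to_features(
+                examples, tokenizer, max_length=128, task='lcqmc')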
+ + """ + if task is not None: + processor = glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = glue_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + input_len = len(input_ids) + # Zero-pad up to the sequence length. + padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), + max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), + max_length) + + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) + logger.info("input length: %d" % (input_len)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label, + input_len=input_len)) + return features + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + logger.info("LOOKING AT {}".format(os.path.join(data_dir, "train.tsv"))) + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[3] + text_b = line[4] + label = line[0] + examples.append( + 
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliProcessor(DataProcessor):
+    """Processor for the MultiNLI data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")),
+            "dev_matched")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class XnliProcessor(DataProcessor):
+    """Processor for the XNLI data set (Chinese portion, ChineseGLUE version)."""
+
+    def __init__(self):
+        self.language = 'zh'
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        lines = self._read_tsv(
+            os.path.join(data_dir, "train.tsv"))
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "train-%d" % (i)
+            text_a = line[0]
+            text_b = line[1]
+            label = line[2]
+            if label == "contradictory":
+                label = "contradiction"
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        lines = self._read_tsv(os.path.join(data_dir, "dev.tsv"))
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "dev-%d" % (i)
+            language = line[0]
+            if language != self.language:
+                continue
+            text_a = line[6]
+            text_b = line[7]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        lines = self._read_tsv(os.path.join(data_dir, "test.tsv"))
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "test-%d" % (i)
+            language = line[0]
+            if language != self.language:
+                continue
+            text_a = line[6]
+            text_b = line[7]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class InewsProcessor(DataProcessor):
+    """Processor for the INEWS data set (ChineseGLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "train.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "dev.txt")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "test.txt")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        labels = ["0", "1", "2"]
+        return labels
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[2]
+            text_b = line[3]
+            if set_type == "test":
+                label = "0"
+            else:
+                label = line[0]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
+        return examples
+
+
+class MnliMismatchedProcessor(MnliProcessor):
+    """Processor for the MultiNLI Mismatched data set (GLUE version)."""
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev_mismatched.tsv")),
+            "dev_matched")
+
+
+class ColaProcessor(DataProcessor):
+    """Processor for the CoLA data set (GLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class TnewsProcessor(DataProcessor):
+    """Processor for the TNEWS data set (ChineseGLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "toutiao_category_train.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "toutiao_category_dev.txt")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_txt(os.path.join(data_dir, "toutiao_category_test.txt")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        # TNEWS labels are "100".."116", with "105" and "111" unused
+        labels = []
+        for i in range(17):
+            if i == 5 or i == 11:
+                continue
+            labels.append(str(100 + i))
+        return labels
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            guid = "%s-%s" % (set_type, i)
+            text_a = line[3]
+            if set_type == 'test':
+                label = '0'
+            else:
+                label = line[1]
+            examples.append(
+                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
+        return examples
+
+
+class LcqmcProcessor(DataProcessor):
+    """Processor for the LCQMC data set (ChineseGLUE version)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "train.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "dev.txt")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(
+            self._read_tsv(os.path.join(data_dir, "test.txt")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1"]
+
+    def
_create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + guid = "%s-%s" % (set_type, i) + text_a = line[0] + text_b = line[1] + if set_type == 'test': + label = '0' + else: + label = line[2] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class Sst2Processor(DataProcessor): + """Processor for the SST-2 data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = line[0] + label = line[1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + return examples + + +class StsbProcessor(DataProcessor): + """Processor for the STS-B data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return [None] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[7] + text_b = line[8] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QqpProcessor(DataProcessor): + """Processor for the QQP data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + try: + text_a = line[3] + text_b = line[4] + label = line[5] + except IndexError: + continue + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class QnliProcessor(DataProcessor): + """Processor for the QNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), + "dev_matched") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def 
_create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class RteProcessor(DataProcessor): + """Processor for the RTE data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["entailment", "not_entailment"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class WnliProcessor(DataProcessor): + """Processor for the WNLI data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, line[0]) + text_a = line[1] + text_b = line[2] + label = line[-1] + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +glue_tasks_num_labels = { + "mnli": 3, + "mrpc": 2, + "sst-2": 2, + "sts-b": 1, + "qqp": 2, + "qnli": 2, + "rte": 2, + "xnli": 3, + 'tnews': 15, + 'lcqmc': 2, + 'inews': 3, +} + +glue_processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, + "mrpc": MrpcProcessor, + "sst-2": Sst2Processor, + "sts-b": StsbProcessor, + "qqp": QqpProcessor, + "qnli": QnliProcessor, + "rte": RteProcessor, + "wnli": WnliProcessor, + 'tnews': TnewsProcessor, + 'xnli': XnliProcessor, + 'lcqmc': LcqmcProcessor, + 'inews': InewsProcessor, +} + +glue_output_modes = { + "cola": "classification", + "mnli": "classification", + "mnli-mm": "classification", + "mrpc": "classification", + "sst-2": "classification", + "sts-b": "regression", + "qqp": "classification", + "qnli": "classification", + "rte": "classification", + "wnli": "classification", + 'tnews': "classification", + 'xnli': "classification", + 'lcqmc': "classification", + 'inews': "classification", +} diff --git a/baselines/models_pytorch/classifier_pytorch/processors/utils.py b/baselines/models_pytorch/classifier_pytorch/processors/utils.py new file mode 100644 index 0000000..a220bc1 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/processors/utils.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import sys
+import copy
+import json
+
+class InputExample(object):
+    """
+    A single training/test example for simple sequence classification.
+
+    Args:
+        guid: Unique id for the example.
+        text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+        text_b: (Optional) string. The untokenized text of the second sequence.
+            Must only be specified for sequence pair tasks.
+        label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+    """
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class InputFeatures(object):
+    """
+    A single set of features of data.
+
+    Args:
+        input_ids: Indices of input sequence tokens in the vocabulary.
+        attention_mask: Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
+        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
+        label: Label corresponding to the input
+        input_len: Length of the input sequence before padding
+    """
+
+    def __init__(self, input_ids, attention_mask, token_type_ids, label, input_len):
+        self.input_ids = input_ids
+        self.attention_mask = attention_mask
+        self.token_type_ids = token_type_ids
+        self.input_len = input_len
+        self.label = label
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+
+class DataProcessor(object):
+    """Base class for data converters for sequence classification data sets."""
+
+    def get_train_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the train set."""
+        raise NotImplementedError()
+
+    def get_dev_examples(self, data_dir):
+        """Gets a collection of `InputExample`s for the dev set."""
+        raise NotImplementedError()
+
+    def get_labels(self):
+        """Gets the list of labels for this data set."""
+        raise NotImplementedError()
+
+    @classmethod
+    def _read_tsv(cls, input_file, quotechar=None):
+        """Reads a tab separated value file."""
+        with open(input_file, "r", encoding="utf-8-sig") as f:
+            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
+            lines = []
+            for line in reader:
+                lines.append(line)
+            return lines
+
+    @classmethod
+    def _read_txt(cls, input_file):
+        """Reads a text file whose fields are separated by the ``_!_`` token
+        (the TNEWS/INEWS format)."""
+        with open(input_file, "r") as f:
+            reader = f.readlines()
+            lines = []
+            for line in reader:
+                lines.append(line.strip().split("_!_"))
+            return lines
diff --git a/baselines/models_pytorch/classifier_pytorch/run_classifier.py b/baselines/models_pytorch/classifier_pytorch/run_classifier.py
new file mode 100644
index 0000000..c4a4dd9
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/run_classifier.py
@@ -0,0 +1,522 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on chineseGLUE (Bert, XLM, XLNet, RoBERTa).""" + +from __future__ import absolute_import, division, print_function + +import argparse +import glob +import logging +import os +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,TensorDataset +from torch.utils.data.distributed import DistributedSampler + +from transformers import (WEIGHTS_NAME, BertConfig, + BertForSequenceClassification, BertTokenizer, + RobertaConfig, XLNetConfig, + XLNetForSequenceClassification, + XLNetTokenizer, + AlbertForSequenceClassification) + +from transformers import AdamW, WarmupLinearSchedule + +from metrics.glue_compute_metrics import compute_metrics +from processors import glue_output_modes as output_modes + +from processors import glue_processors as processors +from processors import glue_convert_examples_to_features as convert_examples_to_features +from processors import collate_fn,xlnet_collate_fn +from tools.common import seed_everything +from tools.common import init_logger,logger +from tools.progressbar import ProgressBar + + +ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, + RobertaConfig)), ()) +MODEL_CLASSES = { + ## bert ernie bert_wwm bert_wwwm_ext + 'bert': (BertConfig, BertForSequenceClassification, BertTokenizer), + 'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer), + 'roberta': (BertConfig, BertForSequenceClassification, BertTokenizer), + 'albert': (BertConfig, AlbertForSequenceClassification, BertTokenizer) +} + +def train(args, train_dataset, model, tokenizer): + """ Train the model """ + args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) + train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset) + train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn) + + if args.max_steps > 0: + t_total = args.max_steps + args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1 + else: + t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + args.warmup_steps = int(t_total * args.warmup_proportion) + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ['bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, + {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) + scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total) + if args.fp16: + try: + from apex import amp + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Distributed training (should be after apex fp16 initialization) + if args.local_rank != -1: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], + 
output_device=args.local_rank, + find_unused_parameters=True) + + # Train! + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Num Epochs = %d", args.num_train_epochs) + logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) + logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 0 + tr_loss, logging_loss = 0.0, 0.0 + model.zero_grad() + seed_everything(args.seed) # Added here for reproductibility (even between python 2 and 3) + for _ in range(int(args.num_train_epochs)): + pbar = ProgressBar(n_total=len(train_dataloader),desc='Training') + for step, batch in enumerate(train_dataloader): + model.train() + batch = tuple(t.to(args.device) for t in batch) + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] if args.model_type in ['bert','xlnet','albert','roberta'] else None # XLM, DistilBERT don't use segment_ids + outputs = model(**inputs) + loss = outputs[0] # model outputs are always tuple in transformers (see doc) + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + if args.fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm) + else: + loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) + + tr_loss += loss.item() + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() + scheduler.step() # Update learning rate schedule + model.zero_grad() + global_step += 1 + + if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: + # Log metrics + if args.local_rank == -1 : # Only evaluate when single GPU otherwise metrics may not average well + results = evaluate(args, model, tokenizer) + + if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: + # Save model checkpoint + output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) + if not os.path.exists(output_dir): + os.makedirs(output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(output_dir) + torch.save(args, os.path.join(output_dir, 'training_args.bin')) + logger.info("Saving model checkpoint to %s", output_dir) + tokenizer.save_vocabulary(vocab_path=output_dir) + pbar(step,{'loss':loss.item()}) + print(" ") + if 'cuda' in str(args.device): + torch.cuda.empty_cache() + return global_step, tr_loss / global_step + +def evaluate(args, model, tokenizer, prefix=""): + # Loop to handle MNLI double evaluation (matched, mis-matched) + eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) + eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) + + results = {} + for eval_task, eval_output_dir in zip(eval_task_names, 
eval_outputs_dirs): + eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, data_type='dev') + if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: + os.makedirs(eval_output_dir) + + args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset) + eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size,collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn) + + # Eval! + logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(" Num examples = %d", len(eval_dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + eval_loss = 0.0 + nb_eval_steps = 0 + preds = None + out_label_ids = None + pbar = ProgressBar(n_total=len(eval_dataloader),desc = "Evaluating") + for step,batch in enumerate(eval_dataloader): + model.eval() + batch = tuple(t.to(args.device) for t in batch) + with torch.no_grad(): + inputs = {'input_ids': batch[0], + 'attention_mask': batch[1], + 'labels': batch[3]} + if args.model_type != 'distilbert': + inputs['token_type_ids'] = batch[2] if args.model_type in ['bert','xlnet','albert','roberta'] else None # XLM, DistilBERT and RoBERTa don't use segment_ids + outputs = model(**inputs) + tmp_eval_loss, logits = outputs[:2] + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + out_label_ids = inputs['labels'].detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) + pbar(step) + print(' ') + if 'cuda' in str(args.device): + torch.cuda.empty_cache() + eval_loss = eval_loss / nb_eval_steps + if args.output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif args.output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(eval_task, preds, out_label_ids) + results.update(result) + logger.info("***** Eval results {} *****".format(prefix)) + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + return results + +def predict(args, model, tokenizer, prefix=""): + # Loop to handle MNLI double evaluation (matched, mis-matched) + pred_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) + pred_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) + + results = {} + for pred_task, pred_output_dir in zip(pred_task_names, pred_outputs_dirs): + pred_dataset = load_and_cache_examples(args, pred_task, tokenizer, data_type='test') + if not os.path.exists(pred_output_dir) and args.local_rank in [-1, 0]: + os.makedirs(pred_output_dir) + + args.pred_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly + pred_sampler = SequentialSampler(pred_dataset) if args.local_rank == -1 else DistributedSampler(pred_dataset) + pred_dataloader = DataLoader(pred_dataset, sampler=pred_sampler, batch_size=args.pred_batch_size,collate_fn=xlnet_collate_fn if args.model_type in ['xlnet'] else collate_fn) + + logger.info("***** Running prediction {} *****".format(prefix)) + logger.info(" Num examples = %d", len(pred_dataset)) + logger.info(" Batch size = %d", args.pred_batch_size) + nb_pred_steps = 0 + preds = None + 
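        # The loop below runs the model in eval mode batch by batch, stacks the
+        # raw logits, then reduces them to label ids (or squeezes them for
+        # regression) before writing test_prediction.txt.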
+        pbar = ProgressBar(n_total=len(pred_dataloader), desc="Predicting")
+        for step, batch in enumerate(pred_dataloader):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+            with torch.no_grad():
+                inputs = {'input_ids': batch[0],
+                          'attention_mask': batch[1],
+                          'labels': batch[3]}
+                if args.model_type != 'distilbert':
+                    inputs['token_type_ids'] = batch[2] if ('bert' in args.model_type or 'xlnet' in args.model_type) else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
+                outputs = model(**inputs)
+                _, logits = outputs[:2]
+            nb_pred_steps += 1
+            if preds is None:
+                preds = logits.detach().cpu().numpy()
+            else:
+                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            pbar(step)
+        print(' ')
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif args.output_mode == "regression":
+            preds = np.squeeze(preds)
+        output_pred_file = os.path.join(pred_output_dir, prefix, "test_prediction.txt")
+        with open(output_pred_file, "w") as writer:
+            for pred in preds:
+                writer.write(str(pred) + '\n')
+    return results
+
+def load_and_cache_examples(args, task, tokenizer, data_type='train'):
+    if args.local_rank not in [-1, 0] and data_type == 'train':
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        data_type,
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        if task in ['mnli', 'mnli-mm'] and 'roberta' in args.model_type:
+            # HACK(label indices are swapped in RoBERTa pretrained model)
+            label_list[1], label_list[2] = label_list[2], label_list[1]
+
+        if data_type == 'train':
+            examples = processor.get_train_examples(args.data_dir)
+        elif data_type == 'dev':
+            examples = processor.get_dev_examples(args.data_dir)
+        else:
+            examples = processor.get_test_examples(args.data_dir)
+
+        features = convert_examples_to_features(examples,
+                                                tokenizer,
+                                                label_list=label_list,
+                                                max_length=args.max_seq_length,
+                                                output_mode=output_mode,
+                                                pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+                                                )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and data_type == 'train':
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+    all_lens = torch.tensor([f.input_len for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_labels = torch.tensor([f.label for f
in features], dtype=torch.long) + elif output_mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_lens,all_labels) + return dataset + + +def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--data_dir", default=None, type=str, required=True, + help="The input data dir. Should contain the .tsv files (or other data files) for the task.") + parser.add_argument("--model_type", default=None, type=str, required=True, + help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument("--model_name_or_path", default=None, type=str, required=True, + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) + parser.add_argument("--task_name", default=None, type=str, required=True, + help="The name of the task to train selected in the list: " + ", ".join(processors.keys())) + parser.add_argument("--output_dir", default=None, type=str, required=True, + help="The output directory where the model predictions and checkpoints will be written.") + + ## Other parameters + parser.add_argument("--config_name", default="", type=str, + help="Pretrained config name or path if not the same as model_name") + parser.add_argument("--tokenizer_name", default="", type=str, + help="Pretrained tokenizer name or path if not the same as model_name") + parser.add_argument("--cache_dir", default="", type=str, + help="Where do you want to store the pre-trained models downloaded from s3") + parser.add_argument("--max_seq_length", default=128, type=int, + help="The maximum total input sequence length after tokenization. Sequences longer " + "than this will be truncated, sequences shorter will be padded.") + parser.add_argument("--do_train", action='store_true', + help="Whether to run training.") + parser.add_argument("--do_eval", action='store_true', + help="Whether to run eval on the dev set.") + parser.add_argument("--do_predict", action='store_true', + help="Whether to run the model in inference mode on the test set.") + parser.add_argument("--do_lower_case", action='store_true', + help="Set this flag if you are using an uncased model.") + + parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for training.") + parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int, + help="Batch size per GPU/CPU for evaluation.") + parser.add_argument('--gradient_accumulation_steps', type=int, default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.") + parser.add_argument("--learning_rate", default=5e-5, type=float, + help="The initial learning rate for Adam.") + parser.add_argument("--weight_decay", default=0.01, type=float, + help="Weight deay if we apply some.") + parser.add_argument("--adam_epsilon", default=1e-8, type=float, + help="Epsilon for Adam optimizer.") + parser.add_argument("--max_grad_norm", default=1.0, type=float, + help="Max gradient norm.") + parser.add_argument("--num_train_epochs", default=3.0, type=float, + help="Total number of training epochs to perform.") + parser.add_argument("--max_steps", default=-1, type=int, + help="If > 0: set total number of training steps to perform. 
Override num_train_epochs.") + parser.add_argument("--warmup_proportion", default=0.1, type=float, + help="Proportion of training to perform linear learning rate warmup for,E.g., 0.1 = 10% of training.") + + parser.add_argument('--logging_steps', type=int, default=10, + help="Log every X updates steps.") + parser.add_argument('--save_steps', type=int, default=1000, + help="Save checkpoint every X updates steps.") + parser.add_argument("--eval_all_checkpoints", action='store_true', + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number") + parser.add_argument("--no_cuda", action='store_true', + help="Avoid using CUDA when available") + parser.add_argument('--overwrite_output_dir', action='store_true', + help="Overwrite the content of the output directory") + parser.add_argument('--overwrite_cache', action='store_true', + help="Overwrite the cached training and evaluation sets") + parser.add_argument('--seed', type=int, default=42, + help="random seed for initialization") + + parser.add_argument('--fp16', action='store_true', + help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit") + parser.add_argument('--fp16_opt_level', type=str, default='O1', + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html") + parser.add_argument("--local_rank", type=int, default=-1, + help="For distributed training: local_rank") + parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.") + parser.add_argument('--server_port', type=str, default='', help="For distant debugging.") + args = parser.parse_args() + + args.output_dir = args.output_dir +'{}'.format(args.model_type) + if not os.path.exists(args.output_dir): + os.mkdir(args.output_dir) + init_logger(log_file=args.output_dir+'/{}-{}.log'.format(args.model_type,args.task_name)) + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty. 
Use --overwrite_output_dir to overcome.".format(args.output_dir)) + + # Setup distant debugging if needed + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + print("Waiting for debugger attach") + ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) + ptvsd.wait_for_attach() + + # Setup CUDA, GPU & distributed training + if args.local_rank == -1 or args.no_cuda: + device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.n_gpu = torch.cuda.device_count() + else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + torch.distributed.init_process_group(backend='nccl') + args.n_gpu = 1 + args.device = device + + # Setup logging + logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", + args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) + + # Set seed + seed_everything(args.seed) + # Prepare GLUE task + args.task_name = args.task_name.lower() + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + args.output_mode = output_modes[args.task_name] + label_list = processor.get_labels() + num_labels = len(label_list) + + # Load pretrained model and tokenizer + if args.local_rank not in [-1, 0]: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + + args.model_type = args.model_type.lower() + config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name) + tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case) + model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config) + + if args.local_rank == 0: + torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab + + model.to(args.device) + + logger.info("Training/evaluation parameters %s", args) + + # Training + if args.do_train: + train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, data_type='train') + global_step, tr_loss = train(args, train_dataset, model, tokenizer) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) + + + # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + # Create output directory if needed + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + os.makedirs(args.output_dir) + + logger.info("Saving model checkpoint to %s", args.output_dir) + # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
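+        # (`save_pretrained()` writes pytorch_model.bin plus config.json, and the
+        # tokenizer's vocabulary files, into args.output_dir.)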
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model.to(args.device)
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
+            results.update(result)
+        output_eval_file = os.path.join(args.output_dir, "checkpoint_eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            for key in sorted(results.keys()):
+                writer.write("%s = %s\n" % (key, str(results[key])))
+
+if __name__ == "__main__":
+    main()
diff --git a/baselines/models_pytorch/classifier_pytorch/run_classifier_inews.sh b/baselines/models_pytorch/classifier_pytorch/run_classifier_inews.sh
new file mode 100644
index 0000000..416c716
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/run_classifier_inews.sh
@@ -0,0 +1,25 @@
+CURRENT_DIR=`pwd`
+export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/roberta_wwm_ext
+export GLUE_DIR=$CURRENT_DIR/chineseGLUEdatasets
+export OUTPUT_DIR=$CURRENT_DIR/outputs
+TASK_NAME="inews"
+python run_classifier.py \
+  --model_type=roberta \
+  --model_name_or_path=$BERT_BASE_DIR \
+  --task_name=$TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir=$GLUE_DIR/${TASK_NAME}/ \
+  --max_seq_length=512 \
+  --per_gpu_train_batch_size=8 \
+  --per_gpu_eval_batch_size=8 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=4.0 \
+  --logging_steps=670 \
+  --save_steps=670 \
+  --output_dir=$OUTPUT_DIR/${TASK_NAME}_output/ \
+  --overwrite_output_dir
+
+# save a checkpoint once per epoch
+# evaluate once per epoch
\ No newline at end of file
diff --git a/baselines/models_pytorch/classifier_pytorch/run_classifier_lcqmc.sh b/baselines/models_pytorch/classifier_pytorch/run_classifier_lcqmc.sh
new file mode 100644
index 0000000..a2f3e4f
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/run_classifier_lcqmc.sh
@@ -0,0 +1,25 @@
+CURRENT_DIR=`pwd`
+export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/ernie-base
+export GLUE_DIR=$CURRENT_DIR/chineseGLUEdatasets
+export OUTPUT_DIR=$CURRENT_DIR/outputs
+TASK_NAME="lcqmc"
+python run_classifier.py \
+  --model_type=bert \
+  --model_name_or_path=$BERT_BASE_DIR \
+  --task_name=$TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir=$GLUE_DIR/${TASK_NAME}/ \
+  --max_seq_length=128 \
+  --per_gpu_train_batch_size=16 \
+  --per_gpu_eval_batch_size=16 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=4.0 \
+  --logging_steps=14923 \
+  --save_steps=14923 \
+  --output_dir=$OUTPUT_DIR/${TASK_NAME}_output/ \
+  --overwrite_output_dir
+
+# save a checkpoint once per epoch
+# evaluate once per epoch
\ No newline at end of file
diff --git a/baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh b/baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh
new file mode 100644
index 0000000..546b8a5
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/run_classifier_tnews.sh
@@ -0,0 +1,25 @@
+CURRENT_DIR=`pwd`
+export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/ernie-base
+export GLUE_DIR=$CURRENT_DIR/chineseGLUEdatasets
+export OUTPUT_DIR=$CURRENT_DIR/outputs
+TASK_NAME="tnews"
+python run_classifier.py \
+  --model_type=bert \
+  --model_name_or_path=$BERT_BASE_DIR \
+  --task_name=$TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir=$GLUE_DIR/${TASK_NAME}/ \
+  --max_seq_length=128 \
+  --per_gpu_train_batch_size=32 \
+  --per_gpu_eval_batch_size=32 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=4.0 \
+  --logging_steps=8372 \
+  --save_steps=8372 \
+  --output_dir=$OUTPUT_DIR/${TASK_NAME}_output/ \
+  --overwrite_output_dir
+
+# save a checkpoint once per epoch
+# evaluate once per epoch
\ No newline at end of file
diff --git a/baselines/models_pytorch/classifier_pytorch/run_classifier_xnli.sh b/baselines/models_pytorch/classifier_pytorch/run_classifier_xnli.sh
new file mode 100644
index 0000000..99ef98d
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/run_classifier_xnli.sh
@@ -0,0 +1,22 @@
+CURRENT_DIR=`pwd`
+export BERT_BASE_DIR=$CURRENT_DIR/prev_trained_model/bert-base
+export GLUE_DIR=$CURRENT_DIR/chineseGLUEdatasets
+export OUTPUT_DIR=$CURRENT_DIR/outputs
+TASK_NAME="xnli"
+python run_classifier.py \
+  --model_type=bert \
+  --model_name_or_path=$BERT_BASE_DIR \
+  --task_name=$TASK_NAME \
+  --do_train \
+  --do_eval \
+  --do_lower_case \
+  --data_dir=$GLUE_DIR/${TASK_NAME}/ \
+  --max_seq_length=512 \
+  --per_gpu_train_batch_size=8 \
+  --per_gpu_eval_batch_size=8 \
+  --learning_rate=2e-5 \
+  --num_train_epochs=4.0 \
+  --logging_steps=670 \
+  --save_steps=670 \
+  --output_dir=$OUTPUT_DIR/${TASK_NAME}_output/ \
+  --overwrite_output_dir
diff --git a/baselines/models_pytorch/classifier_pytorch/tools/common.py b/baselines/models_pytorch/classifier_pytorch/tools/common.py
new file mode 100644
index 0000000..40b657d
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/tools/common.py
@@ -0,0 +1,353 @@
+import os
+import random
+import torch
+import numpy as np
+import json
+import pickle
+import torch.nn as nn
+from collections import OrderedDict
+from pathlib import Path
+import logging
+
+logger = logging.getLogger()
+def print_config(config):
+    info = "Running with the following configs:\n"
+    for k, v in config.items():
+        info += f"\t{k} : {str(v)}\n"
+    print("\n" + info + "\n")
+    return
+
+def init_logger(log_file=None, log_file_level=logging.NOTSET):
+    '''
+    Example:
+        >>> init_logger(log_file)
+        >>> logger.info("abc")
+    '''
+    if isinstance(log_file, Path):
+        log_file = str(log_file)
+    log_format = logging.Formatter(fmt='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
+                                   datefmt='%m/%d/%Y %H:%M:%S')
+
+    logger = logging.getLogger()
+    logger.setLevel(logging.INFO)
+    console_handler = logging.StreamHandler()
+    console_handler.setFormatter(log_format)
+    logger.handlers = [console_handler]
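+    # assigning (rather than appending) drops any pre-existing handlers, so
+    # repeated init_logger() calls do not duplicate console output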
+    if log_file and log_file != '':
+        file_handler = logging.FileHandler(log_file)
+        file_handler.setLevel(log_file_level)
+        # file_handler.setFormatter(log_format)
+        logger.addHandler(file_handler)
+    return logger
+
+def seed_everything(seed=1029):
+    '''
+    Seed the whole environment (python, numpy, torch and CUDA) for reproducibility.
+    :param seed: the random seed to use everywhere
+    :return:
+    '''
+    random.seed(seed)
+    os.environ['PYTHONHASHSEED'] = str(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.cuda.manual_seed_all(seed)
+    # some cudnn methods can be random even after fixing the seed
+    # unless you tell it to be deterministic
+    torch.backends.cudnn.deterministic = True
+
+
+def prepare_device(n_gpu_use):
+    """
+    Set up the GPU device if available; otherwise fall back to CPU.
+    ``n_gpu_use`` is a comma-separated device string such as "0,1";
+    the first listed device acts as the controller for DataParallel.
+    """
+    if not n_gpu_use:
+        device_type = 'cpu'
+        n_gpu_use = []
+    else:
+        n_gpu_use = n_gpu_use.split(",")
+        device_type = f"cuda:{n_gpu_use[0]}"
+    n_gpu = torch.cuda.device_count()
+    if len(n_gpu_use) > 0 and n_gpu == 0:
+        logger.warning("Warning: There\'s no GPU available on this machine, training will be performed on CPU.")
+        device_type = 'cpu'
+    if len(n_gpu_use) > n_gpu:
+        msg = f"Warning: The number of GPU\'s configured to use is {n_gpu_use}, but only {n_gpu} are available on this machine."
+        logger.warning(msg)
+        n_gpu_use = range(n_gpu)
+    device = torch.device(device_type)
+    list_ids = n_gpu_use
+    return device, list_ids
+
+
+def model_device(n_gpu, model):
+    '''
+    Move the model to CPU or GPU; supports single-machine multi-GPU
+    via ``torch.nn.DataParallel``.
+    :param n_gpu: device string, e.g. "0" or "0,1"
+    :param model:
+    :return:
+    '''
+    device, device_ids = prepare_device(n_gpu)
+    if len(device_ids) > 1:
+        logger.info(f"current {len(device_ids)} GPUs")
+        model = torch.nn.DataParallel(model, device_ids=device_ids)
+    if len(device_ids) == 1:
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(device_ids[0])
+    model = model.to(device)
+    return model, device
+
+
+def restore_checkpoint(resume_path, model=None):
+    '''
+    Load a training checkpoint from disk.
+    :param resume_path: path of the checkpoint file
+    :param model:
+    :return: [model, best, start_epoch]
+    Note: this does not work for loading BERT-style pretrained weights;
+    use the module's own ``from_pretrained(state_dict=...)`` for that.
+    '''
+    if isinstance(resume_path, Path):
+        resume_path = str(resume_path)
+    checkpoint = torch.load(resume_path)
+    best = checkpoint['best']
+    start_epoch = checkpoint['epoch'] + 1
+    states = checkpoint['state_dict']
+    if isinstance(model, nn.DataParallel):
+        model.module.load_state_dict(states)
+    else:
+        model.load_state_dict(states)
+    return [model, best, start_epoch]
+
+
+def save_pickle(data, file_path):
+    '''
+    Save an object as a pickle file.
+    :param data:
+    :param file_path:
+    :return:
+    '''
+    if isinstance(file_path, Path):
+        file_path = str(file_path)
+    with open(file_path, 'wb') as f:
+        pickle.dump(data, f)
+
+
+def load_pickle(input_file):
+    '''
+    Read a pickle file.
+    :param input_file:
+    :return:
+    '''
+    with open(str(input_file), 'rb') as f:
+        data = pickle.load(f)
+    return data
+
+
+def save_json(data, file_path):
+    '''
+    Save an object as a JSON file.
+    :param data:
+    :param file_path:
+    :return:
+    '''
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
+    # if isinstance(data,dict):
+    #     data = json.dumps(data)
+    with open(str(file_path), 'w') as f:
+        json.dump(data, f)
+
+
+def load_json(file_path):
+    '''
+    Load a JSON file.
+    :param file_path:
+    :return:
+    '''
+    if not isinstance(file_path, Path):
+        file_path = Path(file_path)
+    with open(str(file_path), 'r') as f:
+        data = json.load(f)
+    return data
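+
+# A minimal usage sketch for the helpers above (illustrative only; ``MyModel``
+# and the device string are assumptions, not part of this module):
+#
+#   seed_everything(42)
+#   model, device = model_device(n_gpu="0,1", model=MyModel())
+#   save_json({"seed": 42}, "run_config.json")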
+def save_model(model, model_path):
+    """
+    Save a model's ``state_dict`` with every tensor moved to the CPU,
+    so the saved file carries no GPU/device information.
+    :param model:
+    :param model_path:
+    :return:
+    """
+    if isinstance(model_path, Path):
+        model_path = str(model_path)
+    if isinstance(model, nn.DataParallel):
+        model = model.module
+    state_dict = model.state_dict()
+    for key in state_dict:
+        state_dict[key] = state_dict[key].cpu()
+    torch.save(state_dict, model_path)
+
+def load_model(model, model_path):
+    '''
+    Load model weights from disk into ``model``.
+    :param model:
+    :param model_path:
+    :return:
+    '''
+    if isinstance(model_path, Path):
+        model_path = str(model_path)
+    logging.info(f"loading model from {str(model_path)} .")
+    states = torch.load(model_path)
+    state = states['state_dict']
+    if isinstance(model, nn.DataParallel):
+        model.module.load_state_dict(state)
+    else:
+        model.load_state_dict(state)
+    return model
+
+
+class AverageMeter(object):
+    '''
+    computes and stores the average and current value
+    Example:
+        >>> loss = AverageMeter()
+        >>> for step, batch in enumerate(train_data):
+        >>>     pred = self.model(batch)
+        >>>     raw_loss = self.metrics(pred, target)
+        >>>     loss.update(raw_loss.item(), n=1)
+        >>> cur_loss = loss.avg
+    '''
+
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        self.val = 0
+        self.avg = 0
+        self.sum = 0
+        self.count = 0
+
+    def update(self, val, n=1):
+        self.val = val
+        self.sum += val * n
+        self.count += n
+        self.avg = self.sum / self.count
+
+
+def summary(model, *inputs, batch_size=-1, show_input=True):
+    '''
+    Print a layer-by-layer summary of the model structure.
+    :param model:
+    :param inputs:
+    :param batch_size:
+    :param show_input:
+    :return:
+    Example:
+        >>> print("model summary info: ")
+        >>> for step, batch in enumerate(train_data):
+        >>>     summary(self.model, *batch, show_input=True)
+        >>>     break
+    '''
+
+    def register_hook(module):
+        def hook(module, input, output=None):
+            class_name = str(module.__class__).split(".")[-1].split("'")[0]
+            module_idx = len(summary)
+
+            m_key = f"{class_name}-{module_idx + 1}"
+            summary[m_key] = OrderedDict()
+            summary[m_key]["input_shape"] = list(input[0].size())
+            summary[m_key]["input_shape"][0] = batch_size
+
+            if show_input is False and output is not None:
+                if isinstance(output, (list, tuple)):
+                    for out in output:
+                        if isinstance(out, torch.Tensor):
+                            summary[m_key]["output_shape"] = [
+                                [-1] + list(out.size())[1:]
+                            ][0]
+                        else:
+                            summary[m_key]["output_shape"] = [
+                                [-1] + list(out[0].size())[1:]
+                            ][0]
+                else:
+                    summary[m_key]["output_shape"] = list(output.size())
+                    summary[m_key]["output_shape"][0] = batch_size
+
+            params = 0
+            if hasattr(module, "weight") and hasattr(module.weight, "size"):
+                params += torch.prod(torch.LongTensor(list(module.weight.size())))
+                summary[m_key]["trainable"] = module.weight.requires_grad
+            if hasattr(module, "bias") and hasattr(module.bias, "size"):
+                params += torch.prod(torch.LongTensor(list(module.bias.size())))
+            summary[m_key]["nb_params"] = params
+
+        if (not isinstance(module, nn.Sequential) and not isinstance(module, nn.ModuleList) and not (module == model)):
+            if show_input is True:
+                hooks.append(module.register_forward_pre_hook(hook))
+            else:
+                hooks.append(module.register_forward_hook(hook))
+
+    # create properties
+    summary = OrderedDict()
+    hooks = []
+
+    # register hook
+    model.apply(register_hook)
+    model(*inputs)
+
+    # remove these hooks
+    for h in hooks:
+        h.remove()
+
+    print("-----------------------------------------------------------------------")
+    if show_input is True:
+        line_new = f"{'Layer (type)':>25} {'Input Shape':>25} {'Param #':>15}"
+    else:
+        line_new =
f"{'Layer (type)':>25} {'Output Shape':>25} {'Param #':>15}" + print(line_new) + print("=======================================================================") + + total_params = 0 + total_output = 0 + trainable_params = 0 + for layer in summary: + # input_shape, output_shape, trainable, nb_params + if show_input is True: + line_new = "{:>25} {:>25} {:>15}".format( + layer, + str(summary[layer]["input_shape"]), + "{0:,}".format(summary[layer]["nb_params"]), + ) + else: + line_new = "{:>25} {:>25} {:>15}".format( + layer, + str(summary[layer]["output_shape"]), + "{0:,}".format(summary[layer]["nb_params"]), + ) + + total_params += summary[layer]["nb_params"] + if show_input is True: + total_output += np.prod(summary[layer]["input_shape"]) + else: + total_output += np.prod(summary[layer]["output_shape"]) + if "trainable" in summary[layer]: + if summary[layer]["trainable"] == True: + trainable_params += summary[layer]["nb_params"] + + print(line_new) + + print("=======================================================================") + print(f"Total params: {total_params:0,}") + print(f"Trainable params: {trainable_params:0,}") + print(f"Non-trainable params: {(total_params - trainable_params):0,}") + print("-----------------------------------------------------------------------") diff --git a/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py b/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py new file mode 100644 index 0000000..929783c --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py @@ -0,0 +1,59 @@ +import time + +class ProgressBar(object): + ''' + custom progress bar + Example: + >>> pbar = ProgressBar(n_total=30,desc='training') + >>> step = 2 + >>> pbar(step=step) + ''' + def __init__(self, n_total,width=30,desc = 'Training'): + self.width = width + self.n_total = n_total + self.start_time = time.time() + self.desc = desc + + def __call__(self, step, info={}): + now = time.time() + current = step + 1 + recv_per = current / self.n_total + bar = f'[{self.desc}] {current}/{self.n_total} [' + if recv_per >= 1: + recv_per = 1 + prog_width = int(self.width * recv_per) + if prog_width > 0: + bar += '=' * (prog_width - 1) + if current< self.n_total: + bar += ">" + else: + bar += '=' + bar += '.' 
diff --git a/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py b/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py
new file mode 100644
index 0000000..929783c
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/tools/progressbar.py
@@ -0,0 +1,59 @@
+import time
+
+class ProgressBar(object):
+    '''
+    Custom progress bar.
+    Example:
+        >>> pbar = ProgressBar(n_total=30, desc='training')
+        >>> step = 2
+        >>> pbar(step=step)
+    '''
+    def __init__(self, n_total, width=30, desc='Training'):
+        self.width = width
+        self.n_total = n_total
+        self.start_time = time.time()
+        self.desc = desc
+
+    def __call__(self, step, info=None):
+        # avoid a mutable default argument
+        info = info if info is not None else {}
+        now = time.time()
+        current = step + 1
+        recv_per = current / self.n_total
+        bar = f'[{self.desc}] {current}/{self.n_total} ['
+        if recv_per >= 1:
+            recv_per = 1
+        prog_width = int(self.width * recv_per)
+        if prog_width > 0:
+            bar += '=' * (prog_width - 1)
+            if current < self.n_total:
+                bar += ">"
+            else:
+                bar += '='
+        bar += '.' * (self.width - prog_width)
+        bar += ']'
+        show_bar = f"\r{bar}"
+        time_per_unit = (now - self.start_time) / current
+        if current < self.n_total:
+            eta = time_per_unit * (self.n_total - current)
+            if eta > 3600:
+                eta_format = ('%d:%02d:%02d' %
+                              (eta // 3600, (eta % 3600) // 60, eta % 60))
+            elif eta > 60:
+                eta_format = '%d:%02d' % (eta // 60, eta % 60)
+            else:
+                eta_format = '%ds' % eta
+            time_info = f' - ETA: {eta_format}'
+        else:
+            if time_per_unit >= 1:
+                time_info = f' {time_per_unit:.1f}s/step'
+            elif time_per_unit >= 1e-3:
+                time_info = f' {time_per_unit * 1e3:.1f}ms/step'
+            else:
+                time_info = f' {time_per_unit * 1e6:.1f}us/step'
+
+        show_bar += time_info
+        if len(info) != 0:
+            show_info = f'{show_bar} ' + \
+                "-".join([f' {key}: {value:.4f} ' for key, value in info.items()])
+            print(show_info, end='')
+        else:
+            print(show_bar, end='')
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/__init__.py b/baselines/models_pytorch/classifier_pytorch/transformers/__init__.py
new file mode 100644
index 0000000..22f3a02
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/__init__.py
@@ -0,0 +1,96 @@
+__version__ = "2.1.1"
+
+# Work around to update TensorFlow's absl.logging threshold which alters the
+# default Python logging output behavior when present.
+# see: https://github.com/abseil/abseil-py/issues/99
+# and: https://github.com/tensorflow/tensorflow/issues/26691#issuecomment-500369493
+try:
+    import absl.logging
+    absl.logging.set_verbosity('info')
+    absl.logging.set_stderrthreshold('info')
+    absl.logging._warn_preinit_stderr = False
+except:
+    pass
+
+import logging
+
+logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
+
+# Files and general utilities
+from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
+                         cached_path, add_start_docstrings, add_end_docstrings,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         is_tf_available, is_torch_available)
+
+# Tokenizers
+from .tokenization_utils import (PreTrainedTokenizer)
+from .tokenization_auto import AutoTokenizer
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
+from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_ctrl import CTRLTokenizer
+from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
+from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer
+
+# Configurations
+from .configuration_utils import PretrainedConfig
+from .configuration_auto import AutoConfig
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+# Modeling
+if is_torch_available():
+    from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
+    from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
+                                AutoModelWithLMHead)
+
+    from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
+                                BertForMaskedLM, BertForNextSentencePrediction,
+                                BertForSequenceClassification, BertForMultipleChoice,
+                                BertForTokenClassification, BertForQuestionAnswering,
+                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
+                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
+                                      load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
+                                GPT2LMHeadModel, GPT2DoubleHeadsModel,
+                                load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_ctrl import (CTRLPreTrainedModel, CTRLModel,
+                                CTRLLMHeadModel,
+                                CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+                                 XLNetForSequenceClassification, XLNetForMultipleChoice,
+                                 XLNetForQuestionAnsweringSimple, XLNetForQuestionAnswering,
+                                 load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_xlm import (XLMPreTrainedModel, XLMModel,
+                               XLMWithLMHeadModel, XLMForSequenceClassification,
+                               XLMForQuestionAnswering, XLMForQuestionAnsweringSimple,
+                               XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
+                                   RobertaForSequenceClassification, RobertaForMultipleChoice,
+                                   ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
+                                      DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
+                                      DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_albert import AlbertForSequenceClassification
+
+    # Optimization
+    from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
+                               WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+
+if not is_tf_available() and not is_torch_available():
+    logger.warning("Neither PyTorch nor TensorFlow >= 2.0 has been found. "
+                   "Models won't be available and only tokenizers, configuration "
+                   "and file/data utilities can be used.")
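+
+# Quick usage sketch of the package surface (illustrative only; the
+# 'bert-base-chinese' shortcut name is taken from the archive maps below
+# and stands in for any supported model):
+#
+#     from transformers import AutoConfig, AutoTokenizer, AutoModel
+#     config = AutoConfig.from_pretrained('bert-base-chinese')
+#     tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
+#     model = AutoModel.from_pretrained('bert-base-chinese')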
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/__main__.py b/baselines/models_pytorch/classifier_pytorch/transformers/__main__.py
new file mode 100644
index 0000000..f99e365
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/__main__.py
@@ -0,0 +1,129 @@
+# coding: utf8
+def main():
+    import sys
+    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
+        print(
+            "This command line utility lets you convert original (author released) model checkpoints to PyTorch.\n"
+            "It should be used as one of: \n"
+            ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+            ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+            ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+            ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+            ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+            ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+    else:
+        if sys.argv[1] == "bert":
+            try:
+                # relative import, consistent with the other converters below
+                from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ImportError:
+                print("transformers can only be used from the commandline to convert TensorFlow models to PyTorch. "
+                      "In that case, it requires TensorFlow to be installed. Please see "
+                      "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) != 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+            else:
+                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
+                TF_CONFIG = sys.argv.pop()
+                TF_CHECKPOINT = sys.argv.pop()
+                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "gpt":
+            from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+            else:
+                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    OPENAI_GPT_CONFIG = sys.argv[4]
+                else:
+                    OPENAI_GPT_CONFIG = ""
+                convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
+                                                     OPENAI_GPT_CONFIG,
+                                                     PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "transfo_xl":
+            try:
+                from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+            except ImportError:
+                print("transformers can only be used from the commandline to convert TensorFlow models to PyTorch. "
+                      "In that case, it requires TensorFlow to be installed. Please see "
+                      "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+            else:
+                if 'ckpt' in sys.argv[2].lower():
+                    TF_CHECKPOINT = sys.argv[2]
+                    TF_DATASET_FILE = ""
+                else:
+                    TF_DATASET_FILE = sys.argv[2]
+                    TF_CHECKPOINT = ""
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
+        elif sys.argv[1] == "gpt2":
+            try:
+                from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+            except ImportError:
+                print("transformers can only be used from the commandline to convert TensorFlow models to PyTorch. "
+                      "In that case, it requires TensorFlow to be installed. Please see "
+                      "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+            else:
+                TF_CHECKPOINT = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "xlnet":
+            try:
+                from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+            except ImportError:
+                print("transformers can only be used from the commandline to convert TensorFlow models to PyTorch. "
+                      "In that case, it requires TensorFlow to be installed. Please see "
+                      "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) < 5 or len(sys.argv) > 6:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+            else:
+                TF_CHECKPOINT = sys.argv[2]
+                TF_CONFIG = sys.argv[3]
+                PYTORCH_DUMP_OUTPUT = sys.argv[4]
+                if len(sys.argv) == 6:
+                    FINETUNING_TASK = sys.argv[5]
+                else:
+                    FINETUNING_TASK = None
+
+                convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
+                                                    TF_CONFIG,
+                                                    PYTORCH_DUMP_OUTPUT,
+                                                    FINETUNING_TASK)
+        elif sys.argv[1] == "xlm":
+            from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+
+            if len(sys.argv) != 4:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+            else:
+                XLM_CHECKPOINT_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+
+                convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
+
+if __name__ == '__main__':
+    main()
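+
+# Example invocation (illustrative only; the checkpoint and config paths
+# are assumed names, and the package must be importable as `transformers`):
+#
+#     python -m transformers bert ./bert_model.ckpt ./bert_config.json ./pytorch_model.bin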
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_auto.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_auto.py
new file mode 100644
index 0000000..edd21a6
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_auto.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Config class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .configuration_bert import BertConfig
+from .configuration_openai import OpenAIGPTConfig
+from .configuration_gpt2 import GPT2Config
+from .configuration_transfo_xl import TransfoXLConfig
+from .configuration_xlnet import XLNetConfig
+from .configuration_xlm import XLMConfig
+from .configuration_roberta import RobertaConfig
+from .configuration_distilbert import DistilBertConfig
+from .configuration_ctrl import CTRLConfig
+
+logger = logging.getLogger(__name__)
+
+
+class AutoConfig(object):
+    r""":class:`~transformers.AutoConfig` is a generic configuration class
+        that will be instantiated as one of the configuration classes of the library
+        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order, which
+        mirrors the checks in `from_pretrained()`):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+            - contains `ctrl`: CTRLConfig (CTRL model)
+        This class cannot be instantiated using `__init__()` (it raises an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoConfig is designed to be instantiated "
+            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate one of the configuration classes of the library
+        from a pre-trained model configuration.
+
+        The configuration class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertConfig (DistilBERT model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+            - contains `ctrl`: CTRLConfig (CTRL model)
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final configuration object.
+                - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
+        Examples::
+
+            config = AutoConfig.from_pretrained('bert-base-uncased')  # Download configuration from S3 and cache.
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            assert config.output_attention == True
+            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attention == True
+            assert unused_kwargs == {'foo': False}
+
+        """
+        # NOTE: the order of these checks matters -- 'distilbert' and 'roberta'
+        # must be tested before 'bert' because both contain 'bert' as a substring.
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
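+
+# Routing sanity-check sketch (illustrative; the shortcut names come from the
+# archive maps elsewhere in this package):
+#
+#     config = AutoConfig.from_pretrained('roberta-base')
+#     assert isinstance(config, RobertaConfig)   # matched before the 'bert' branch
+#
+#     config = AutoConfig.from_pretrained('bert-base-chinese')
+#     assert isinstance(config, BertConfig)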
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py
new file mode 100644
index 0000000..d63be96
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_bert.py
@@ -0,0 +1,115 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BERT model configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
+    'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
+    'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
+    'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
+    'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
+    'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
+    'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
+    'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
+    'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+}
+
+
+class BertConfig(PretrainedConfig):
+    r"""
+        :class:`~transformers.BertConfig` is the configuration class to store the configuration of a
+        `BertModel`.
+
+
+        Arguments:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            hidden_size: Size of the encoder layers and the pooler layer.
+            num_hidden_layers: Number of hidden layers in the Transformer encoder.
+            num_attention_heads: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
+                layer in the Transformer encoder.
+            hidden_act: The non-linear activation function (function or string) in the
+                encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            hidden_dropout_prob: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attention_probs_dropout_prob: The dropout ratio for the attention
+                probabilities.
+            max_position_embeddings: The maximum sequence length that this model might
+                ever be used with. Typically set this to something large just in case
+                (e.g., 512 or 1024 or 2048).
+            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
+                `BertModel`.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices.
+            layer_norm_eps: The epsilon used by LayerNorm.
+    """
+    pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 **kwargs):
+        super(BertConfig, self).__init__(**kwargs)
+        # `unicode` only exists on Python 2; the second test keeps py2 compatibility
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.hidden_act = hidden_act
+            self.intermediate_size = intermediate_size
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
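+
+# Usage sketch (illustrative; the vocabulary size and JSON path are assumed
+# values for the example, not defaults of this module):
+#
+#     config = BertConfig(vocab_size_or_config_json_file=21128, num_labels=2)
+#     config = BertConfig("./bert_config.json")  # or load the fields from a JSON file
+#     print(config.hidden_size, config.num_hidden_layers)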
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py
new file mode 100644
index 0000000..fcbd848
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_ctrl.py
@@ -0,0 +1,143 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Salesforce CTRL configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/ctrl-config.json"}
+
+class CTRLConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `CTRLModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        dff: Size of the inner dimension of the FFN.
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        resid_pdrop: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The stddev of the truncated_normal_initializer for
+            initializing all weight matrices.
+    """
+    pretrained_config_archive_map = CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=246534,
+        n_positions=256,
+        n_ctx=256,
+        n_embd=1280,
+        dff=8192,
+        n_layer=48,
+        n_head=16,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+
+        num_labels=1,
+        summary_type='cls_index',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs CTRLConfig.
+
+        Args:
+            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            n_positions: Number of positional embeddings.
+            n_ctx: Size of the causal mask (usually same as n_positions).
+            dff: Size of the inner dimension of the FFN.
+            n_embd: Dimensionality of the embeddings and hidden states.
+            n_layer: Number of hidden layers in the Transformer encoder.
+            n_head: Number of attention heads for each attention layer in
+                the Transformer encoder.
+            layer_norm_epsilon: epsilon to use in the layer norm layers
+            resid_pdrop: The dropout probability for all fully connected
+                layers in the embeddings, encoder, and pooler.
+            attn_pdrop: The dropout ratio for the attention
+                probabilities.
+            embd_pdrop: The dropout ratio for the embeddings.
+            initializer_range: The stddev of the truncated_normal_initializer for
+                initializing all weight matrices.
+ """ + super(CTRLConfig, self).__init__(**kwargs) + + self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.dff = dff + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif not isinstance(vocab_size_or_config_json_file, int): + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py new file mode 100644 index 0000000..2a8a149 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_distilbert.py @@ -0,0 +1,89 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" DistilBERT model configuration """ +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json" +} + + +class DistilBertConfig(PretrainedConfig): + pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=30522, + max_position_embeddings=512, + sinusoidal_pos_embds=False, + n_layers=6, + n_heads=12, + dim=768, + hidden_dim=4*768, + dropout=0.1, + attention_dropout=0.1, + activation='gelu', + initializer_range=0.02, + tie_weights_=True, + qa_dropout=0.1, + seq_classif_dropout=0.2, + **kwargs): + super(DistilBertConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.tie_weights_ = tie_weights_ + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + @property + def hidden_size(self): + return self.dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py new file mode 100644 index 0000000..e7d853f --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_gpt2.py @@ -0,0 +1,144 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" OpenAI GPT-2 configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",} + +class GPT2Config(PretrainedConfig): + """Configuration class to store the configuration of a `GPT2Model`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + """ + pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs GPT2Config. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. + layer_norm_epsilon: epsilon to use in the layer norm layers + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + super(GPT2Config, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py new file mode 100644 index 0000000..886b7f5 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_openai.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" OpenAI GPT configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json" +} + +class OpenAIGPTConfig(PretrainedConfig): + """ + Configuration class to store the configuration of a `OpenAIGPTModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). + n_embd: Dimensionality of the embeddings and hidden states. + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ afn: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + resid_pdrop: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attn_pdrop: The dropout ratio for the attention + probabilities. + embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + predict_special_tokens: should we predict special tokens (when the model has a LM head) + """ + pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__( + self, + vocab_size_or_config_json_file=40478, + n_positions=512, + n_ctx=512, + n_embd=768, + n_layer=12, + n_head=12, + afn="gelu", + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + predict_special_tokens=True, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs + ): + """Constructs OpenAIGPTConfig. + """ + super(OpenAIGPTConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + else: + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py new file mode 100644 index 0000000..b92d6a9 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_roberta.py @@ -0,0 +1,35 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" RoBERTa configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .configuration_bert import BertConfig + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json", +} + + +class RobertaConfig(BertConfig): + pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py new file mode 100644 index 0000000..d55a6ad --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_transfo_xl.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Transformer XL configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json", +} + +class TransfoXLConfig(PretrainedConfig): + """Configuration class to store the configuration of a `TransfoXLModel`. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + cutoffs: cutoffs for the adaptive softmax + d_model: Dimensionality of the model's hidden states. + d_embed: Dimensionality of the embeddings + d_head: Dimensionality of the model's heads. + div_val: divident value for adapative input and softmax + pre_lnorm: apply LayerNorm to the input instead of the output + d_inner: Inner dimension in FF + n_layer: Number of hidden layers in the Transformer encoder. + n_head: Number of attention heads for each attention layer in + the Transformer encoder. 
+ tgt_len: number of tokens to predict + ext_len: length of the extended context + mem_len: length of the retained previous heads + same_length: use the same attn length for all tokens + proj_share_all_but_first: True to share all but first projs, False not to share. + attn_type: attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al. + clamp_len: use the same pos embeddings after clamp_len + sample_softmax: number of samples in sampled softmax + adaptive: use adaptive softmax + tie_weight: tie the word embedding and softmax weights + dropout: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + dropatt: The dropout ratio for the attention probabilities. + untie_r: untie relative position biases + embd_pdrop: The dropout ratio for the embeddings. + init: parameter initializer to use + init_range: parameters initialized by U(-init_range, init_range). + proj_init_std: parameters initialized by N(0, init_std) + init_std: parameters initialized by N(0, init_std) + """ + pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=267735, + cutoffs=[20000, 40000, 200000], + d_model=1024, + d_embed=1024, + n_head=16, + d_head=64, + d_inner=4096, + div_val=4, + pre_lnorm=False, + n_layer=18, + tgt_len=128, + ext_len=0, + mem_len=1600, + clamp_len=1000, + same_length=True, + proj_share_all_but_first=True, + attn_type=0, + sample_softmax=-1, + adaptive=True, + tie_weight=True, + dropout=0.1, + dropatt=0.0, + untie_r=True, + init="normal", + init_range=0.01, + proj_init_std=0.01, + init_std=0.02, + layer_norm_epsilon=1e-5, + **kwargs): + """Constructs TransfoXLConfig. + """ + super(TransfoXLConfig, self).__init__(**kwargs) + self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.cutoffs = [] + self.cutoffs.extend(cutoffs) + self.tie_weight = tie_weight + if proj_share_all_but_first: + self.tie_projs = [False] + [True] * len(self.cutoffs) + else: + self.tie_projs = [False] + [False] * len(self.cutoffs) + self.d_model = d_model + self.d_embed = d_embed + self.d_head = d_head + self.d_inner = d_inner + self.div_val = div_val + self.pre_lnorm = pre_lnorm + self.n_layer = n_layer + self.n_head = n_head + self.tgt_len = tgt_len + self.ext_len = ext_len + self.mem_len = mem_len + self.same_length = same_length + self.attn_type = attn_type + self.clamp_len = clamp_len + self.sample_softmax = sample_softmax + self.adaptive = adaptive + self.dropout = dropout + self.dropatt = dropatt + self.untie_r = untie_r + self.init = init + self.init_range = init_range + self.proj_init_std = proj_init_std + self.init_std = init_std + self.layer_norm_epsilon = layer_norm_epsilon + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif not isinstance(vocab_size_or_config_json_file, int): + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def max_position_embeddings(self): + return self.tgt_len + self.ext_len + self.mem_len + + @property + def vocab_size(self): + return self.n_token + + @vocab_size.setter + def vocab_size(self, value): + 
self.n_token = value
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_utils.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_utils.py
new file mode 100644
index 0000000..cfa6502
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_utils.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Configuration base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+from .file_utils import cached_path, CONFIG_NAME
+
+logger = logging.getLogger(__name__)
+
+class PretrainedConfig(object):
+    r""" Base class for all configuration classes.
+        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
+
+        Note:
+            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
+            It only affects the model's configuration.
+
+        Class attributes (overridden by derived classes):
+            - ``pretrained_config_archive_map``: a python ``dict`` with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
+
+        Parameters:
+            ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
+            ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens).
+            ``output_attentions``: boolean, default `False`. Whether the model should return attention weights.
+            ``output_hidden_states``: boolean, default `False`. Whether the model should return all hidden states.
+            ``torchscript``: boolean, default `False`. Whether the model is used with TorchScript.
+    """
+    pretrained_config_archive_map = {}
+
+    def __init__(self, **kwargs):
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.output_attentions = kwargs.pop('output_attentions', False)
+        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+        self.output_past = kwargs.pop('output_past', True)  # Not used by all models
+        self.torchscript = kwargs.pop('torchscript', False)  # Only used by PyTorch models
+        self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
+        self.pruned_heads = kwargs.pop('pruned_heads', {})
+
+    def save_pretrained(self, save_directory):
+        """ Save a configuration object to the directory `save_directory`, so that it
+            can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        self.to_json_file(output_config_file)
+        logger.info("Configuration saved in {}".format(output_config_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final configuration object.
+                - If True, then this function returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e. the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
+        Examples::
+
+            # We can't instantiate the base class `PretrainedConfig` directly, so the examples below use a
+            # derived class: BertConfig
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+            config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+            assert config.output_attentions == True
+            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attentions == True
+            assert unused_kwargs == {'foo': False}
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
+        # redirect to the cache, if necessary
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+                msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                    config_file)
+            else:
+                msg = "Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a configuration file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                          pretrained_model_name_or_path,
+                          ', '.join(cls.pretrained_config_archive_map.keys()),
+                          config_file, CONFIG_NAME)
+            raise EnvironmentError(msg)
+
+        if resolved_config_file == config_file:
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = cls.from_json_file(resolved_config_file)
+
+        if hasattr(config, 'pruned_heads'):
+            config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config %s", str(config))
+        if return_unused_kwargs:
+            return config, kwargs
+        else:
+            return config
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `Config` from a Python dictionary of parameters."""
+        config = cls(vocab_size_or_config_json_file=-1)
+        for key, value in json_object.items():
+            setattr(config, key, value)
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `BertConfig` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
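A minimal round-trip sketch of the serialization helpers above (assuming a derived class such as ``BertConfig`` and a writable ``./my_model_directory/``):

    from transformers import BertConfig  # assuming the package layout of this patch

    config = BertConfig.from_pretrained('bert-base-uncased')        # resolve, download, cache
    config.save_pretrained('./my_model_directory/')                 # writes config.json
    reloaded = BertConfig.from_pretrained('./my_model_directory/')  # reads it back
    assert config.to_json_string() == reloaded.to_json_string()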
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlm.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlm.py
new file mode 100644
index 0000000..fa3a5f4
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlm.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM configuration """
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-config.json",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
+    'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-config.json",
+    'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-config.json",
+}
+
+
+class XLMConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `XLMModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`,
+            or the path to a JSON configuration file.
+        emb_dim: Dimensionality of the embeddings, the encoder layers and the pooler layer.
+        n_layers: Number of hidden layers in the Transformer encoder.
+        n_heads: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        dropout: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attention_dropout: The dropout probability for the attention mechanism.
+        gelu_activation: bool, use "gelu" (True) rather than "relu" (False) as the
+            activation in the encoder.
+        sinusoidal_embeddings: bool, use sinusoidal positional embeddings instead
+            of learned ones.
+        causal: bool, use causal (unidirectional) attention, e.g. for causal
+            language modeling.
+        asm: bool, use an adaptive log softmax projection layer instead of a
+            linear layer for the prediction head.
+        n_langs: Number of languages handled by the model; 1 for monolingual models.
+        use_lang_emb: bool, whether to use language embeddings.
+        max_position_embeddings: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        embed_init_std: The stddev of the truncated_normal_initializer for
+            initializing the embedding matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+        init_std: The stddev of the truncated_normal_initializer for initializing
+            all weight matrices except the embeddings.
+        bos_index, eos_index, pad_index, unk_index, mask_index: Indices of the
+            special tokens in the vocabulary.
+        is_encoder: bool, whether the architecture is used as an encoder
+            (bidirectional attention) or as a decoder.
+    """
+    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30145,
+                 emb_dim=2048,
+                 n_layers=12,
+                 n_heads=16,
+                 dropout=0.1,
+                 attention_dropout=0.1,
+                 gelu_activation=True,
+                 sinusoidal_embeddings=False,
+                 causal=False,
+                 asm=False,
+                 n_langs=1,
+                 use_lang_emb=True,
+                 max_position_embeddings=512,
+                 embed_init_std=2048 ** -0.5,
+                 layer_norm_eps=1e-12,
+                 init_std=0.02,
+                 bos_index=0,
+                 eos_index=1,
+                 pad_index=2,
+                 unk_index=3,
+                 mask_index=5,
+                 is_encoder=True,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='first',
+                 summary_use_proj=True,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
+                 **kwargs):
+        """Constructs XLMConfig.
+ """ + super(XLMConfig, self).__init__(**kwargs) + + if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 + and isinstance(vocab_size_or_config_json_file, unicode)): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.n_words = vocab_size_or_config_json_file + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.finetuning_task = finetuning_task + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top + else: + raise ValueError("First argument must be either a vocabulary size (int)" + " or the path to a pretrained model config file (str)") + + @property + def vocab_size(self): + return self.n_words + + @vocab_size.setter + def vocab_size(self, value): + self.n_words = value + + @property + def hidden_size(self): + return self.emb_dim + + @property + def num_attention_heads(self): + return self.n_heads + + @property + def num_hidden_layers(self): + return self.n_layers diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py new file mode 100644 index 0000000..0dbf518 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/configuration_xlnet.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
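Like ``XLMConfig`` above, the XLNet configuration below exposes BERT-style aliases (``vocab_size``, ``hidden_size``, ``num_attention_heads``, ``num_hidden_layers``) over its native field names, so model-agnostic task code can read either. A small sketch of the aliasing on ``XLMConfig`` (values are the defaults shown above):

    config = XLMConfig(vocab_size_or_config_json_file=30145)
    assert config.vocab_size == config.n_words == 30145   # alias over the same field
    config.vocab_size = 64139                             # the setter writes n_words
    assert config.n_words == 64139
    print(config.hidden_size, config.num_hidden_layers)   # 2048 12 (read-only aliases)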
+""" XLNet configuration """
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-config.json",
+}
+
+
+class XLNetConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a ``XLNetModel``.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: whether to untie relative position biases.
+        attn_type: 'bi' for XLNet, 'uni' for Transformer-XL.
+
+        dropout: The dropout probability for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        initializer_range: The stddev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the current batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+        finetuning_task: name of the GLUE task on which the model was fine-tuned, if any.
+    """
+    pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=32000,
+                 d_model=1024,
+                 n_layer=24,
+                 n_head=16,
+                 d_inner=4096,
+                 max_position_embeddings=512,
+                 ff_activation="gelu",
+                 untie_r=True,
+                 attn_type="bi",
+
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+
+                 dropout=0.1,
+                 mem_len=None,
+                 reuse_len=None,
+                 bi_data=False,
+                 clamp_len=-1,
+                 same_length=False,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='last',
+                 summary_use_proj=True,
+                 summary_activation='tanh',
+                 summary_last_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
+                 **kwargs):
+        """Constructs XLNetConfig.
+        """
+        super(XLNetConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                setattr(self, key, value)
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.n_token = vocab_size_or_config_json_file
+            self.d_model = d_model
+            self.n_layer = n_layer
+            self.n_head = n_head
+            assert d_model % n_head == 0
+            self.d_head = d_model // n_head
+            self.ff_activation = ff_activation
+            self.d_inner = d_inner
+            self.untie_r = untie_r
+            self.attn_type = attn_type
+
+            self.initializer_range = initializer_range
+            self.layer_norm_eps = layer_norm_eps
+
+            self.dropout = dropout
+            self.mem_len = mem_len
+            self.reuse_len = reuse_len
+            self.bi_data = bi_data
+            self.clamp_len = clamp_len
+            self.same_length = same_length
+
+            self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_last_dropout = summary_last_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             " or the path to a pretrained model config file (str)")
+
+    @property
+    def max_position_embeddings(self):
+        # XLNet uses relative position encodings, so there is no fixed length limit
+        return -1
+
+    @property
+    def vocab_size(self):
+        return self.n_token
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_token = value
+
+    @property
+    def hidden_size(self):
+        return self.d_model
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/file_utils.py b/baselines/models_pytorch/classifier_pytorch/transformers/file_utils.py
new file mode 100644
index 0000000..11c4ba6
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/file_utils.py
@@ -0,0 +1,324 @@
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
+Copyright by the AllenNLP authors.
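One detail of ``XLNetConfig`` above worth noting: the per-head size is derived rather than passed in, and the constructor asserts divisibility. A quick sketch:

    config = XLNetConfig(vocab_size_or_config_json_file=32000, d_model=1024, n_head=16)
    assert config.d_head == 1024 // 16   # derived as d_model // n_head
    # XLNetConfig(d_model=1000, n_head=16) would fail the `d_model % n_head == 0` assert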
+""" +from __future__ import (absolute_import, division, print_function, unicode_literals) + +import sys +import json +import logging +import os +import six +import shutil +import tempfile +import fnmatch +from functools import wraps +from hashlib import sha256 +from io import open + +import boto3 +from botocore.config import Config +from botocore.exceptions import ClientError +import requests +from tqdm import tqdm + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +try: + import tensorflow as tf + assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name + +try: + import torch + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) +except ImportError: + _torch_available = False # pylint: disable=invalid-name + + +try: + from torch.hub import _get_torch_home + torch_cache_home = _get_torch_home() +except ImportError: + torch_cache_home = os.path.expanduser( + os.getenv('TORCH_HOME', os.path.join( + os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch'))) +default_cache_path = os.path.join(torch_cache_home, 'transformers') + +try: + from urllib.parse import urlparse +except ImportError: + from urlparse import urlparse + +try: + from pathlib import Path + PYTORCH_PRETRAINED_BERT_CACHE = Path( + os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))) +except (AttributeError, ImportError): + PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE', + os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + default_cache_path)) + +PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility +TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE # Kept for backward compatibility + +WEIGHTS_NAME = "pytorch_model.bin" +TF2_WEIGHTS_NAME = 'tf_model.h5' +TF_WEIGHTS_NAME = 'model.ckpt' +CONFIG_NAME = "config.json" + +def is_torch_available(): + return _torch_available + +def is_tf_available(): + return _tf_available + +if not six.PY2: + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = ''.join(docstr) + fn.__doc__ + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + fn.__doc__ = fn.__doc__ + ''.join(docstr) + return fn + return docstring_decorator +else: + # Not possible to update class docstrings on python2 + def add_start_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + + def add_end_docstrings(*docstr): + def docstring_decorator(fn): + return fn + return docstring_decorator + +def url_to_filename(url, etag=None): + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name + so that TF 2.0 can identify it as a HDF5 file + (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' 
+ etag_hash.hexdigest() + + if url.endswith('.h5'): + filename += '.h5' + + return filename + + +def filename_to_url(filename, cache_dir=None): + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise EnvironmentError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise EnvironmentError("file {} not found".format(meta_path)) + + with open(meta_path, encoding="utf-8") as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None): + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. + Args: + cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). + force_download: if True, re-dowload the file even if it's already cached in the cache dir. + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. + raise EnvironmentError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url): + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. 
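The cache layout implied by ``url_to_filename``/``filename_to_url``: payloads are stored under a hash-derived name, with a sidecar ``.json`` recording the url and etag. A worked sketch (the ETag value is hypothetical):

    from hashlib import sha256

    url = "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json"
    etag = '"5d41402abc4b2a76b9719d911017c592"'   # hypothetical ETag header value
    name = sha256(url.encode('utf-8')).hexdigest() + '.' + sha256(etag.encode('utf-8')).hexdigest()
    # payload  -> <cache_dir>/<name>
    # metadata -> <cache_dir>/<name>.json, i.e. {"url": ..., "etag": ...}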
+ """ + + @wraps(func) + def wrapper(url, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise EnvironmentError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url, proxies=None): + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url, temp_file, proxies=None): + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3", config=Config(proxies=proxies)) + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url, temp_file, proxies=None): + req = requests.get(url, stream=True, proxies=proxies) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url, cache_dir=None, force_download=False, proxies=None): + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = TRANSFORMERS_CACHE + if sys.version_info[0] == 3 and isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + if sys.version_info[0] == 2 and not isinstance(cache_dir, str): + cache_dir = str(cache_dir) + + if not os.path.exists(cache_dir): + os.makedirs(cache_dir) + + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url, proxies=proxies) + else: + try: + response = requests.head(url, allow_redirects=True, proxies=proxies) + if response.status_code != 200: + etag = None + else: + etag = response.headers.get("ETag") + except EnvironmentError: + etag = None + + if sys.version_info[0] == 2 and etag is not None: + etag = etag.decode('utf-8') + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + # If we don't have a connection (etag is None) and can't identify the file + # try to get the last downloaded one + if not os.path.exists(cache_path) and etag is None: + matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') + matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + if matching_files: + cache_path = os.path.join(cache_dir, matching_files[-1]) + + if not os.path.exists(cache_path) or force_download: + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
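A usage sketch of ``cached_path`` as defined above (the paths shown are hypothetical):

    local_path = cached_path(
        "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json",
        cache_dir="/tmp/my_cache",    # defaults to TRANSFORMERS_CACHE when omitted
        force_download=False)         # reuse the cached copy for a matching url/etag
    # local files pass through unchanged:
    # cached_path("/tmp/config.json") == "/tmp/config.json"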
+ with tempfile.NamedTemporaryFile() as temp_file: + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) + + # GET file object + if url.startswith("s3://"): + s3_get(url, temp_file, proxies=proxies) + else: + http_get(url, temp_file, proxies=proxies) + + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + # shutil.copyfileobj() starts at the current position, so go to the start + temp_file.seek(0) + + logger.info("copying %s to cache at %s", temp_file.name, cache_path) + with open(cache_path, 'wb') as cache_file: + shutil.copyfileobj(temp_file, cache_file) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) + + logger.info("removing temp file %s", temp_file.name) + + return cache_path diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_albert.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_albert.py new file mode 100644 index 0000000..23fafc2 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_albert.py @@ -0,0 +1,1065 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_bert import BertConfig +from .file_utils import add_start_docstrings +from .modeling_bert import (ACT2FN, BertSelfAttention, BertIntermediate, + BertPooler,BertPredictionHeadTransform) + +logger = logging.getLogger(__name__) + +ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'albert-base': "", + 'albert-large': "", + 'albert-xlarge': "", + 'albert-xxlarge': "", +} + + +def load_tf_weights_in_albert(model, config, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model. + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name[-13:] == '_embeddings_2': + pointer = getattr(pointer, 'weight') + array = np.transpose(array) + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +BertLayerNorm = torch.nn.LayerNorm + + +class AlbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
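The attribute walk in ``load_tf_weights_in_albert`` splits each TF variable name on ``/`` and follows same-named attributes on the PyTorch module, with a few renames. An illustrative mapping (the variable names are examples, not taken from a real checkpoint):

    # 'bert/encoder/layer_shared/attention/output/dense/kernel'
    #     -> model.bert.encoder.layer_shared.attention.output.dense.weight  (transposed)
    # 'bert/embeddings/word_embeddings'
    #     -> model.bert.embeddings.word_embeddings.weight
    # 'cls/predictions/output_bias'
    #     -> model.cls.predictions.bias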
+ """ + + def __init__(self, config): + super(AlbertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0) + # project layer + self.word_embeddings_2 = nn.Linear(config.embedding_size, config.hidden_size, bias=False) + + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + # project transform + words_embeddings = self.word_embeddings_2(words_embeddings) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.ln_type = config.ln_type + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + if self.ln_type == 'preln': + # preln + hidden_states = hidden_states + input_tensor + else: + # postln + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + self.ln_type = config.ln_type + + def prune_heads(self, heads): + if len(heads) == 0: + return + mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size) + heads = set(heads) - self.pruned_heads # Convert to set and emove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + 
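The two-step embedding above (vocab -> ``embedding_size`` -> ``hidden_size``) is ALBERT's factorized embedding parameterization. A quick parameter count, assuming the 21128-token vocab shipped in this repo with a hypothetical ``embedding_size=128`` and ``hidden_size=768``:

    V, E, H = 21128, 128, 768
    factorized = V * E + E * H     # word_embeddings + word_embeddings_2
    full_table = V * H             # a standard BERT embedding table
    print(factorized, full_table)  # 2802688 vs 16226304, roughly 5.8x smaller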
def forward(self, input_tensor, attention_mask=None, head_mask=None): + if self.ln_type == 'preln': + # pre_ln + hidden_state = self.output.LayerNorm(input_tensor) + self_outputs = self.self(hidden_state, attention_mask, head_mask) + else: + # postln + self_outputs = self.self(input_tensor, attention_mask, head_mask) + attention_output = self.output(self_outputs[0], input_tensor) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.ln_type = config.ln_type + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + if self.ln_type == 'preln': + # preln + hidden_states = hidden_states + input_tensor + else: + # postln + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + self.ln_type = config.ln_type + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + if self.ln_type == 'preln': + # preln + attention_output_pre = self.output.LayerNorm(attention_output) + else: + # postln + attention_output_pre = attention_output + intermediate_output = self.intermediate(attention_output_pre) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class AlbertEncoder(nn.Module): + def __init__(self, config): + super(AlbertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.num_hidden_layers = config.num_hidden_layers + self.layer_shared = BertLayer(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + all_hidden_states = () + all_attentions = () + for i in range(self.num_hidden_layers): + layer_module = self.layer_shared + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + +class AlbertLMPredictionHead(nn.Module): + def __init__(self, config): + super(AlbertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
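+        # Note the factorized output head built below: hidden_size -> embedding_size
+        # (project_layer, tied to word_embeddings_2 in tie_weights() further down) ->
+        # vocab_size (decoder, tied to the input word_embeddings), mirroring the
+        # factorized input embedding.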
+ self.project_layer = nn.Linear(config.hidden_size, config.embedding_size, bias=False) + self.decoder = nn.Linear(config.embedding_size, + config.vocab_size, + bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.project_layer(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class AlbertOnlyMLMHead(nn.Module): + def __init__(self, config): + super(AlbertOnlyMLMHead, self).__init__() + self.predictions = AlbertLMPredictionHead(config) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class AlbertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(AlbertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class AlbertPreTrainingHeads(nn.Module): + def __init__(self, config): + super(AlbertPreTrainingHeads, self).__init__() + self.predictions = AlbertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class AlbertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = BertConfig + pretrained_model_archive_map = ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_albert + base_model_prefix = "bert" + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +BERT_START_DOCSTRING = r""" The BERT model was proposed in + `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. 
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + + +@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class AlbertModel(AlbertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertModel.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + + def __init__(self, config): + super(AlbertModel, self).__init__(config) + + self.embeddings = AlbertEmbeddings(config) + self.encoder = AlbertEncoder(config) + self.pooler = BertPooler(config) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.embeddings.word_embeddings + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + self.embeddings.word_embeddings = new_embeddings + return self.embeddings.word_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
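+        # Worked example of the two lines below: a padding mask [1, 1, 0] becomes
+        # (1.0 - [1., 1., 0.]) * -10000.0 == [0., 0., -10000.], so padded positions
+        # get a large negative bias added to their raw attention scores.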
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( + -1) # We can specify head_mask for each layer + head_mask = head_mask.to( + dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + encoder_outputs = self.encoder(embedding_output, + extended_attention_mask, + head_mask=head_mask) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) + + outputs = (sequence_output, pooled_output,) + encoder_outputs[ + 1:] # add hidden_states and attentions if they are here + return outputs # sequence_output, pooled_output, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training: + a `masked language modeling` head and a `next sentence prediction (classification)` head. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class AlbertForPreTraining(AlbertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, 2)`` + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForPreTraining.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + prediction_scores, seq_relationship_scores = outputs[:2] + + """ + + def __init__(self, config): + super(AlbertForPreTraining, self).__init__(config) + + self.bert = AlbertModel(config) + self.cls = AlbertPreTrainingHeads(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + self._tie_or_clone_data(self.cls.predictions.project_layer, + self.bert.embeddings.word_embeddings_2) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + masked_lm_labels=None, next_sentence_label=None): + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + outputs = (prediction_scores, seq_relationship_score,) + outputs[ + 2:] # add hidden states and attention if they are here + + if masked_lm_labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + outputs = (total_loss,) + outputs + + return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class AlbertForMaskedLM(AlbertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. 
+ **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + model = BertForMaskedLM.from_pretrained('bert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + + def __init__(self, config): + super(AlbertForMaskedLM, self).__init__(config) + + self.bert = AlbertModel(config) + self.cls = AlbertOnlyMLMHead(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. + """ + self._tie_or_clone_weights(self.cls.predictions.decoder, + self.bert.embeddings.word_embeddings) + self._tie_or_clone_data(self.cls.predictions.project_layer, + self.bert.embeddings.word_embeddings_2) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + masked_lm_labels=None): + outputs = self.bert(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + outputs = (masked_lm_loss,) + outputs + + return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """, + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class AlbertForNextSentencePrediction(AlbertPreTrainedModel): + r""" + **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) + Indices should be in ``[0, 1]``. + ``0`` indicates sequence B is a continuation of sequence A, + ``1`` indicates sequence B is a random sequence. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Next sequence prediction (classification) loss. 
+
+
+@add_start_docstrings("""Albert Model with a `next sentence prediction (classification)` head on top. """,
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class AlbertForNextSentencePrediction(AlbertPreTrainedModel):
+    r"""
+        **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring).
+            Indices should be in ``[0, 1]``.
+            ``0`` indicates sequence B is a continuation of sequence A,
+            ``1`` indicates sequence B is a random sequence.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Next sequence prediction (classification) loss.
+        **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, 2)``
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = AlbertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        seq_relationship_scores = outputs[0]
+
+    """
+
+    def __init__(self, config):
+        super(AlbertForNextSentencePrediction, self).__init__(config)
+
+        self.bert = AlbertModel(config)
+        self.cls = AlbertOnlyNSPHead(config)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                next_sentence_label=None):
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        pooled_output = outputs[1]
+
+        seq_relationship_score = self.cls(pooled_output)
+
+        outputs = (seq_relationship_score,) + outputs[2:]  # add hidden states and attention if they are here
+        if next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            outputs = (next_sentence_loss,) + outputs
+
+        return outputs  # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class AlbertForSequenceClassification(AlbertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            if ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if ``config.num_labels == 1``) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if ``config.num_labels == 1``) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = AlbertForSequenceClassification.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+
+    def __init__(self, config):
+        super(AlbertForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
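The ``num_labels`` switch in the forward pass above selects between regression and classification. A standalone illustration of the two loss paths:

    import torch
    from torch.nn import CrossEntropyLoss, MSELoss

    # num_labels == 1: a single output unit, treated as regression (MSE).
    logits_reg = torch.randn(8, 1)
    targets = torch.randn(8)
    regression_loss = MSELoss()(logits_reg.view(-1), targets.view(-1))

    # num_labels > 1: one logit per class, treated as classification (CE).
    logits_cls = torch.randn(8, 3)
    labels = torch.randint(0, 3, (8,))
    classification_loss = CrossEntropyLoss()(logits_cls.view(-1, 3), labels.view(-1))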
+
+
+@add_start_docstrings("""Albert Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class AlbertForMultipleChoice(AlbertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+            of the input tensors (see `input_ids` above).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = AlbertForMultipleChoice.from_pretrained('bert-base-uncased')
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+
+    def __init__(self, config):
+        super(AlbertForMultipleChoice, self).__init__(config)
+
+        self.bert = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        num_choices = input_ids.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
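The multiple-choice head relies on a reshape trick: choices are flattened into the batch dimension, scored with a single-unit classifier, and folded back into one logit per choice. A standalone shape walk-through:

    import torch

    batch_size, num_choices, seq_len = 2, 4, 16
    input_ids = torch.randint(0, 1000, (batch_size, num_choices, seq_len))
    flat_input_ids = input_ids.view(-1, input_ids.size(-1))  # (8, 16)
    logits = torch.randn(flat_input_ids.size(0), 1)          # one score per choice
    reshaped_logits = logits.view(-1, num_choices)           # (2, 4), softmax-ready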
+
+
+@add_start_docstrings("""Albert Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class AlbertForTokenClassification(AlbertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = AlbertForTokenClassification.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+
+    def __init__(self, config):
+        super(AlbertForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = AlbertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), scores, (hidden_states), (attentions)
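The ``active_loss`` masking above restricts the token-classification loss to real (non-padding) positions. A standalone illustration:

    import torch
    from torch.nn import CrossEntropyLoss

    # Only positions where attention_mask == 1 contribute to the loss.
    num_labels = 5
    logits = torch.randn(2, 8, num_labels)
    labels = torch.randint(0, num_labels, (2, 8))
    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 0, 0],
                                   [1, 1, 1, 1, 1, 1, 1, 1]])
    active_loss = attention_mask.view(-1) == 1
    loss = CrossEntropyLoss()(logits.view(-1, num_labels)[active_loss],
                              labels.view(-1)[active_loss])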
+
+
+@add_start_docstrings("""Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class AlbertForQuestionAnswering(AlbertPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Positions outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = AlbertForQuestionAnswering.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+
+    """
+
+    def __init__(self, config):
+        super(AlbertForQuestionAnswering, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = AlbertModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+
+        outputs = self.bert(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split adds a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs; we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
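The span-extraction loss above can be illustrated in isolation: two logits per token are split into start/end scores, out-of-range gold positions are clamped onto an ignored index, and the two cross-entropies are averaged.

    import torch
    from torch.nn import CrossEntropyLoss

    seq_len = 16
    logits = torch.randn(2, seq_len, 2)
    start_logits, end_logits = logits.split(1, dim=-1)
    start_logits, end_logits = start_logits.squeeze(-1), end_logits.squeeze(-1)
    start_positions = torch.tensor([3, 40]).clamp(0, seq_len)  # 40 -> clamped to 16, ignored
    end_positions = torch.tensor([5, 41]).clamp(0, seq_len)
    loss_fct = CrossEntropyLoss(ignore_index=seq_len)
    total_loss = (loss_fct(start_logits, start_positions)
                  + loss_fct(end_logits, end_positions)) / 2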
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_auto.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_auto.py
new file mode 100644
index 0000000..d98110d
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_auto.py
@@ -0,0 +1,503 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Model class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+
+from .modeling_utils import PreTrainedModel, SequenceSummary
+
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+
+class AutoModel(object):
+    r"""
+        :class:`~transformers.AutoModel` is a generic model class
+        that will be instantiated as one of the base model classes of the library
+        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModel is designed to be instantiated "
+            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the base model classes of the library
+        from a pre-trained model configuration.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+            - contains `ctrl`: CTRLModel (Salesforce CTRL model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+        To train the model, you should first set it back in training mode with `model.train()`.
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModel.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
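Note that the order of the `if`/`elif` chain matters: 'distilbert' and 'roberta' must be tested before the generic 'bert', because both of those names also contain the substring 'bert'. A standalone illustration of the same pattern:

    # Order-sensitive substring dispatch, as in AutoModel.from_pretrained above.
    def resolve(name):
        if 'distilbert' in name:
            return 'DistilBertModel'
        elif 'roberta' in name:
            return 'RobertaModel'
        elif 'bert' in name:
            return 'BertModel'
        raise ValueError(name)

    assert resolve('distilbert-base-uncased') == 'DistilBertModel'  # not BertModel
    assert resolve('roberta-base') == 'RobertaModel'                # 'roberta' also contains 'bert'
    assert resolve('bert-base-chinese') == 'BertModel'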
+
+
+class AutoModelWithLMHead(object):
+    r"""
+        :class:`~transformers.AutoModelWithLMHead` is a generic model class
+        that will be instantiated as one of the language modeling model classes of the library
+        when created with the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
+            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the language modeling model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
+            - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
+            - contains `bert`: BertForMaskedLM (Bert model)
+            - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2LMHeadModel (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLLMHeadModel (Transformer-XL model)
+            - contains `xlnet`: XLNetLMHeadModel (XLNet model)
+            - contains `xlm`: XLMWithLMHeadModel (XLM model)
+            - contains `ctrl`: CTRLLMHeadModel (Salesforce CTRL model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+        To train the model, you should first set it back in training mode with `model.train()`.
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelWithLMHead.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelWithLMHead.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2LMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMWithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForSequenceClassification(object):
+    r"""
+        :class:`~transformers.AutoModelForSequenceClassification` is a generic model class
+        that will be instantiated as one of the sequence classification model classes of the library
+        when created with the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
+            - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
+            - contains `bert`: BertForSequenceClassification (Bert model)
+            - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
+            - contains `xlm`: XLMForSequenceClassification (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+ """ + def __init__(self): + raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the sequence classification model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) + - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) + - contains `bert`: BertForSequenceClassification (Bert model) + - contains `xlnet`: XLNetForSequenceClassification (XLNet model) + - contains `xlm`: XLMForSequenceClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. 
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForSequenceClassification.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForSequenceClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path))
+
+
+class AutoModelForQuestionAnswering(object):
+    r"""
+        :class:`~transformers.AutoModelForQuestionAnswering` is a generic model class
+        that will be instantiated as one of the question answering model classes of the library
+        when created with the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the question answering model classes of the library
+        from a pre-trained model configuration.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertForQuestionAnswering (DistilBERT model)
+            - contains `bert`: BertForQuestionAnswering (Bert model)
+            - contains `xlnet`: XLNetForQuestionAnswering (XLNet model)
+            - contains `xlm`: XLMForQuestionAnswering (XLM model)
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated).
+        To train the model, you should first set it back in training mode with `model.train()`.
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and initialize the model (e.g. ``output_attentions=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModelForQuestionAnswering.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModelForQuestionAnswering.from_pretrained('bert-base-uncased', output_attentions=True)  # Update configuration during loading
+            assert model.config.output_attentions == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModelForQuestionAnswering.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMForQuestionAnswering.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path))
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_bert.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_bert.py
new file mode 100644
index 0000000..fecf1e4
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_bert.py
@@ -0,0 +1,1149 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model.
""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_bert import BertConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-pytorch_model.bin", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-pytorch_model.bin", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-pytorch_model.bin", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-pytorch_model.bin", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-pytorch_model.bin", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-pytorch_model.bin", + 'bert-base-german-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-pytorch_model.bin", + 'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-pytorch_model.bin", + 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-pytorch_model.bin", + 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", + 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", + 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", +} + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model. + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + """ Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + +def gelu_new(x): + """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). + Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new} + + +BertLayerNorm = torch.nn.LayerNorm + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.output_attentions = config.output_attentions + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
+ return outputs
+
+
+class BertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super(BertSelfOutput, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class BertAttention(nn.Module):
+ def __init__(self, config):
+ super(BertAttention, self).__init__()
+ self.self = BertSelfAttention(config)
+ self.output = BertSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
+ heads = set(heads) - self.pruned_heads # Convert to set and remove already pruned heads
+ for head in heads:
+ # Compute how many pruned heads are before the head and move the index accordingly
+ head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+ mask[head] = 0
+ mask = mask.view(-1).contiguous().eq(1)
+ index = torch.arange(len(mask))[mask].long()
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(self, input_tensor, attention_mask=None, head_mask=None):
+ self_outputs = self.self(input_tensor, attention_mask, head_mask)
+ attention_output = self.output(self_outputs[0], input_tensor)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class BertIntermediate(nn.Module):
+ def __init__(self, config):
+ super(BertIntermediate, self).__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class BertOutput(nn.Module):
+ def __init__(self, config):
+ super(BertOutput, self).__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
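+ # Note (added comment): together with BertIntermediate above, this block forms
+ # BERT's position-wise feed-forward sub-layer,
+ #   FFN(x) = LayerNorm(x + Dropout(W2 * act_fn(W1 * x))),
+ # with the residual added before the ("post-LN") layer normalization.
+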
def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + all_hidden_states = () + all_attentions = () + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. 
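+ # (Added note: the actual tying happens in the tie_weights() methods of
+ # BertForPreTraining / BertForMaskedLM below via _tie_or_clone_weights, so
+ # decoder.weight shares storage with word_embeddings.weight and only the
+ # bias below adds new parameters.)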
+ self.decoder = nn.Linear(config.hidden_size,
+ config.vocab_size,
+ bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states) + self.bias
+ return hidden_states
+
+
+class BertOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super(BertOnlyMLMHead, self).__init__()
+ self.predictions = BertLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class BertOnlyNSPHead(nn.Module):
+ def __init__(self, config):
+ super(BertOnlyNSPHead, self).__init__()
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, pooled_output):
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return seq_relationship_score
+
+
+class BertPreTrainingHeads(nn.Module):
+ def __init__(self, config):
+ super(BertPreTrainingHeads, self).__init__()
+ self.predictions = BertLMPredictionHead(config)
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+
+class BertPreTrainedModel(PreTrainedModel):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+ config_class = BertConfig
+ pretrained_model_archive_map = BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+ load_tf_weights = load_tf_weights_in_bert
+ base_model_prefix = "bert"
+
+ def _init_weights(self, module):
+ """ Initialize the weights """
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, BertLayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+
+BERT_START_DOCSTRING = r""" The BERT model was proposed in
+ `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
+ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+ pre-trained using a combination of masked language modeling and next sentence prediction objectives
+ on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+ This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+ refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+ .. _`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
+ https://arxiv.org/abs/1810.04805
+
+ .. _`torch.nn.Module`:
+ https://pytorch.org/docs/stable/nn.html#module
+
+ Parameters:
+ config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
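+
+ Example (added sketch; constructing from a config alone leaves the weights randomly initialized)::
+
+ config = BertConfig()
+ model = BertModel(config) # architecture only, random weights
+ model = BertModel.from_pretrained('bert-base-uncased') # architecture + pretrained weights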
+""" + +BERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`transformers.BertTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING) +class BertModel(BertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertModel.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids)
+ last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
+
+ """
+ def __init__(self, config):
+ super(BertModel, self).__init__(config)
+
+ self.embeddings = BertEmbeddings(config)
+ self.encoder = BertEncoder(config)
+ self.pooler = BertPooler(config)
+
+ self.init_weights()
+
+ def _resize_token_embeddings(self, new_num_tokens):
+ old_embeddings = self.embeddings.word_embeddings
+ new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+ self.embeddings.word_embeddings = new_embeddings
+ return self.embeddings.word_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """ Prunes heads of the model.
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+ See base class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+ if attention_mask is None:
+ attention_mask = torch.ones_like(input_ids)
+ if token_type_ids is None:
+ token_type_ids = torch.zeros_like(input_ids)
+
+ # We create a 3D attention mask from a 2D tensor mask.
+ # Sizes are [batch_size, 1, 1, to_seq_length]
+ # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+ # this attention mask is simpler than the triangular masking of causal attention
+ # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+ # masked positions, this operation will create a tensor which is 0.0 for
+ # positions we want to attend and -10000.0 for masked positions.
+ # Since we are adding it to the raw scores before the softmax, this is
+ # effectively the same as removing these entirely.
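+ # Worked example (added comment, illustrative numbers only): for
+ # attention_mask = [1, 1, 0] the extended mask is [0.0, 0.0, -10000.0];
+ # adding it to raw scores such as [2.3, 0.7, 1.1] yields softmax weights of
+ # roughly [0.83, 0.17, 0.00], so the padded position gets (almost) no attention.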
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicates we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ if head_mask is not None:
+ if head_mask.dim() == 1:
+ head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+ head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+ elif head_mask.dim() == 2:
+ head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
+ head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
+ else:
+ head_mask = [None] * self.config.num_hidden_layers
+
+ embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+ encoder_outputs = self.encoder(embedding_output,
+ extended_attention_mask,
+ head_mask=head_mask)
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output)
+
+ outputs = (sequence_output, pooled_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here
+ return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
+ a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForPreTraining(BertPreTrainedModel):
+ r"""
+ **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Labels for computing the masked language modeling loss.
+ Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring).
+ Tokens with indices set to ``-1`` are ignored (masked); the loss is only computed for the tokens with labels
+ in ``[0, ..., config.vocab_size - 1]``
+ **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring).
+ Indices should be in ``[0, 1]``.
+ ``0`` indicates sequence B is a continuation of sequence A,
+ ``1`` indicates sequence B is a random sequence.
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when both ``masked_lm_labels`` and ``next_sentence_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
+ **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, 2)``
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForPreTraining.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids)
+ prediction_scores, seq_relationship_scores = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(BertForPreTraining, self).__init__(config)
+
+ self.bert = BertModel(config)
+ self.cls = BertPreTrainingHeads(config)
+
+ self.init_weights()
+ self.tie_weights()
+
+ def tie_weights(self):
+ """ Make sure we are sharing the input and output embeddings.
+ Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+ """
+ self._tie_or_clone_weights(self.cls.predictions.decoder,
+ self.bert.embeddings.word_embeddings)
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ masked_lm_labels=None, next_sentence_label=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ sequence_output, pooled_output = outputs[:2]
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+ outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
+
+ if masked_lm_labels is not None and next_sentence_label is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+ total_loss = masked_lm_loss + next_sentence_loss
+ outputs = (total_loss,) + outputs
+
+ return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForMaskedLM(BertPreTrainedModel):
+ r"""
+ **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Labels for computing the masked language modeling loss.
+ Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring).
+ Tokens with indices set to ``-1`` are ignored (masked); the loss is only computed for the tokens with labels
+ in ``[0, ..., config.vocab_size - 1]``
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Masked language modeling loss.
+ **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids, masked_lm_labels=input_ids)
+ loss, prediction_scores = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(BertForMaskedLM, self).__init__(config)
+
+ self.bert = BertModel(config)
+ self.cls = BertOnlyMLMHead(config)
+
+ self.init_weights()
+ self.tie_weights()
+
+ def tie_weights(self):
+ """ Make sure we are sharing the input and output embeddings.
+ Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+ """
+ self._tie_or_clone_weights(self.cls.predictions.decoder,
+ self.bert.embeddings.word_embeddings)
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ masked_lm_labels=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ sequence_output = outputs[0]
+ prediction_scores = self.cls(sequence_output)
+
+ outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
+ if masked_lm_labels is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+ outputs = (masked_lm_loss,) + outputs
+
+ return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForNextSentencePrediction(BertPreTrainedModel):
+ r"""
+ **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring).
+ Indices should be in ``[0, 1]``.
+ ``0`` indicates sequence B is a continuation of sequence A,
+ ``1`` indicates sequence B is a random sequence.
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``next_sentence_label`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Next sequence prediction (classification) loss.
+ **seq_relationship_scores**: ``torch.FloatTensor`` of shape ``(batch_size, 2)``
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids)
+ seq_relationship_scores = outputs[0]
+
+ """
+ def __init__(self, config):
+ super(BertForNextSentencePrediction, self).__init__(config)
+
+ self.bert = BertModel(config)
+ self.cls = BertOnlyNSPHead(config)
+
+ self.init_weights()
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ next_sentence_label=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ pooled_output = outputs[1]
+
+ seq_relationship_score = self.cls(pooled_output)
+
+ outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
+ if next_sentence_label is not None:
+ loss_fct = CrossEntropyLoss(ignore_index=-1)
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+ outputs = (next_sentence_loss,) + outputs
+
+ return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
+ the pooled output) e.g. for GLUE tasks. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForSequenceClassification(BertPreTrainedModel):
+ r"""
+ **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for computing the sequence classification/regression loss.
+ Indices should be in ``[0, ..., config.num_labels - 1]``.
+ If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+ If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Classification (or regression if config.num_labels==1) loss.
+ **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids, labels=labels)
+ loss, logits = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(BertForSequenceClassification, self).__init__(config)
+ self.num_labels = config.num_labels
+
+ self.bert = BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+
+ self.init_weights()
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+ position_ids=None, head_mask=None, labels=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
+
+ if labels is not None:
+ if self.num_labels == 1:
+ # We are doing regression
+ loss_fct = MSELoss()
+ loss = loss_fct(logits.view(-1), labels.view(-1))
+ else:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForMultipleChoice(BertPreTrainedModel):
+ r"""
+ **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for computing the multiple choice classification loss.
+ Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+ of the input tensors. (see `input_ids` above)
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Classification loss.
+ **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+ of the input tensors. (see `input_ids` above).
+ Classification scores (before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
+ choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+ input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
+ labels = torch.tensor(1).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids, labels=labels)
+ loss, classification_scores = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(BertForMultipleChoice, self).__init__(config)
+
+ self.bert = BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ self.init_weights()
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+ position_ids=None, head_mask=None, labels=None):
+ num_choices = input_ids.shape[1]
+
+ input_ids = input_ids.view(-1, input_ids.size(-1))
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
+
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
+ the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForTokenClassification(BertPreTrainedModel):
+ r"""
+ **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Labels for computing the token classification loss.
+ Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Classification loss.
+ **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+ Classification scores (before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForTokenClassification.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1
+ outputs = model(input_ids, labels=labels)
+ loss, scores = outputs[:2]
+
+ """
+ def __init__(self, config):
+ super(BertForTokenClassification, self).__init__(config)
+ self.num_labels = config.num_labels
+
+ self.bert = BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ self.init_weights()
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+ position_ids=None, head_mask=None, labels=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ # Only keep active parts of the loss
+ if attention_mask is not None:
+ active_loss = attention_mask.view(-1) == 1
+ active_logits = logits.view(-1, self.num_labels)[active_loss]
+ active_labels = labels.view(-1)[active_loss]
+ loss = loss_fct(active_logits, active_labels)
+ else:
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ outputs = (loss,) + outputs
+
+ return outputs # (loss), scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
+ the hidden-states output to compute `span start logits` and `span end logits`). """,
+ BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+class BertForQuestionAnswering(BertPreTrainedModel):
+ r"""
+ **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+ Positions outside of the sequence are not taken into account for computing the loss.
+ **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`).
+ Positions outside of the sequence are not taken into account for computing the loss.
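+ (Added note, illustrative numbers: with sequence_length = 384, a start position of 500
+ is first clamped to 384 and then ignored by the loss via ``ignore_index``; see the
+ clamping logic in ``forward`` below.)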
+
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+ Total span extraction loss as the sum of a Cross-Entropy for the start and end positions.
+ **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+ Span-start scores (before SoftMax).
+ **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+ Span-end scores (before SoftMax).
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+ list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+ of shape ``(batch_size, sequence_length, hidden_size)``:
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples::
+
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+ model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
+ input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
+ start_positions = torch.tensor([1])
+ end_positions = torch.tensor([3])
+ outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+ loss, start_scores, end_scores = outputs[:3]
+
+ """
+ def __init__(self, config):
+ super(BertForQuestionAnswering, self).__init__(config)
+ self.num_labels = config.num_labels
+
+ self.bert = BertModel(config)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ self.init_weights()
+
+ def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+ start_positions=None, end_positions=None):
+
+ outputs = self.bert(input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask)
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1)
+ end_logits = end_logits.squeeze(-1)
+
+ outputs = (start_logits, end_logits,) + outputs[2:]
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, split adds a dimension
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions.clamp_(0, ignored_index)
+ end_positions.clamp_(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+ outputs = (total_loss,) + outputs
+
+ return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_ctrl.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_ctrl.py
new file mode 100644
index 0000000..55e64d3
--- /dev/null
+++ 
b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_ctrl.py @@ -0,0 +1,485 @@ +# coding=utf-8 +# Copyright 2018 Salesforce and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch CTRL model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open +import numpy as np +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_ctrl import CTRLConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +CTRL_PRETRAINED_MODEL_ARCHIVE_MAP = {"ctrl": "https://storage.googleapis.com/sf-ctrl/pytorch/seqlen256_v1.bin"} + + +def angle_defn(pos, i, d_model_size): + angle_rates = 1 / torch.pow(10000, (2 * (i//2)) / d_model_size) + return pos * angle_rates + +def positional_encoding(position, d_model_size, dtype): + # create the sinusoidal pattern for the positional encoding + angle_rads = (angle_defn(torch.arange(position, dtype=dtype).unsqueeze(1), + torch.arange(d_model_size, dtype=dtype).unsqueeze(0), + d_model_size)) + + sines = torch.sin(angle_rads[:, 0::2]) + cosines = torch.cos(angle_rads[:, 1::2]) + + pos_encoding = torch.cat([sines, cosines], dim=-1) + return pos_encoding + +def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None): + # calculate attention + matmul_qk = torch.matmul(q, k.permute(0,1,3,2)) + + dk = k.shape[-1] + scaled_attention_logits = matmul_qk / np.sqrt(dk) + + if mask is not None: + scaled_attention_logits += (mask * -1e4) + + if attention_mask is not None: + # Apply the attention mask + scaled_attention_logits = scaled_attention_logits + attention_mask + + attention_weights = torch.softmax(scaled_attention_logits, dim=-1) + + # Mask heads if we want to + if head_mask is not None: + attention_weights = attention_weights * head_mask + + output = torch.matmul(attention_weights, v) + + return output, attention_weights + + +class MultiHeadAttention(torch.nn.Module): + def __init__(self, d_model_size, num_heads, output_attentions=False): + super(MultiHeadAttention, self).__init__() + self.output_attentions = output_attentions + self.num_heads = num_heads + self.d_model_size = d_model_size + + self.depth = int(d_model_size / self.num_heads) + + self.Wq = torch.nn.Linear(d_model_size, d_model_size) + self.Wk = torch.nn.Linear(d_model_size, d_model_size) + self.Wv = torch.nn.Linear(d_model_size, d_model_size) + + self.dense = torch.nn.Linear(d_model_size, d_model_size) + + def split_into_heads(self, x, batch_size): + x = x.reshape(batch_size, -1, self.num_heads, self.depth) + return x.permute([0, 2, 1, 3]) + + def forward(self, v, k, q, mask, layer_past=None, attention_mask=None, head_mask=None): + 
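+ # Shape sketch (added comment, exposition only): q, k and v enter as
+ # (batch, seq_len, d_model_size); after split_into_heads they become
+ # (batch, num_heads, seq_len, depth) with depth = d_model_size // num_heads.
+ # Cached keys/values from `layer_past` are concatenated along the sequence
+ # axis below before attention is computed.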
batch_size = q.shape[0]
+
+ q = self.Wq(q)
+ k = self.Wk(k)
+ v = self.Wv(v)
+
+ q = self.split_into_heads(q, batch_size)
+ k = self.split_into_heads(k, batch_size)
+ v = self.split_into_heads(v, batch_size)
+ if layer_past is not None:
+ past_key, past_value = layer_past[0], layer_past[1]
+ k = torch.cat((past_key, k), dim=-2)
+ v = torch.cat((past_value, v), dim=-2)
+ present = torch.stack((k, v))
+
+ output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
+ scaled_attention = output[0].permute([0, 2, 1, 3])
+ attn = output[1]
+ original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
+ output = self.dense(original_size_attention)
+
+ outputs = (output, present)
+ if self.output_attentions:
+ outputs = outputs + (attn,)
+ return outputs
+
+
+
+def point_wise_feed_forward_network(d_model_size, dff):
+ return torch.nn.Sequential(torch.nn.Linear(d_model_size, dff),
+ torch.nn.ReLU(),
+ torch.nn.Linear(dff, d_model_size))
+
+
+class EncoderLayer(torch.nn.Module):
+ def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False):
+ super(EncoderLayer, self).__init__()
+
+ self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions)
+ self.ffn = point_wise_feed_forward_network(d_model_size, dff)
+
+ self.layernorm1 = torch.nn.LayerNorm(d_model_size, eps=1e-6)
+ self.layernorm2 = torch.nn.LayerNorm(d_model_size, eps=1e-6)
+
+ self.dropout1 = torch.nn.Dropout(rate)
+ self.dropout2 = torch.nn.Dropout(rate)
+
+ def forward(self, x, mask, layer_past=None, attention_mask=None, head_mask=None):
+ normed = self.layernorm1(x)
+ attn_outputs = self.multi_head_attention(normed, normed, normed, mask,
+ layer_past=layer_past,
+ attention_mask=attention_mask,
+ head_mask=head_mask)
+ attn_output = attn_outputs[0]
+ attn_output = self.dropout1(attn_output)
+ out1 = x + attn_output
+
+ out2 = self.layernorm2(out1)
+ ffn_output = self.ffn(out2)
+ ffn_output = self.dropout2(ffn_output)
+ out2 = out1 + ffn_output
+
+ outputs = (out2,) + attn_outputs[1:]
+ return outputs
+
+
+class CTRLPreTrainedModel(PreTrainedModel):
+ """ An abstract class to handle weights initialization and
+ a simple interface for downloading and loading pretrained models.
+ """
+ config_class = CTRLConfig
+ pretrained_model_archive_map = CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+ base_model_prefix = "transformer"
+
+ def _init_weights(self, module):
+ """ Initialize the weights.
+ """
+ if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+CTRL_START_DOCSTRING = r""" CTRL model was proposed in
+ `CTRL: A Conditional Transformer Language Model for Controllable Generation`_
+ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+ It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
+ corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+
+ This model is a PyTorch `torch.nn.Module`_ sub-class.
Use it as a regular PyTorch Module and
+ refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+ .. _`CTRL: A Conditional Transformer Language Model for Controllable Generation`:
+ https://www.github.com/salesforce/ctrl
+
+ .. _`torch.nn.Module`:
+ https://pytorch.org/docs/stable/nn.html#module
+
+ Parameters:
+ config (:class:`~transformers.CTRLConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+CTRL_INPUTS_DOCSTRING = r""" Inputs:
+ **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Indices of input sequence tokens in the vocabulary.
+ CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
+ the right rather than the left.
+ Indices can be obtained using :class:`transformers.CTRLTokenizer`.
+ See :func:`transformers.PreTrainedTokenizer.encode` and
+ :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+ **past**:
+ list of ``torch.FloatTensor`` (one for each layer)
+ that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+ (see `past` output below). Can be used to speed up sequential decoding.
+ **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+ Mask to avoid performing attention on padding token indices.
+ Mask values selected in ``[0, 1]``:
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+ **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+ The embeddings from these tokens will be summed with the respective token embeddings.
+ Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+ **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+ Indices of positions of each input sequence tokens in the position embeddings.
+ Selected in the range ``[0, config.max_position_embeddings - 1]``.
+ **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+ Mask to nullify selected heads of the self-attention modules.
+ Mask values selected in ``[0, 1]``:
+ ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+ CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+class CTRLModel(CTRLPreTrainedModel):
+ r"""
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+ **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+ Sequence of hidden-states at the last layer of the model.
+ **past**:
+ list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``
+ that contains pre-computed hidden-states (key and values in the attention blocks).
+ Can be used (see `past` input) to speed up sequential decoding.
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLModel.from_pretrained('ctrl') + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(CTRLModel, self).__init__(config) + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.output_past = config.output_past + + self.d_model_size = config.n_embd + self.num_layers = config.n_layer + + self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float) + + self.w = nn.Embedding(config.vocab_size, config.n_embd) + + self.dropout = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([EncoderLayer(config.n_embd, + config.n_head, + config.dff, + config.resid_pdrop, + config.output_attentions) for _ in range(config.n_layer)]) + self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.w = self._get_resized_embeddings(self.w, new_num_tokens) + return self.w + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = past[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + # Attention mask. + if attention_mask is not None: + attention_mask = attention_mask.view(-1, input_shape[-1]) + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. 
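+            # Worked example (illustrative values): a mask row [1, 1, 0]
+            # becomes [0.0, 0.0, -10000.0] after the two lines below, so the
+            # masked position contributes essentially zero weight after the
+            # softmax.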
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+            token_type_embeds = self.w(token_type_ids)
+            token_type_embeds *= np.sqrt(self.d_model_size)
+        else:
+            token_type_embeds = 0
+        position_ids = position_ids.view(-1, input_shape[-1])
+
+        inputs_embeds = self.w(input_ids)
+        seq_len = input_ids.shape[-1]
+        mask = torch.triu(torch.ones(seq_len, seq_len), 1).to(inputs_embeds.device)
+
+        inputs_embeds *= np.sqrt(self.d_model_size)
+
+        pos_embeds = self.pos_encoding[position_ids, :].to(inputs_embeds.device)
+
+        hidden_states = inputs_embeds + pos_embeds + token_type_embeds
+
+        hidden_states = self.dropout(hidden_states)
+
+        output_shape = input_shape + (inputs_embeds.size(-1),)
+        presents = ()
+        all_hidden_states = ()
+        all_attentions = []
+        for i, (h, layer_past) in enumerate(zip(self.h, past)):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+            outputs = h(hidden_states,
+                        mask,
+                        layer_past=layer_past,
+                        attention_mask=attention_mask,
+                        head_mask=head_mask[i])
+            hidden_states, present = outputs[:2]
+            if self.output_past:
+                presents = presents + (present,)
+
+            if self.output_attentions:
+                all_attentions.append(outputs[2])
+
+        hidden_states = self.layernorm(hidden_states)
+        hidden_states = hidden_states.view(*output_shape)
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_past:
+            outputs = outputs + (presents,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
+            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            outputs = outputs + (all_attentions,)
+        return outputs
+
+
+@add_start_docstrings("""The CTRL Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, CTRL_START_DOCSTRING, CTRL_INPUTS_DOCSTRING)
+class CTRLLMHeadModel(CTRLPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e.
you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import torch + from transformers import CTRLTokenizer, CTRLLMHeadModel + + tokenizer = CTRLTokenizer.from_pretrained('ctrl') + model = CTRLLMHeadModel.from_pretrained('ctrl') + + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(CTRLLMHeadModel, self).__init__(config) + self.transformer = CTRLModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.lm_head, self.transformer.w) + + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + transformer_outputs = self.transformer(input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + outputs = (lm_logits,) + transformer_outputs[1:] + + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_distilbert.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_distilbert.py new file mode 100644 index 0000000..d3b4ccf --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_distilbert.py @@ -0,0 +1,693 @@ +# coding=utf-8 +# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" PyTorch DistilBERT model + adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM) + and in part from HuggingFace PyTorch version of Google AI Bert model (https://github.com/google-research/bert) +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import copy +import sys +from io import open + +import itertools +import numpy as np + +import torch +import torch.nn as nn + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_distilbert import DistilBertConfig +from .file_utils import add_start_docstrings + +import logging +logger = logging.getLogger(__name__) + + +DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-pytorch_model.bin", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin" +} + + +### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ### +def gelu(x): + return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + out.requires_grad = False + +class Embeddings(nn.Module): + def __init__(self, + config): + super(Embeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim) + if config.sinusoidal_pos_embds: + create_sinusoidal_embeddings(n_pos=config.max_position_embeddings, + dim=config.dim, + out=self.position_embeddings.weight) + + self.LayerNorm = nn.LayerNorm(config.dim, eps=1e-12) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, input_ids): + """ + Parameters + ---------- + input_ids: torch.tensor(bs, max_seq_length) + The token ids to embed. 
+ + Outputs + ------- + embeddings: torch.tensor(bs, max_seq_length, dim) + The embedded tokens (plus position embeddings, no token_type embeddings) + """ + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) # (max_seq_length) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) # (bs, max_seq_length) + + word_embeddings = self.word_embeddings(input_ids) # (bs, max_seq_length, dim) + position_embeddings = self.position_embeddings(position_ids) # (bs, max_seq_length, dim) + + embeddings = word_embeddings + position_embeddings # (bs, max_seq_length, dim) + embeddings = self.LayerNorm(embeddings) # (bs, max_seq_length, dim) + embeddings = self.dropout(embeddings) # (bs, max_seq_length, dim) + return embeddings + +class MultiHeadSelfAttention(nn.Module): + def __init__(self, config): + super(MultiHeadSelfAttention, self).__init__() + + self.n_heads = config.n_heads + self.dim = config.dim + self.dropout = nn.Dropout(p=config.attention_dropout) + self.output_attentions = config.output_attentions + + assert self.dim % self.n_heads == 0 + + self.q_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.k_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim) + + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads + for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.q_lin = prune_linear_layer(self.q_lin, index) + self.k_lin = prune_linear_layer(self.k_lin, index) + self.v_lin = prune_linear_layer(self.v_lin, index) + self.out_lin = prune_linear_layer(self.out_lin, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward(self, query, key, value, mask, head_mask = None): + """ + Parameters + ---------- + query: torch.tensor(bs, seq_length, dim) + key: torch.tensor(bs, seq_length, dim) + value: torch.tensor(bs, seq_length, dim) + mask: torch.tensor(bs, seq_length) + + Outputs + ------- + weights: torch.tensor(bs, n_heads, seq_length, seq_length) + Attention weights + context: torch.tensor(bs, seq_length, dim) + Contextualized layer. 
Optional: only if `output_attentions=True` + """ + bs, q_length, dim = query.size() + k_length = key.size(1) + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + # assert key.size() == value.size() + + dim_per_head = self.dim // self.n_heads + + mask_reshp = (bs, 1, 1, k_length) + + def shape(x): + """ separate heads """ + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x): + """ group heads """ + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(query)) # (bs, n_heads, q_length, dim_per_head) + k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) + v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, q_length, dim_per_head) + scores = torch.matmul(q, k.transpose(2,3)) # (bs, n_heads, q_length, k_length) + mask = (mask==0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, q_length, k_length) + + weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) + weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, q_length, dim_per_head) + context = unshape(context) # (bs, q_length, dim) + context = self.out_lin(context) # (bs, q_length, dim) + + if self.output_attentions: + return (context, weights) + else: + return (context,) + +class FFN(nn.Module): + def __init__(self, config): + super(FFN, self).__init__() + self.dropout = nn.Dropout(p=config.dropout) + self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim) + self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim) + assert config.activation in ['relu', 'gelu'], "activation ({}) must be in ['relu', 'gelu']".format(config.activation) + self.activation = gelu if config.activation == 'gelu' else nn.ReLU() + + def forward(self, input): + x = self.lin1(input) + x = self.activation(x) + x = self.lin2(x) + x = self.dropout(x) + return x + +class TransformerBlock(nn.Module): + def __init__(self, config): + super(TransformerBlock, self).__init__() + + self.n_heads = config.n_heads + self.dim = config.dim + self.hidden_dim = config.hidden_dim + self.dropout = nn.Dropout(p=config.dropout) + self.activation = config.activation + self.output_attentions = config.output_attentions + + assert config.dim % config.n_heads == 0 + + self.attention = MultiHeadSelfAttention(config) + self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + self.ffn = FFN(config) + self.output_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12) + + def forward(self, x, attn_mask=None, head_mask=None): + """ + Parameters + ---------- + x: torch.tensor(bs, seq_length, dim) + attn_mask: torch.tensor(bs, seq_length) + + Outputs + ------- + sa_weights: torch.tensor(bs, n_heads, seq_length, seq_length) + The attention weights + ffn_output: torch.tensor(bs, seq_length, dim) + The output of the transformer block contextualization. 
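+
+        The block applies post-layer-norm residual connections, matching the
+        code below: ``sa = LayerNorm(x + SelfAttention(x))`` followed by
+        ``out = LayerNorm(sa + FFN(sa))``.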
+ """ + # Self-Attention + sa_output = self.attention(query=x, key=x, value=x, mask=attn_mask, head_mask=head_mask) + if self.output_attentions: + sa_output, sa_weights = sa_output # (bs, seq_length, dim), (bs, n_heads, seq_length, seq_length) + else: # To handle these `output_attention` or `output_hidden_states` cases returning tuples + assert type(sa_output) == tuple + sa_output = sa_output[0] + sa_output = self.sa_layer_norm(sa_output + x) # (bs, seq_length, dim) + + # Feed Forward Network + ffn_output = self.ffn(sa_output) # (bs, seq_length, dim) + ffn_output = self.output_layer_norm(ffn_output + sa_output) # (bs, seq_length, dim) + + output = (ffn_output,) + if self.output_attentions: + output = (sa_weights,) + output + return output + + +class Transformer(nn.Module): + def __init__(self, config): + super(Transformer, self).__init__() + self.n_layers = config.n_layers + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + layer = TransformerBlock(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.n_layers)]) + + def forward(self, x, attn_mask=None, head_mask=None): + """ + Parameters + ---------- + x: torch.tensor(bs, seq_length, dim) + Input sequence embedded. + attn_mask: torch.tensor(bs, seq_length) + Attention mask on the sequence. + + Outputs + ------- + hidden_state: torch.tensor(bs, seq_length, dim) + Sequence of hiddens states in the last (top) layer + all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)] + Tuple of length n_layers with the hidden states from each layer. + Optional: only if output_hidden_states=True + all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)] + Tuple of length n_layers with the attention weights from each layer + Optional: only if output_attentions=True + """ + all_hidden_states = () + all_attentions = () + + hidden_state = x + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + layer_outputs = layer_module(x=hidden_state, + attn_mask=attn_mask, + head_mask=head_mask[i]) + hidden_state = layer_outputs[-1] + + if self.output_attentions: + assert len(layer_outputs) == 2 + attentions = layer_outputs[0] + all_attentions = all_attentions + (attentions,) + else: + assert len(layer_outputs) == 1 + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_state,) + + outputs = (hidden_state,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +### INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL ### +class DistilBertPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for downloading and loading pretrained models. + """ + config_class = DistilBertConfig + pretrained_model_archive_map = DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = None + base_model_prefix = "distilbert" + + def __init__(self, *inputs, **kwargs): + super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, nn.Embedding): + if module.weight.requires_grad: + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +DISTILBERT_START_DOCSTRING = r""" + DistilBERT is a small, fast, cheap and light Transformer model + trained by distilling Bert base. It has 40% less parameters than + `bert-base-uncased`, runs 60% faster while preserving over 95% of + Bert's performances as measured on the GLUE language understanding benchmark. + + Here are the differences between the interface of Bert and DistilBert: + + - DistilBert doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or `[SEP]`) + - DistilBert doesn't have options to select the input positions (`position_ids` input). This could be added if necessary though, just let's us know if you need this option. + + For more information on DistilBERT, please refer to our + `detailed blog post`_ + + .. _`detailed blog post`: + https://medium.com/huggingface/distilbert-8cf3380435b5 + + Parameters: + config (:class:`~transformers.DistilBertConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +DISTILBERT_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids** ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + The input sequences should start with `[CLS]` and end with `[SEP]` tokens. + + For now, ONLY BertTokenizer(`bert-base-uncased`) is supported and you should use this tokenizer when using DistilBERT. + **attention_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertModel(DistilBertPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+        model = DistilBertModel.from_pretrained('distilbert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(DistilBertModel, self).__init__(config)
+
+        self.embeddings = Embeddings(config)   # Embeddings
+        self.transformer = Transformer(config)  # Encoder
+
+        self.init_weights()
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.transformer.layer[layer].attention.prune_heads(heads)
+
+    def forward(self, input_ids, attention_mask=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)  # (bs, seq_length)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids)  # (bs, seq_length, dim)
+        tfmr_output = self.transformer(x=embedding_output,
+                                       attn_mask=attention_mask,
+                                       head_mask=head_mask)
+        hidden_state = tfmr_output[0]
+        output = (hidden_state, ) + tfmr_output[1:]
+
+        return output  # last-layer hidden-state, (all hidden_states), (all attentions)
+
+
+@add_start_docstrings("""DistilBert Model with a `masked language modeling` head on top.
""", + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForMaskedLM(DistilBertPreTrainedModel): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(DistilBertForMaskedLM, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.distilbert = DistilBertModel(config) + self.vocab_transform = nn.Linear(config.dim, config.dim) + self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12) + self.vocab_projector = nn.Linear(config.dim, config.vocab_size) + + self.init_weights() + self.tie_weights() + + self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.vocab_projector, + self.distilbert.embeddings.word_embeddings) + + def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None): + dlbrt_output = self.distilbert(input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_states = dlbrt_output[0] # (bs, seq_length, dim) + prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim) + prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_layer_norm(prediction_logits) # (bs, seq_length, dim) + prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) + + outputs = (prediction_logits, ) + dlbrt_output[1:] + if masked_lm_labels is not None: + mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), + masked_lm_labels.view(-1)) + outputs = (mlm_loss,) + outputs + + return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) + + +@add_start_docstrings("""DistilBert Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForSequenceClassification(DistilBertPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(DistilBertForSequenceClassification, self).__init__(config) + self.num_labels = config.num_labels + + self.distilbert = DistilBertModel(config) + self.pre_classifier = nn.Linear(config.dim, config.dim) + self.classifier = nn.Linear(config.dim, config.num_labels) + self.dropout = nn.Dropout(config.seq_classif_dropout) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None): + distilbert_output = self.distilbert(input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_state = distilbert_output[0] # (bs, seq_len, dim) + pooled_output = hidden_state[:, 0] # (bs, dim) + pooled_output = self.pre_classifier(pooled_output) # (bs, dim) + pooled_output = nn.ReLU()(pooled_output) # (bs, dim) + pooled_output = self.dropout(pooled_output) # (bs, dim) + logits = self.classifier(pooled_output) # (bs, dim) + + outputs = (logits,) + distilbert_output[1:] + if labels is not None: + if self.num_labels == 1: + loss_fct = nn.MSELoss() + loss = loss_fct(logits.view(-1), labels.view(-1)) + else: + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), logits, (hidden_states), (attentions) + + +@add_start_docstrings("""DistilBert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING) +class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') + model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:3] + + """ + def __init__(self, config): + super(DistilBertForQuestionAnswering, self).__init__(config) + + self.distilbert = DistilBertModel(config) + self.qa_outputs = nn.Linear(config.dim, config.num_labels) + assert config.num_labels == 2 + self.dropout = nn.Dropout(config.qa_dropout) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None): + distilbert_output = self.distilbert(input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_states = distilbert_output[0] # (bs, max_query_len, dim) + + hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim) + logits = self.qa_outputs(hidden_states) # (bs, max_query_len, 2) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) # (bs, max_query_len) + end_logits = end_logits.squeeze(-1) # (bs, max_query_len) + + outputs = (start_logits, end_logits,) + distilbert_output[1:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = nn.CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_gpt2.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_gpt2.py new file mode 100644 index 0000000..0b5b83a --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_gpt2.py @@ -0,0 +1,662 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch OpenAI GPT-2 model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_gpt2 import GPT2Config +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin", + "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin", + "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin", + "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",} + +def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. 
Please see "
+                     "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(gpt2_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array.squeeze())
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'w' or l[0] == 'g':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'b':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'wpe' or l[0] == 'wte':
+                pointer = getattr(pointer, l[0])
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def gelu(x):
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+
+
+class Attention(nn.Module):
+    def __init__(self, nx, n_ctx, config, scale=False):
+        super(Attention, self).__init__()
+        self.output_attentions = config.output_attentions
+
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        assert n_state % config.n_head == 0
+        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale
+
+        self.c_attn = Conv1D(n_state * 3, nx)
+        self.c_proj = Conv1D(n_state, nx)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_head, self.split_size // self.n_head)
+        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
+        for head in heads:
+            # Compute how many pruned heads are before the head and move the index accordingly
+            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
+
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+        # Update hyper params
+        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
+        self.n_head = self.n_head - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
+        w = torch.matmul(q, k)
+        if self.scale:
+            w = w / math.sqrt(v.size(-1))
+        nd, ns = w.size(-2), w.size(-1)
+        b = self.bias[:, :, ns-nd:ns, :ns]
+        w = w * b - 1e4 * (1 - b)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
+        w = nn.Softmax(dim=-1)(w)
+        w = self.attn_dropout(w)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            w = w * head_mask
+
+        outputs = [torch.matmul(w, v)]
+        if self.output_attentions:
+            outputs.append(w)
+        return outputs
+
+    def merge_heads(self, x):
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
+        else:
+            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
+        x = self.c_attn(x)
+        query, key, value = x.split(self.split_size, dim=2)
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+        if layer_past is not None:
+            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
+            key = torch.cat((past_key, key), dim=-1)
+            value = torch.cat((past_value, value), dim=-2)
+        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
+
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
+        a = attn_outputs[0]
+
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a)
+
+        outputs = [a, present] + attn_outputs[1:]
+        return outputs  # a, present, (attentions)
+
+
+class MLP(nn.Module):
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        nx = config.n_embd
+        self.c_fc = Conv1D(n_state, nx)
+        self.c_proj = Conv1D(nx, n_state)
+        self.act = gelu
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, x):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        return self.dropout(h2)
+
+
+class Block(nn.Module):
+    def __init__(self, n_ctx, config, scale=False):
+        super(Block, self).__init__()
+        nx = config.n_embd
+        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
+        self.attn = Attention(nx, n_ctx, config, scale)
+        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
+        self.mlp = MLP(4 * nx, config)
+
+    def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
+        output_attn = self.attn(self.ln_1(x),
+                                layer_past=layer_past,
+                                attention_mask=attention_mask,
+                                head_mask=head_mask)
+        a = output_attn[0]  # output_attn: a, present, (attentions)
+
+        x = x + a
+        m = self.mlp(self.ln_2(x))
+        x = x + m
+
+        outputs = [x] + output_attn[1:]
+        return outputs  # x, present, (attentions)
+
+
+class GPT2PreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = GPT2Config
+    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """ Initialize the weights.
+ """ + if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in + `Language Models are Unsupervised Multitask Learners`_ + by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**. + It's a causal (unidirectional) transformer pre-trained using language modeling on a very large + corpus of ~40 GB of text data. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Language Models are Unsupervised Multitask Learners`: + https://openai.com/blog/better-language-models/ + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.GPT2Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +GPT2_INPUTS_DOCSTRING = r""" Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + Indices can be obtained using :class:`transformers.GPT2Tokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **past**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `past` output below). Can be used to speed up sequential decoding. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 
+""" + +@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.", + GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class GPT2Model(GPT2PreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2Model.from_pretrained('gpt2') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(GPT2Model, self).__init__(config) + self.output_hidden_states = config.output_hidden_states + self.output_attentions = config.output_attentions + self.output_past = config.output_past + + self.wte = nn.Embedding(config.vocab_size, config.n_embd) + self.wpe = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) + self.ln_f = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.wte = self._get_resized_embeddings(self.wte, new_num_tokens) + return self.wte + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + """ + for layer, heads in heads_to_prune.items(): + self.h[layer].attn.prune_heads(heads) + + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + if token_type_ids is not None: + token_type_ids = token_type_ids.view(-1, input_shape[-1]) + if position_ids is not None: + position_ids = position_ids.view(-1, input_shape[-1]) + + if past is None: + past_length = 0 + past = [None] * len(self.h) + else: + past_length = past[0][0].size(-2) + if position_ids is None: + position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + + # Attention mask. 
+        if attention_mask is not None:
+            attention_mask = attention_mask.view(-1, input_shape[-1])
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # This attention mask is simpler than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        presents = ()
+        all_attentions = []
+        all_hidden_states = ()
+        for i, (block, layer_past) in enumerate(zip(self.h, past)):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+
+            outputs = block(hidden_states,
+                            layer_past=layer_past,
+                            attention_mask=attention_mask,
+                            head_mask=head_mask[i])
+
+            hidden_states, present = outputs[:2]
+            if self.output_past:
+                presents = presents + (present,)
+
+            if self.output_attentions:
+                all_attentions.append(outputs[2])
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(*output_shape)
+        # Add last hidden state
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_past:
+            outputs = outputs + (presents,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
+            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            outputs = outputs + (all_attentions,)
+        return outputs  # last hidden state, (presents), (all hidden_states), (attentions)
+
+
+@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). 
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class GPT2LMHeadModel(GPT2PreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import torch + from transformers import GPT2Tokenizer, GPT2LMHeadModel + + tokenizer = GPT2Tokenizer.from_pretrained('gpt2') + model = GPT2LMHeadModel.from_pretrained('gpt2') + + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(GPT2LMHeadModel, self).__init__(config) + self.transformer = GPT2Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) + + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + transformer_outputs = self.transformer(input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), lm_logits, presents, (all hidden_states), (attentions) + + +@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. +The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). +""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING) +class GPT2DoubleHeadsModel(GPT2PreTrainedModel): + r""" + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Multiple choice classification loss. + **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). + **past**: + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + that contains pre-computed hidden-states (key and values in the attention blocks). + Can be used (see `past` input) to speed up sequential decoding. 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import torch
+        from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
+
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+
+        # Add a [CLS] token to the vocabulary (we should train it also!)
+        tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        print(tokenizer.cls_token_id, len(tokenizer))  # The newly added token is the last token of the vocabulary
+
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        encoded_choices = [tokenizer.encode(s) for s in choices]
+        cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]
+
+        input_ids = torch.tensor(encoded_choices).unsqueeze(0)  # Batch size: 1, number of choices: 2
+        mc_token_ids = torch.tensor([cls_token_location])  # Batch size: 1
+
+        outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(GPT2DoubleHeadsModel, self).__init__(config)
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = SequenceSummary(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.wte) + + def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + mc_token_ids=None, lm_labels=None, mc_labels=None): + transformer_outputs = self.transformer(input_ids, + past=past, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1) + + outputs = (lm_logits, mc_logits) + transformer_outputs[1:] + if mc_labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), + mc_labels.view(-1)) + outputs = (loss,) + outputs + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_openai.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_openai.py new file mode 100644 index 0000000..52f3b7d --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_openai.py @@ -0,0 +1,621 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch OpenAI GPT model.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import json +import logging +import math +import os +import sys +from io import open + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_openai import OpenAIGPTConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"} + + +def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): + """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here) + """ + import re + import numpy as np + + if '.ckpt' in openai_checkpoint_folder_path: + openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path) + + logger.info("Loading weights from {}".format(openai_checkpoint_folder_path)) + + names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8')) + shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8')) + offsets = np.cumsum([np.prod(shape) for shape in shapes]) + init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)] + init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1] + init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)] + + # This was used when we had a single embedding matrix for positions and tokens + # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0) + # del init_params[1] + init_params = [arr.squeeze() for arr in init_params] + + try: + assert model.tokens_embed.weight.shape == init_params[1].shape + assert model.positions_embed.weight.shape == init_params[0].shape + except AssertionError as e: + e.args += (model.tokens_embed.weight.shape, init_params[1].shape) + e.args += (model.positions_embed.weight.shape, init_params[0].shape) + raise + + model.tokens_embed.weight.data = torch.from_numpy(init_params[1]) + model.positions_embed.weight.data = torch.from_numpy(init_params[0]) + names.pop(0) + # Pop position and token embedding arrays + init_params.pop(0) + init_params.pop(0) + + for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]): + name = name[6:] # skip "model/" + assert name[-2:] == ":0" + name = name[:-2] + name = name.split('/') + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+\d+', m_name): + l = re.split(r'(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'g': + pointer = getattr(pointer, 'weight') + elif l[0] == 'b': + pointer = getattr(pointer, 'bias') + elif l[0] == 'w': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +def gelu(x): + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 
+
+
+def swish(x):
+    return x * torch.sigmoid(x)
+
+
+# Note: nn.ReLU is instantiated so every entry of ACT_FNS is directly callable
+# on a tensor, like swish and gelu.
+ACT_FNS = {"relu": nn.ReLU(), "swish": swish, "gelu": gelu}
+
+
+class Attention(nn.Module):
+    def __init__(self, nx, n_ctx, config, scale=False):
+        super(Attention, self).__init__()
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        assert n_state % config.n_head == 0
+        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale
+
+        self.output_attentions = config.output_attentions
+
+        self.c_attn = Conv1D(n_state * 3, nx)
+        self.c_proj = Conv1D(n_state, nx)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_head, self.split_size // self.n_head)
+        heads = set(heads) - self.pruned_heads
+        for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+        # Update hyper params
+        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
+        self.n_head = self.n_head - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _attn(self, q, k, v, attention_mask=None, head_mask=None):
+        w = torch.matmul(q, k)
+        if self.scale:
+            w = w / math.sqrt(v.size(-1))
+        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
+        # XD: self.b may be larger than w, so we need to crop it
+        b = self.bias[:, :, : w.size(-2), : w.size(-1)]
+        w = w * b - 1e4 * (1 - b)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            w = w + attention_mask
+
+        w = nn.Softmax(dim=-1)(w)
+        w = self.attn_dropout(w)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            w = w * head_mask
+
+        outputs = [torch.matmul(w, v)]
+        if self.output_attentions:
+            outputs.append(w)
+        return outputs
+
+    def merge_heads(self, x):
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)
+        else:
+            return x.permute(0, 2, 1, 3)
+
+    def forward(self, x, attention_mask=None, head_mask=None):
+        x = self.c_attn(x)
+        query, key, value = x.split(self.split_size, dim=2)
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+
+        attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
+        a = attn_outputs[0]
+
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a)
+
+        outputs = [a] + attn_outputs[1:]
+        return outputs  # a, (attentions)
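+
+
+# Illustrative usage (not part of the original patch; `config` is assumed to be
+# an OpenAIGPTConfig): pruning drops the given heads' slices from c_attn/c_proj
+# and updates the bookkeeping, e.g.:
+#   attn = Attention(nx=768, n_ctx=1024, config=config, scale=True)
+#   attn.prune_heads([0, 2])  # remove heads 0 and 2
+#   assert attn.n_head == config.n_head - 2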
+
+
+class MLP(nn.Module):
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        nx = config.n_embd
+        self.c_fc = Conv1D(n_state, nx)
+        self.c_proj = Conv1D(nx, n_state)
+        self.act = ACT_FNS[config.afn]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, x):
+        h = self.act(self.c_fc(x))
+        h2 = self.c_proj(h)
+        return self.dropout(h2)
+
+
+class Block(nn.Module):
+    def __init__(self, n_ctx, config, scale=False):
+        super(Block, self).__init__()
+        nx = config.n_embd
+        self.attn = Attention(nx, n_ctx, config, scale)
+        self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
+        self.mlp = MLP(4 * nx, config)
+        self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon)
+
+    def forward(self, x, attention_mask=None, head_mask=None):
+        attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
+        a = attn_outputs[0]
+
+        n = self.ln_1(x + a)
+        m = self.mlp(n)
+        h = self.ln_2(n + m)
+
+        outputs = [h] + attn_outputs[1:]
+        return outputs
+
+
+class OpenAIGPTPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = OpenAIGPTConfig
+    pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_openai_gpt
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
+    `Improving Language Understanding by Generative Pre-Training`_
+    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
+    corpus with long range dependencies, the Toronto Book Corpus.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`Improving Language Understanding by Generative Pre-Training`:
+        https://openai.com/blog/language-unsupervised/
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+            Indices can be obtained using :class:`transformers.OpenAIGPTTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices. 
+ Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices) + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.", + OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class OpenAIGPTModel(OpenAIGPTPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = OpenAIGPTModel.from_pretrained('openai-gpt') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(OpenAIGPTModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd) + self.positions_embed = nn.Embedding(config.n_positions, config.n_embd) + self.drop = nn.Dropout(config.embd_pdrop) + self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens) + return self.tokens_embed + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if position_ids is None:
+            # This was used when we had a single embedding matrix for position and token embeddings
+            # start = self.config.vocab_size + self.config.n_special
+            # end = start + input_ids.size(-1)
+            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # This attention mask is simpler than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.tokens_embed(input_ids)
+        position_embeds = self.positions_embed(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.tokens_embed(token_type_ids)
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        all_attentions = ()
+        all_hidden_states = ()
+        for i, block in enumerate(self.h):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+
+            outputs = block(hidden_states, attention_mask, head_mask[i])
+            hidden_states = outputs[0]
+            if self.output_attentions:
+                all_attentions = all_attentions + (outputs[1],)
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+
+        outputs = 
(hidden_states.view(*output_shape),) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last hidden state, (all hidden states), (all attentions) + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top +(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') + model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=input_ids) + loss, logits = outputs[:2] + + """ + def __init__(self, config): + super(OpenAIGPTLMHeadModel, self).__init__(config) + self.transformer = OpenAIGPTModel(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the input and output embeddings. + Export to TorchScript can't handle parameter sharing so we are cloning them instead. 
+ """ + self._tie_or_clone_weights(self.lm_head, + self.transformer.tokens_embed) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + hidden_states = transformer_outputs[0] + lm_logits = self.lm_head(hidden_states) + + outputs = (lm_logits,) + transformer_outputs[1:] + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (loss), lm_logits, (all hidden states), (all attentions) + + +@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification +head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers. +The language modeling head has its weights tied to the input embeddings, +the classification head takes as input the input of a specified classification token index in the input sequence). +""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING) +class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): + r""" + **mc_token_ids**: (`optional`, default to index of the last token of the input) ``torch.LongTensor`` of shape ``(batch_size, num_choices)``: + Index of the classification token in each input sequence. + Selected in the range ``[0, input_ids.size(-1) - 1[``. + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size] + with indices selected in [0, ..., num_choices]. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **mc_loss**: (`optional`, returned when ``multiple_choice_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Multiple choice classification loss. + **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` + Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax). 
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
+        model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, mc_token_ids=mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+
+        self.transformer = OpenAIGPTModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = SequenceSummary(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                mc_token_ids=None, lm_labels=None, mc_labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+
+        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
+                            mc_labels.view(-1))
+            outputs = (loss,) + outputs
+        if lm_labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_roberta.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_roberta.py
new file mode 100644
index 0000000..4ea0800
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_roberta.py
@@ -0,0 +1,470 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch RoBERTa model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +import torch +import torch.nn as nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu +from .configuration_roberta import RobertaConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin", + 'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin", + 'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin", +} + +class RobertaEmbeddings(BertEmbeddings): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + def __init__(self, config): + super(RobertaEmbeddings, self).__init__(config) + self.padding_idx = 1 + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=self.padding_idx) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, + padding_idx=self.padding_idx) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + # Position numbers begin at padding_idx+1. Padding symbols are ignored. + # cf. fairseq's `utils.make_positions` + position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + return super(RobertaEmbeddings, self).forward(input_ids, + token_type_ids=token_type_ids, + position_ids=position_ids) + + +ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in + `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ + by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, + Veselin Stoyanov. It is based on Google's BERT model released in 2018. + + It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining + objective and training with much larger mini-batches and learning rates. + + This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained + models. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: + https://arxiv.org/abs/1907.11692 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the + model. 
Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+ROBERTA_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens: <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
+
+            (b) For single sequences:
+
+                ``tokens: <s> the dog is hairy . </s>``
+
+            Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with
+            the ``add_special_tokens`` parameter set to ``True``.
+
+            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`, need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrix is not trained (not pretrained during RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
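+
+# Illustrative note (not part of the original patch): with padding_idx = 1,
+# RobertaEmbeddings above assigns a 4-token input the position ids
+# torch.arange(2, 6) -> tensor([2, 3, 4, 5]), mirroring fairseq's
+# utils.make_positions convention; padding symbols keep position padding_idx.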
+
+@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaModel(BertModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaModel.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaModel, self).__init__(config)
+
+        self.embeddings = RobertaEmbeddings(config)
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if input_ids[:, 0].sum().item() != 0:
+            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
+                           "This model requires special tokens in order to work. "
+                           "Please specify add_special_tokens=True in your tokenizer.encode() "
+                           "or tokenizer.convert_tokens_to_ids().")
+        return super(RobertaModel, self).forward(input_ids,
+                                                 attention_mask=attention_mask,
+                                                 token_type_ids=token_type_ids,
+                                                 position_ids=position_ids,
+                                                 head_mask=head_mask)
+
+
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForMaskedLM(BertPreTrainedModel):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForMaskedLM.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForMaskedLM, self).__init__(config)
+
+        self.roberta = RobertaModel(config)
+        self.lm_head = RobertaLMHead(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            outputs = (masked_lm_loss,) + outputs
+
+        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
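+
+
+# Illustrative check (not part of the original patch): after tie_weights(),
+# the LM head decoder shares its weight tensor with the input embeddings
+# (unless cloned for TorchScript export), e.g.:
+#   model = RobertaForMaskedLM.from_pretrained('roberta-base')
+#   model.lm_head.decoder.weight.data_ptr() == \
+#       model.roberta.embeddings.word_embeddings.weight.data_ptr()  # True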
+
+
+class RobertaLMHead(nn.Module):
+    """Roberta Head for masked language modeling."""
+
+    def __init__(self, config):
+        super(RobertaLMHead, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+    def forward(self, features, **kwargs):
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        x = self.decoder(x) + self.bias
+
+        return x
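+
+
+# Illustrative shape note (not part of the original patch): the bias is kept as
+# a separate Parameter so weight tying shares only the projection matrix; for
+# roberta-base (hidden_size=768, vocab_size=50265):
+#   RobertaLMHead(config)(torch.zeros(1, 4, 768)).shape  # (1, 4, 50265)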
+
+
+@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer
+    on top of the pooled output) e.g. for GLUE tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForSequenceClassification(BertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.classifier = RobertaClassificationHead(config)
+
+        self.init_weights()  # initialize the newly constructed classification head
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                labels=None):
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForMultipleChoice(BertPreTrainedModel):
+    r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            To match pre-training, RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
+
+                ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
+
+            (b) For single sequences:
+
+                ``tokens: [CLS] the dog is hairy . 
[SEP]``
+
+        ``token_type_ids:   0   0   0   0  0     0   0``
+
+        Indices can be obtained using :class:`transformers.BertTokenizer`.
+        See :func:`transformers.PreTrainedTokenizer.encode` and
+        :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
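The forward pass that follows flattens the `num_choices` dimension into the batch before running the encoder, then folds the per-choice scores back. A shape-only sketch of that round trip (pure tensor ops, the `hidden` size is invented for illustration):

```python
import torch

batch_size, num_choices, seq_len, hidden = 2, 4, 16, 32
input_ids = torch.randint(0, 100, (batch_size, num_choices, seq_len))

# Flatten choices into the batch dimension, as the forward() below does.
flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # (8, 16)

# Stand-ins for the encoder's pooled output and the 1-unit classifier.
pooled = torch.randn(flat_input_ids.size(0), hidden)       # (8, 32)
logits = torch.nn.Linear(hidden, 1)(pooled)                # one score per (example, choice)

# Fold back: cross-entropy is then taken over the num_choices axis.
reshaped_logits = logits.view(-1, num_choices)             # (2, 4)
print(reshaped_logits.shape)
```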
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForMultipleChoice.from_pretrained('roberta-base')
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForMultipleChoice, self).__init__(config)
+
+        self.roberta = RobertaModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        self.init_weights()
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
+        num_choices = input_ids.shape[1]
+
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
+                               attention_mask=flat_attention_mask, head_mask=head_mask)
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        outputs = (reshaped_logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super(RobertaClassificationHead, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl.py
new file mode 100644
index 0000000..6d430e1
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl.py
@@ -0,0 +1,890 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. + In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py +""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import json +import math +import logging +import collections +import sys +from io import open + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn import CrossEntropyLoss +from torch.nn.parameter import Parameter + +from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary +from .configuration_transfo_xl import TransfoXLConfig +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-pytorch_model.bin", +} + +def build_tf_to_pytorch_map(model, config): + """ A map of modules from TF to PyTorch. + This time I use a map to keep the PyTorch model as identical to the original PyTorch model as possible. + """ + tf_to_pt_map = {} + + if hasattr(model, 'transformer'): + # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax + tf_to_pt_map.update({ + "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight, + "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias}) + for i, (out_l, proj_l, tie_proj) in enumerate(zip( + model.crit.out_layers, + model.crit.out_projs, + config.tie_projs)): + layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i + if config.tie_weight: + tf_to_pt_map.update({ + layer_str + 'b': out_l.bias}) + else: + raise NotImplementedError + # I don't think this is implemented in the TF code + tf_to_pt_map.update({ + layer_str + 'lookup_table': out_l.weight, + layer_str + 'b': out_l.bias}) + if not tie_proj: + tf_to_pt_map.update({ + layer_str + 'proj': proj_l + }) + # Now load the rest of the transformer + model = model.transformer + + # Embeddings + for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)): + layer_str = "transformer/adaptive_embed/cutoff_%d/" % i + tf_to_pt_map.update({ + layer_str + 'lookup_table': embed_l.weight, + layer_str + 'proj_W': proj_l + }) + + # Transformer blocks + for i, b in enumerate(model.layers): + layer_str = "transformer/layer_%d/" % i + tf_to_pt_map.update({ + layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight, + layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias, + layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight, + layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight, + layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight, + layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight, + layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias, + layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight, + layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias, 
+ layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight, + layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias, + }) + + # Relative positioning biases + if config.untie_r: + r_r_list = [] + r_w_list = [] + for b in model.layers: + r_r_list.append(b.dec_attn.r_r_bias) + r_w_list.append(b.dec_attn.r_w_bias) + else: + r_r_list = [model.r_r_bias] + r_w_list = [model.r_w_bias] + tf_to_pt_map.update({ + 'transformer/r_r_bias': r_r_list, + 'transformer/r_w_bias': r_w_list}) + return tf_to_pt_map + +def load_tf_weights_in_transfo_xl(model, config, tf_path): + """ Load tf checkpoints in a pytorch model + """ + try: + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + # Build TF to PyTorch weights loading map + tf_to_pt_map = build_tf_to_pytorch_map(model, config) + + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + tf_weights = {} + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + tf_weights[name] = array + + for name, pointer in tf_to_pt_map.items(): + assert name in tf_weights + array = tf_weights[name] + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if 'kernel' in name or 'proj' in name: + array = np.transpose(array) + if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1: + # Here we will split the TF weigths + assert len(pointer) == array.shape[0] + for i, p_i in enumerate(pointer): + arr_i = array[i, ...] 
+ try: + assert p_i.shape == arr_i.shape + except AssertionError as e: + e.args += (p_i.shape, arr_i.shape) + raise + logger.info("Initialize PyTorch weight {} for layer {}".format(name, i)) + p_i.data = torch.from_numpy(arr_i) + else: + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + tf_weights.pop(name, None) + tf_weights.pop(name + '/Adam', None) + tf_weights.pop(name + '/Adam_1', None) + + logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + return model + + +class PositionalEmbedding(nn.Module): + def __init__(self, demb): + super(PositionalEmbedding, self).__init__() + + self.demb = demb + + inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, pos_seq, bsz=None): + sinusoid_inp = torch.ger(pos_seq, self.inv_freq) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + + if bsz is not None: + return pos_emb[:,None,:].expand(-1, bsz, -1) + else: + return pos_emb[:,None,:] + + + +class PositionwiseFF(nn.Module): + def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5): + super(PositionwiseFF, self).__init__() + + self.d_model = d_model + self.d_inner = d_inner + self.dropout = dropout + + self.CoreNet = nn.Sequential( + nn.Linear(d_model, d_inner), nn.ReLU(inplace=True), + nn.Dropout(dropout), + nn.Linear(d_inner, d_model), + nn.Dropout(dropout), + ) + + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) + + self.pre_lnorm = pre_lnorm + + def forward(self, inp): + if self.pre_lnorm: + ##### layer normalization + positionwise feed-forward + core_out = self.CoreNet(self.layer_norm(inp)) + + ##### residual connection + output = core_out + inp + else: + ##### positionwise feed-forward + core_out = self.CoreNet(inp) + + ##### residual connection + layer normalization + output = self.layer_norm(inp + core_out) + + return output + + +class RelPartialLearnableMultiHeadAttn(nn.Module): + def __init__(self, n_head, d_model, d_head, dropout, dropatt=0, + tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False, + r_r_bias=None, r_w_bias=None, output_attentions=False, + layer_norm_epsilon=1e-5): + super(RelPartialLearnableMultiHeadAttn, self).__init__() + + self.output_attentions = output_attentions + self.n_head = n_head + self.d_model = d_model + self.d_head = d_head + self.dropout = dropout + + self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False) + + self.drop = nn.Dropout(dropout) + self.dropatt = nn.Dropout(dropatt) + self.o_net = nn.Linear(n_head * d_head, d_model, bias=False) + + self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon) + + self.scale = 1 / (d_head ** 0.5) + + self.pre_lnorm = pre_lnorm + + if r_r_bias is None or r_w_bias is None: # Biases are not shared + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + else: + self.r_r_bias = r_r_bias + self.r_w_bias = r_w_bias + + self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False) + + def _rel_shift(self, x): + zero_pad_shape = (x.size(0), 1) + x.size()[2:] + zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype) + x_padded = torch.cat([zero_pad, x], dim=1) + + x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:] + 
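The `_rel_shift` helper being defined here realigns the position-bias term so that column `j` indexes relative distance instead of absolute key position; the whole trick is one zero-pad, one reshape of the flat buffer, and one slice. A minimal standalone sketch on a 2-D tensor:

```python
import torch

qlen, klen = 3, 5
x = torch.arange(qlen * klen, dtype=torch.float).view(qlen, klen)

zero_pad = torch.zeros((qlen, 1), dtype=x.dtype)
x_padded = torch.cat([zero_pad, x], dim=1)    # (qlen, klen + 1)
x_padded = x_padded.view(klen + 1, qlen)      # reinterpret the flat buffer
shifted = x_padded[1:].view_as(x)             # row i ends up shifted left by (qlen - 1 - i)

print(shifted)
# tensor([[ 2.,  3.,  4.,  0.,  5.],
#         [ 6.,  7.,  8.,  9.,  0.],
#         [10., 11., 12., 13., 14.]])
```

The out-of-range entries that wrap around are harmless in the model because the attention mask excludes them.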
x_padded = x_padded.view(*x_padded_shape) + + x = x_padded[1:].view_as(x) + + return x + + def forward(self, w, r, attn_mask=None, mems=None, head_mask=None): + qlen, rlen, bsz = w.size(0), r.size(0), w.size(1) + + if mems is not None: + cat = torch.cat([mems, w], 0) + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(cat)) + else: + w_heads = self.qkv_net(cat) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + w_head_q = w_head_q[-qlen:] + else: + if self.pre_lnorm: + w_heads = self.qkv_net(self.layer_norm(w)) + else: + w_heads = self.qkv_net(w) + r_head_k = self.r_net(r) + + w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1) + + klen = w_head_k.size(0) + + w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head + + r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # qlen x n_head x d_head + + #### compute attention score + rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head + AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head + + rr_head_q = w_head_q + self.r_r_bias + BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head + BD = self._rel_shift(BD) + + # [qlen x klen x bsz x n_head] + attn_score = AC + BD + attn_score.mul_(self.scale) + + #### compute attention probability + if attn_mask is not None and torch.sum(attn_mask).item(): + attn_mask = (attn_mask == 1) # Switch to bool + if attn_mask.dim() == 2: + if next(self.parameters()).dtype == torch.float16: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -65000).type_as(attn_score) + else: + attn_score = attn_score.float().masked_fill( + attn_mask[None,:,:,None], -1e30).type_as(attn_score) + elif attn_mask.dim() == 3: + if next(self.parameters()).dtype == torch.float16: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -65000).type_as(attn_score) + else: + attn_score = attn_score.float().masked_fill( + attn_mask[:,:,:,None], -1e30).type_as(attn_score) + + # [qlen x klen x bsz x n_head] + attn_prob = F.softmax(attn_score, dim=1) + attn_prob = self.dropatt(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * head_mask + + #### compute attention vector + attn_vec = torch.einsum('ijbn,jbnd->ibnd', (attn_prob, w_head_v)) + + # [qlen x bsz x n_head x d_head] + attn_vec = attn_vec.contiguous().view( + attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head) + + ##### linear projection + attn_out = self.o_net(attn_vec) + attn_out = self.drop(attn_out) + + if self.pre_lnorm: + ##### residual connection + outputs = [w + attn_out] + else: + ##### residual connection + layer normalization + outputs = [self.layer_norm(w + attn_out)] + + if self.output_attentions: + outputs.append(attn_prob) + + return outputs + + +class RelPartialLearnableDecoderLayer(nn.Module): + def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, + **kwargs): + super(RelPartialLearnableDecoderLayer, self).__init__() + + self.dec_attn = RelPartialLearnableMultiHeadAttn(n_head, d_model, + d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs) + self.pos_ff = PositionwiseFF(d_model, d_inner, dropout, + pre_lnorm=kwargs.get('pre_lnorm'), + 
layer_norm_epsilon=layer_norm_epsilon) + + def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None): + + attn_outputs = self.dec_attn(dec_inp, r, + attn_mask=dec_attn_mask, + mems=mems, head_mask=head_mask) + ff_output = self.pos_ff(attn_outputs[0]) + + outputs = [ff_output] + attn_outputs[1:] + + return outputs + + +class AdaptiveEmbedding(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + sample_softmax=False): + super(AdaptiveEmbedding, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + + self.cutoffs = cutoffs + [n_token] + self.div_val = div_val + self.d_proj = d_proj + + self.emb_scale = d_proj ** 0.5 + + self.cutoff_ends = [0] + self.cutoffs + + self.emb_layers = nn.ModuleList() + self.emb_projs = nn.ParameterList() + if div_val == 1: + self.emb_layers.append( + nn.Embedding(n_token, d_embed, sparse=sample_softmax>0) + ) + if d_proj != d_embed: + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed))) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i)) + self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))) + + def forward(self, inp): + if self.div_val == 1: + embed = self.emb_layers[0](inp) + if self.d_proj != self.d_embed: + embed = F.linear(embed, self.emb_projs[0]) + else: + param = next(self.parameters()) + inp_flat = inp.view(-1) + emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], + dtype=param.dtype, device=param.device) + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1] + + mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx) + indices_i = mask_i.nonzero().squeeze() + + if indices_i.numel() == 0: + continue + + inp_i = inp_flat.index_select(0, indices_i) - l_idx + emb_i = self.emb_layers[i](inp_i) + emb_i = F.linear(emb_i, self.emb_projs[i]) + + emb_flat.index_copy_(0, indices_i, emb_i) + + embed_shape = inp.size() + (self.d_proj,) + embed = emb_flat.view(embed_shape) + + embed.mul_(self.emb_scale) + + return embed + + +class TransfoXLPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = TransfoXLConfig + pretrained_model_archive_map = TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_transfo_xl + base_model_prefix = "transformer" + + def _init_weight(self, weight): + if self.config.init == 'uniform': + nn.init.uniform_(weight, -self.config.init_range, self.config.init_range) + elif self.config.init == 'normal': + nn.init.normal_(weight, 0.0, self.config.init_std) + + def _init_bias(self, bias): + nn.init.constant_(bias, 0.0) + + def _init_weights(self, m): + """ Initialize the weights. 
+ """ + classname = m.__class__.__name__ + if classname.find('Linear') != -1: + if hasattr(m, 'weight') and m.weight is not None: + self._init_weight(m.weight) + if hasattr(m, 'bias') and m.bias is not None: + self._init_bias(m.bias) + elif classname.find('AdaptiveEmbedding') != -1: + if hasattr(m, 'emb_projs'): + for i in range(len(m.emb_projs)): + if m.emb_projs[i] is not None: + nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('Embedding') != -1: + if hasattr(m, 'weight'): + self._init_weight(m.weight) + elif classname.find('ProjectedAdaptiveLogSoftmax') != -1: + if hasattr(m, 'cluster_weight') and m.cluster_weight is not None: + self._init_weight(m.cluster_weight) + if hasattr(m, 'cluster_bias') and m.cluster_bias is not None: + self._init_bias(m.cluster_bias) + if hasattr(m, 'out_projs'): + for i in range(len(m.out_projs)): + if m.out_projs[i] is not None: + nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std) + elif classname.find('LayerNorm') != -1: + if hasattr(m, 'weight'): + nn.init.normal_(m.weight, 1.0, self.config.init_std) + if hasattr(m, 'bias') and m.bias is not None: + self._init_bias(m.bias) + else: + if hasattr(m, 'r_emb'): + self._init_weight(m.r_emb) + if hasattr(m, 'r_w_bias'): + self._init_weight(m.r_w_bias) + if hasattr(m, 'r_r_bias'): + self._init_weight(m.r_r_bias) + if hasattr(m, 'r_bias'): + self._init_bias(m.r_bias) + + +TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in + `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_ + by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov. + It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse + previously computed hidden-states to attend to longer context (memory). + This model also uses adaptive softmax inputs and outputs (tied). + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`: + https://arxiv.org/abs/1901.02860 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +TRANSFO_XL_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + Transformer-XL is a model with relative position embeddings so you can either pad the inputs on + the right or on the left. + Indices can be obtained using :class:`transformers.TransfoXLTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **mems**: (`optional`) + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context. 
+    **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+        Mask to nullify selected heads of the self-attention modules.
+        Mask values selected in ``[0, 1]``:
+        ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states, mems = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(TransfoXLModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+
+        self.n_token = config.n_token
+
+        self.d_embed = config.d_embed
+        self.d_model = config.d_model
+        self.n_head = config.n_head
+        self.d_head = config.d_head
+
+        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+                                          div_val=config.div_val)
+
+        self.drop = nn.Dropout(config.dropout)
+
+        self.n_layer = config.n_layer
+
+        self.tgt_len = config.tgt_len
+        self.mem_len = config.mem_len
+        self.ext_len = config.ext_len
+        self.max_klen = config.tgt_len + config.ext_len + config.mem_len
+
+        self.attn_type = config.attn_type
+
+        if not config.untie_r:
+            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+
+        self.layers = nn.ModuleList()
+        if config.attn_type == 0:  # the default attention
+            for i in range(config.n_layer):
+                self.layers.append(
+                    RelPartialLearnableDecoderLayer(
+                        config.n_head, config.d_model, config.d_head, config.d_inner, config.dropout,
+                        tgt_len=config.tgt_len, ext_len=config.ext_len, mem_len=config.mem_len,
+                        dropatt=config.dropatt, pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if config.untie_r else self.r_w_bias,
+                        r_r_bias=None if config.untie_r else self.r_r_bias,
+                        output_attentions=self.output_attentions,
layer_norm_epsilon=config.layer_norm_epsilon) + ) + else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints + raise NotImplementedError # Removed them to avoid maintaining dead code + + self.same_length = config.same_length + self.clamp_len = config.clamp_len + + if self.attn_type == 0: # default attention + self.pos_emb = PositionalEmbedding(self.d_model) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + return self.word_emb + + def backward_compatible(self): + self.sample_softmax = -1 + + def reset_length(self, tgt_len, ext_len, mem_len): + self.tgt_len = tgt_len + self.mem_len = mem_len + self.ext_len = ext_len + + def _prune_heads(self, heads): + logger.info("Head pruning is not implemented for Transformer-XL model") + pass + + def init_mems(self, data): + if self.mem_len > 0: + mems = [] + param = next(self.parameters()) + for i in range(self.n_layer): + empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model, + dtype=param.dtype, device=param.device) + mems.append(empty) + + return mems + else: + return None + + def _update_mems(self, hids, mems, qlen, mlen): + # does not deal with None + if mems is None: return None + + # mems is not None + assert len(hids) == len(mems), 'len(hids) != len(mems)' + + # There are `mlen + qlen` steps that can be cached into mems + # For the next step, the last `ext_len` of the `qlen` tokens + # will be used as the extended context. Hence, we only cache + # the tokens from `mlen + qlen - self.ext_len - self.mem_len` + # to `mlen + qlen - self.ext_len`. + with torch.no_grad(): + new_mems = [] + end_idx = mlen + max(0, qlen - 0 - self.ext_len) + beg_idx = max(0, end_idx - self.mem_len) + for i in range(len(hids)): + + cat = torch.cat([mems[i], hids[i]], dim=0) + new_mems.append(cat[beg_idx:end_idx].detach()) + + return new_mems + + def forward(self, input_ids, mems=None, head_mask=None): + # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library + # so we transpose here from shape [bsz, len] to shape [len, bsz] + input_ids = input_ids.transpose(0, 1).contiguous() + + if mems is None: + mems = self.init_mems(input_ids) + + qlen, bsz = input_ids.size() + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer) + # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0) + head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1) + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.n_layer + + word_emb = self.word_emb(input_ids) + + mlen = mems[0].size(0) if mems is not None else 0 + klen = mlen + qlen + if self.same_length: + all_ones = word_emb.new_ones((qlen, klen), dtype=torch.uint8) + mask_len = klen - self.mem_len + if mask_len > 0: + mask_shift_len = qlen - mask_len + else: + mask_shift_len = qlen + dec_attn_mask = 
(torch.triu(all_ones, 1+mlen) + + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1 + else: + dec_attn_mask = torch.triu( + word_emb.new_ones((qlen, klen), dtype=torch.uint8), diagonal=1+mlen)[:,:,None] + + hids = [] + attentions = [] + if self.attn_type == 0: # default + pos_seq = torch.arange(klen-1, -1, -1.0, device=word_emb.device, + dtype=word_emb.dtype) + if self.clamp_len > 0: + pos_seq.clamp_(max=self.clamp_len) + pos_emb = self.pos_emb(pos_seq) + + core_out = self.drop(word_emb) + pos_emb = self.drop(pos_emb) + + for i, layer in enumerate(self.layers): + hids.append(core_out) + mems_i = None if mems is None else mems[i] + layer_outputs = layer(core_out, pos_emb, dec_attn_mask=dec_attn_mask, + mems=mems_i, head_mask=head_mask[i]) + core_out = layer_outputs[0] + if self.output_attentions: + attentions.append(layer_outputs[1]) + else: # learnable embeddings and absolute embeddings + raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint + + core_out = self.drop(core_out) + + new_mems = self._update_mems(hids, mems, mlen, qlen) + + # We transpose back here to shape [bsz, len, hidden_dim] + outputs = [core_out.transpose(0, 1).contiguous(), new_mems] + if self.output_hidden_states: + # Add last layer and transpose to library standard shape [bsz, len, hidden_dim] + hids.append(core_out) + hids = list(t.transpose(0, 1).contiguous() for t in hids) + outputs.append(hids) + if self.output_attentions: + # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len] + attentions = list(t.permute(2, 3, 0, 1).contiguous() for t in attentions) + outputs.append(attentions) + + return outputs # last hidden state, new_mems, (all hidden states), (all attentions) + + +@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top + (adaptive softmax with weights tied to the adaptive input embeddings)""", + TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) +class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): + r""" + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + We don't output them when the loss is computed to speedup adaptive softmax decoding. + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context. 
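The default attention mask built just above is an upper-triangular causal mask shifted by the memory length: every query may read all `mlen` cached positions plus the keys at or before its own position. It is easy to verify in isolation:

```python
import torch

qlen, mlen = 3, 2
klen = qlen + mlen

# 1 marks a masked (forbidden) key for each of the qlen queries.
dec_attn_mask = torch.triu(
    torch.ones((qlen, klen), dtype=torch.uint8), diagonal=1 + mlen)
print(dec_attn_mask)
# tensor([[0, 0, 0, 1, 1],
#         [0, 0, 0, 0, 1],
#         [0, 0, 0, 0, 0]], dtype=torch.uint8)
```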
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') + model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + prediction_scores, mems = outputs[:2] + + """ + def __init__(self, config): + super(TransfoXLLMHeadModel, self).__init__(config) + self.transformer = TransfoXLModel(config) + self.sample_softmax = config.sample_softmax + # use sampled softmax + if config.sample_softmax > 0: + self.out_layer = nn.Linear(config.d_model, config.n_token) + self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) + # use adaptive softmax (including standard softmax) + else: + self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, + config.cutoffs, div_val=config.div_val) + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ + Run this to be sure output and input (adaptive) softmax weights are tied + """ + # sampled softmax + if self.sample_softmax > 0: + if self.config.tie_weight: + self.out_layer.weight = self.transformer.word_emb.weight + # adaptive softmax (including standard softmax) + else: + if self.config.tie_weight: + for i in range(len(self.crit.out_layers)): + self._tie_or_clone_weights(self.crit.out_layers[i], + self.transformer.word_emb.emb_layers[i]) + if self.config.tie_projs: + for i, tie_proj in enumerate(self.config.tie_projs): + if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed: + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0] + elif tie_proj and self.config.div_val != 1: + if self.config.torchscript: + self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone()) + else: + self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i] + + def reset_length(self, tgt_len, ext_len, mem_len): + self.transformer.reset_length(tgt_len, ext_len, mem_len) + + def init_mems(self, data): + return self.transformer.init_mems(data) + + def forward(self, input_ids, mems=None, head_mask=None, labels=None): + bsz = input_ids.size(0) + tgt_len = input_ids.size(1) + + transformer_outputs = self.transformer(input_ids, mems=mems, head_mask=head_mask) + + last_hidden = transformer_outputs[0] + pred_hid = last_hidden[:, -tgt_len:] + outputs = transformer_outputs[1:] + if self.sample_softmax > 0 and self.training: + assert self.config.tie_weight + logit = sample_logits(self.transformer.word_emb, self.out_layer.bias, labels, pred_hid, self.sampler) + softmax_output = -F.log_softmax(logit, -1)[:, :, 0] + outputs = [softmax_output] + outputs + if labels is not None: + # TODO: This is not implemented + raise NotImplementedError 
+ else: + softmax_output = self.crit(pred_hid.view(-1, pred_hid.size(-1)), labels) + if labels is None: + softmax_output = softmax_output.view(bsz, tgt_len, -1) + outputs = [softmax_output] + outputs + else: + softmax_output = softmax_output.view(bsz, tgt_len) + outputs = [softmax_output, None] + outputs + + return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl_utilities.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl_utilities.py new file mode 100644 index 0000000..0773d0d --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_transfo_xl_utilities.py @@ -0,0 +1,332 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Utilities for PyTorch Transformer XL model. + Directly adapted from https://github.com/kimiyoung/transformer-xl. +""" + +from collections import defaultdict + +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F + +# CUDA_MAJOR = int(torch.version.cuda.split('.')[0]) +# CUDA_MINOR = int(torch.version.cuda.split('.')[1]) + +class ProjectedAdaptiveLogSoftmax(nn.Module): + def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + keep_order=False): + super(ProjectedAdaptiveLogSoftmax, self).__init__() + + self.n_token = n_token + self.d_embed = d_embed + self.d_proj = d_proj + + self.cutoffs = cutoffs + [n_token] + self.cutoff_ends = [0] + self.cutoffs + self.div_val = div_val + + self.shortlist_size = self.cutoffs[0] + self.n_clusters = len(self.cutoffs) - 1 + self.head_size = self.shortlist_size + self.n_clusters + + if self.n_clusters > 0: + self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed)) + self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters)) + + self.out_layers = nn.ModuleList() + self.out_projs = nn.ParameterList() + + if div_val == 1: + for i in range(len(self.cutoffs)): + if d_proj != d_embed: + self.out_projs.append( + nn.Parameter(torch.FloatTensor(d_proj, d_embed)) + ) + else: + self.out_projs.append(None) + + self.out_layers.append(nn.Linear(d_embed, n_token)) + else: + for i in range(len(self.cutoffs)): + l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1] + d_emb_i = d_embed // (div_val ** i) + + self.out_projs.append( + nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)) + ) + + self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx)) + + self.keep_order = keep_order + + def _compute_logit(self, hidden, weight, bias, proj): + if proj is None: + logit = F.linear(hidden, weight, bias=bias) + else: + # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1: + proj_hid = F.linear(hidden, proj.t().contiguous()) + logit = F.linear(proj_hid, weight, bias=bias) + # else: + # 
logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
+            #     if bias is not None:
+            #         logit = logit + bias
+
+        return logit
+
+    def forward(self, hidden, labels=None, keep_order=False):
+        '''
+            Params:
+                hidden :: [len*bsz x d_proj]
+                labels :: [len*bsz]
+            Return:
+                if labels is None:
+                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
+                else:
+                    out :: [len*bsz] Negative log likelihood
+            We could replace this implementation by the native PyTorch one
+            if theirs had an option to set bias on all clusters in the native one.
+            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
+        '''
+
+        if labels is not None:
+            labels = labels.view(-1)
+            if hidden.size(0) != labels.size(0):
+                raise RuntimeError('Input and labels should have the same size '
+                                   'in the batch dimension.')
+
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+            if labels is not None:
+                out = -F.log_softmax(logit, dim=-1) \
+                        .gather(1, labels.unsqueeze(1)).squeeze(1)
+            else:
+                out = F.log_softmax(logit, dim=-1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat(
+                        [weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat(
+                        [bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+            head_logprob = F.log_softmax(head_logit, dim=1)
+
+            if labels is None:
+                out = hidden.new_empty((head_logit.size(0), self.n_token))
+            else:
+                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
+
+            offset = 0
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                if labels is not None:
+                    mask_i = (labels >= l_idx) & (labels < r_idx)
+                    indices_i = mask_i.nonzero().squeeze()
+
+                    if indices_i.numel() == 0:
+                        continue
+
+                    target_i = labels.index_select(0, indices_i) - l_idx
+                    head_logprob_i = head_logprob.index_select(0, indices_i)
+                    hidden_i = hidden.index_select(0, indices_i)
+                else:
+                    hidden_i = hidden
+
+                if i == 0:
+                    if labels is not None:
+                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    else:
+                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
+                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+                    if labels is not None:
+                        logprob_i = head_logprob_i[:, cluster_prob_idx] \
+                                    + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    else:
+                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+                        out[:, l_idx:r_idx] = logprob_i
+
+                if labels is not None:
+                    if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
+                        out.index_copy_(0, indices_i, -logprob_i)
+                    else:
+                        out[offset:offset+logprob_i.size(0)].copy_(-logprob_i)
+                    offset += logprob_i.size(0)
+
+        return out
+
+    def log_prob(self, hidden):
+        r""" Computes log probabilities for all :math:`n\_classes`
+        From: https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
+        Args:
+            hidden (Tensor): a minibatch of examples
+        Returns:
+            log-probabilities for each class :math:`c`
+            in range :math:`0 <= c <= n\_classes`, where :math:`n\_classes` is a
+            parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor.
+        Shape:
+            - Input: :math:`(N, in\_features)`
+            - Output: :math:`(N, n\_classes)`
+        """
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight,
+                                        self.out_layers[0].bias, self.out_projs[0])
+            return F.log_softmax(logit, dim=-1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat(
+                        [weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat(
+                        [bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+
+            out = hidden.new_empty((head_logit.size(0), self.n_token))
+            head_logprob = F.log_softmax(head_logit, dim=1)
+
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                if i == 0:
+                    out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
+                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
+
+                    logprob_i = head_logprob[:, -i] + tail_logprob_i
+                    out[:, start_idx:stop_idx] = logprob_i  # fill the slice for this cluster's tokens
+
+            return out
+
+
+class LogUniformSampler(object):
+    def __init__(self, range_max, n_sample):
+        """
+        Reference: https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
+            `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
+
+        expected count can be approximated by 1 - (1 - p)^n
+        and we use a numerically stable version -expm1(num_tries * log1p(-p))
+
+        Our implementation fixes num_tries at 2 * n_sample, and the actual #samples will vary from run to run
+        """
+        with torch.no_grad():
+            self.range_max = range_max
+            log_indices = torch.arange(1., range_max+2., 1.).log_()
+            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
+
+            self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()
+
+        self.n_sample = n_sample
+
+    def sample(self, labels):
+        """
+            labels: [b1, b2]
+        Return
+            true_log_probs: [b1, b2]
+            samp_log_probs: [n_sample]
+            neg_samples: [n_sample]
+        """
+
+        # neg_samples = torch.empty(0).long()
+        n_sample = self.n_sample
+        n_tries = 2 * n_sample
+
+        with torch.no_grad():
+            neg_samples = torch.multinomial(self.dist, n_tries, replacement=True).unique()
+            device = labels.device
+            neg_samples = neg_samples.to(device)
+            true_log_probs = self.log_q[labels].to(device)
+            samp_log_probs = self.log_q[neg_samples].to(device)
+            return true_log_probs, samp_log_probs, neg_samples
+
+def sample_logits(embedding, bias, labels, inputs, sampler):
+
""" + embedding: an nn.Embedding layer + bias: [n_vocab] + labels: [b1, b2] + inputs: [b1, b2, n_emb] + sampler: you may use a LogUniformSampler + Return + logits: [b1, b2, 1 + n_sample] + """ + true_log_probs, samp_log_probs, neg_samples = sampler.sample(labels) + n_sample = neg_samples.size(0) + b1, b2 = labels.size(0), labels.size(1) + all_ids = torch.cat([labels.view(-1), neg_samples]) + all_w = embedding(all_ids) + true_w = all_w[: -n_sample].view(b1, b2, -1) + sample_w = all_w[- n_sample:].view(n_sample, -1) + + all_b = bias[all_ids] + true_b = all_b[: -n_sample].view(b1, b2) + sample_b = all_b[- n_sample:] + + hit = (labels[:, :, None] == neg_samples).detach() + + true_logits = torch.einsum('ijk,ijk->ij', + [true_w, inputs]) + true_b - true_log_probs + sample_logits = torch.einsum('lk,ijk->ijl', + [sample_w, inputs]) + sample_b - samp_log_probs + sample_logits.masked_fill_(hit, -1e30) + logits = torch.cat([true_logits[:, :, None], sample_logits], -1) + + return logits diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_utils.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_utils.py new file mode 100644 index 0000000..7e54cb7 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_utils.py @@ -0,0 +1,817 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +from io import open + +import six +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torch.nn import functional as F + +from .configuration_utils import PretrainedConfig +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME + +logger = logging.getLogger(__name__) + + +try: + from torch.nn import Identity +except ImportError: + # Older PyTorch compatibility + class Identity(nn.Module): + r"""A placeholder identity operator that is argument-insensitive. + """ + def __init__(self, *args, **kwargs): + super(Identity, self).__init__() + + def forward(self, input): + return input + +class PreTrainedModel(nn.Module): + r""" Base class for all models. + + :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models + as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture. 
+ - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values. + - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments: + + - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`, + - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`, + - ``path``: a path (string) to the TensorFlow checkpoint. + + - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model. + """ + config_class = None + pretrained_model_archive_map = {} + load_tf_weights = lambda model, config, path: None + base_model_prefix = "" + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedModel, self).__init__() + if not isinstance(config, PretrainedConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. " + "To create a model from a pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + # Save config in model + self.config = config + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None): + """ Build a resized Embedding Module from a provided token Embedding Module. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + + Args: + new_num_tokens: (`optional`) int + New number of tokens in the embedding matrix. + Increasing the size will add newly initialized vectors at the end + Reducing the size will remove vectors from the end + If not provided or None: return the provided token Embedding Module. 
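A standalone sketch of the copy step at the heart of `_get_resized_embeddings` (the method body follows the docstring): allocate a fresh matrix of the new size, re-initialize it, then copy the overlapping rows from the old one.

```python
import torch.nn as nn

old = nn.Embedding(100, 16)
new = nn.Embedding(120, 16)   # grown vocabulary; rows 100..119 keep their fresh init

num_to_copy = min(old.num_embeddings, new.num_embeddings)
new.weight.data[:num_to_copy, :] = old.weight.data[:num_to_copy, :]
```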
+ Return: ``torch.nn.Embeddings`` + Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None + """ + if new_num_tokens is None: + return old_embeddings + + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + if old_num_tokens == new_num_tokens: + return old_embeddings + + # Build new embeddings + new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim) + new_embeddings.to(old_embeddings.weight.device) + + # initialize all new embeddings (in particular added tokens) + self._init_weights(new_embeddings) + + # Copy word embeddings from the previous weights + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :] + + return new_embeddings + + def _tie_or_clone_weights(self, first_module, second_module): + """ Tie or clone module weights depending of weither we are using TorchScript or not + """ + if self.config.torchscript: + first_module.weight = nn.Parameter(second_module.weight.clone()) + else: + first_module.weight = second_module.weight + + if hasattr(first_module, 'bias') and first_module.bias is not None: + first_module.bias.data = torch.nn.functional.pad( + first_module.bias.data, + (0, first_module.weight.shape[0] - first_module.bias.shape[0]), + 'constant', + 0 + ) + + def _tie_or_clone_data(self, first_module, second_module): + """ Tie or clone module weights depending of weither we are using TorchScript or not + """ + + if self.config.torchscript: + first_module.weight.data = nn.Parameter(second_module.weight.data.t().clone()) + else: + first_module.weight.data = second_module.weight.data.t() + if hasattr(first_module, 'bias') and first_module.bias is not None: + first_module.bias.data = torch.nn.functional.pad( + first_module.bias.data, + (0, first_module.weight.shape[0] - first_module.bias.shape[0]), + 'constant', + 0 + ) + + def resize_token_embeddings(self, new_num_tokens=None): + """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. + Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + + new_num_tokens: (`optional`) int: + New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. + If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model. + + Return: ``torch.nn.Embeddings`` + Pointer to the input tokens Embeddings Module of the model + """ + base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed + model_embeds = base_model._resize_token_embeddings(new_num_tokens) + if new_num_tokens is None: + return model_embeds + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + base_model.vocab_size = new_num_tokens + + # Tie weights again if needed + if hasattr(self, 'tie_weights'): + self.tie_weights() + + return model_embeds + + def init_weights(self): + """ Initialize and prunes weights if needed. """ + # Initialize weights + self.apply(self._init_weights) + + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + + def prune_heads(self, heads_to_prune): + """ Prunes heads of the base model. 
+
+        Arguments:
+
+            heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
+            E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+
+        # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
+        for layer, heads in heads_to_prune.items():
+            union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
+            self.config.pruned_heads[layer] = list(union_heads)  # Unfortunately we have to store it as list for JSON
+
+        base_model._prune_heads(heads_to_prune)
+
+    def save_pretrained(self, save_directory):
+        """ Save a model and its configuration file to a directory, so that it
+            can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # Only save the model itself if we are using distributed training
+        model_to_save = self.module if hasattr(self, 'module') else self
+
+        # Save configuration file
+        model_to_save.config.save_pretrained(save_directory)
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
+        torch.save(model_to_save.state_dict(), output_model_file)
+        logger.info("Model weights saved in {}".format(output_model_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r"""Instantiate a pretrained PyTorch model from a pre-trained model configuration.
+
+        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated).
+        To train the model, you should first set it back in training mode with ``model.train()``.
+
+        The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
+        It is up to you to train those weights with a downstream fine-tuning task.
+
+        The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as the ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint to a PyTorch model with the provided conversion scripts and loading the PyTorch model afterwards.
+                - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``).
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration.
+                Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory, or
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it is loaded) and to initialize the model (e.g. ``output_attention=True``). These behave differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done).
+                - If a configuration is not provided, ``kwargs`` will first be passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = BertModel.from_pretrained('./test/saved_model/')  # E.g.
model was saved using `save_pretrained('./test/saved_model/')` + model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json') + model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + config = kwargs.pop('config', None) + state_dict = kwargs.pop('state_dict', None) + cache_dir = kwargs.pop('cache_dir', None) + from_tf = kwargs.pop('from_tf', False) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + output_loading_info = kwargs.pop('output_loading_info', False) + + # Load config + if config is None: + config, model_kwargs = cls.config_class.from_pretrained( + pretrained_model_name_or_path, *model_args, + cache_dir=cache_dir, return_unused_kwargs=True, + force_download=force_download, + **kwargs + ) + else: + model_kwargs = kwargs + + # Load model + if pretrained_model_name_or_path is not None: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path] + elif os.path.isdir(pretrained_model_name_or_path): + if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")): + # Load from a TF 1.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index") + elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + else: + raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], + pretrained_model_name_or_path)) + elif os.path.isfile(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + else: + assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path) + archive_file = pretrained_model_name_or_path + ".index" + + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError: + if pretrained_model_name_or_path in cls.pretrained_model_archive_map: + msg = "Couldn't reach server at '{}' to download pretrained weights.".format( + archive_file) + else: + msg = "Model name '{}' was not found in model name list ({}). " \ + "We assumed '{}' was a path or url to model weight files named one of {} but " \ + "couldn't find any such file at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(cls.pretrained_model_archive_map.keys()), + archive_file, + [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME]) + raise EnvironmentError(msg) + + if resolved_archive_file == archive_file: + logger.info("loading weights file {}".format(archive_file)) + else: + logger.info("loading weights file {} from cache at {}".format( + archive_file, resolved_archive_file)) + else: + resolved_archive_file = None + + # Instantiate model. 
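+        # The model is first built from the (possibly updated) config with randomly
+        # initialized weights; the pretrained weights resolved above are then loaded
+        # on top of it below, either from a TF 1.X/2.0 checkpoint or a PyTorch state_dict.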
+        model = cls(config, *model_args, **model_kwargs)
+
+        if state_dict is None and not from_tf:
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+
+        if from_tf:
+            if resolved_archive_file.endswith('.index'):
+                # Load from a TensorFlow 1.X checkpoint - provided by original authors
+                model = cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
+            else:
+                # Load from our TensorFlow 2.0 checkpoints
+                try:
+                    from transformers import load_tf2_checkpoint_in_pytorch_model
+                    model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
+                except ImportError as e:
+                    logger.error("Loading a TensorFlow model in PyTorch requires both PyTorch and TensorFlow to be installed. Please see "
+                        "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions.")
+                    raise e
+        else:
+            # Convert old format to new format if needed from a PyTorch state_dict
+            old_keys = []
+            new_keys = []
+            for key in state_dict.keys():
+                new_key = None
+                if 'gamma' in key:
+                    new_key = key.replace('gamma', 'weight')
+                if 'beta' in key:
+                    new_key = key.replace('beta', 'bias')
+                if new_key:
+                    old_keys.append(key)
+                    new_keys.append(new_key)
+            for old_key, new_key in zip(old_keys, new_keys):
+                state_dict[new_key] = state_dict.pop(old_key)
+
+            # copy state_dict so _load_from_state_dict can modify it
+            metadata = getattr(state_dict, '_metadata', None)
+            state_dict = state_dict.copy()
+            if metadata is not None:
+                state_dict._metadata = metadata
+
+            def load(module, prefix=''):
+                local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+                module._load_from_state_dict(
+                    state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+                for name, child in module._modules.items():
+                    if child is not None:
+                        load(child, prefix + name + '.')
+
+            # Make sure we are able to load base models as well as derived models (with heads)
+            start_prefix = ''
+            model_to_load = model
+            if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+                start_prefix = cls.base_model_prefix + '.'
+            if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+                model_to_load = getattr(model, cls.base_model_prefix)
+
+            load(model_to_load, prefix=start_prefix)
+            if len(missing_keys) > 0:
+                logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                    model.__class__.__name__, missing_keys))
+            if len(unexpected_keys) > 0:
+                logger.info("Weights from pretrained model not used in {}: {}".format(
+                    model.__class__.__name__, unexpected_keys))
+            if len(error_msgs) > 0:
+                raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                    model.__class__.__name__, "\n\t".join(error_msgs)))
+
+        if hasattr(model, 'tie_weights'):
+            model.tie_weights()  # make sure word embedding weights are still tied
+
+        # Set model in evaluation mode to deactivate Dropout modules by default
+        model.eval()
+
+        if output_loading_info:
+            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
+            return model, loading_info
+
+        return model
+
+
+class Conv1D(nn.Module):
+    def __init__(self, nf, nx):
+        """ Conv1D layer as defined by Radford et al.
for OpenAI GPT (and also used in GPT-2) + Basically works like a Linear layer but the weights are transposed + """ + super(Conv1D, self).__init__() + self.nf = nf + w = torch.empty(nx, nf) + nn.init.normal_(w, std=0.02) + self.weight = nn.Parameter(w) + self.bias = nn.Parameter(torch.zeros(nf)) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(*size_out) + return x + + +class PoolerStartLogits(nn.Module): + """ Compute SQuAD start_logits from sequence hidden states. """ + def __init__(self, config): + super(PoolerStartLogits, self).__init__() + self.dense = nn.Linear(config.hidden_size, 1) + + def forward(self, hidden_states, p_mask=None): + """ Args: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)` + invalid position mask such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + x = self.dense(hidden_states).squeeze(-1) + + if p_mask is not None: + if next(self.parameters()).dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerEndLogits(nn.Module): + """ Compute SQuAD end_logits from sequence hidden states and start token hidden state. + """ + def __init__(self, config): + super(PoolerEndLogits, self).__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense_1 = nn.Linear(config.hidden_size, 1) + + def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None): + """ Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + + **start_states**: ``torch.LongTensor`` of shape identical to hidden_states + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span: + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + """ + assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + if start_positions is not None: + slen, hsz = hidden_states.shape[-2:] + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + + x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x).squeeze(-1) + + if p_mask is not None: + if next(self.parameters()).dtype == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerAnswerClass(nn.Module): + """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. 
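+        Concretely, the start-token hidden state and the classification-token hidden state are concatenated and passed through a small MLP (Linear -> Tanh -> Linear) to produce a single answerability logit per sample.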
""" + def __init__(self, config): + super(PoolerAnswerClass, self).__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) + + def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None): + """ + Args: + One of ``start_states``, ``start_positions`` should be not None. + If both are set, ``start_positions`` overrides ``start_states``. + + **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``. + hidden states of the first tokens for the labeled span. + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. + + note(Original repo): + no dependency on end_feature so that we can obtain one single `cls_logits` + for each sample + """ + hsz = hidden_states.shape[-1] + assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None" + if start_positions is not None: + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + + if cls_index is not None: + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + + x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) + x = self.activation(x) + x = self.dense_1(x).squeeze(-1) + + return x + + +class SQuADHead(nn.Module): + r""" A SQuAD head inspired by XLNet. + + Parameters: + config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model. + + Inputs: + **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)`` + hidden states of sequence tokens + **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the first token for the labeled span. + **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)`` + position of the last token for the labeled span. + **cls_index**: torch.LongTensor of shape ``(batch_size,)`` + position of the CLS token. If None, take the last token. + **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)`` + Whether the question has a possible answer in the paragraph or not. + **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)`` + Mask of invalid position such as query and special symbols (PAD, SEP, CLS) + 1.0 means token should be masked. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses. + **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)`` + Log probabilities for the top config.start_n_top start token possibilities (beam-search). 
+ **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)`` + Indices for the top config.start_n_top start token possibilities (beam-search). + **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). + **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)`` + Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search). + **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided) + ``torch.FloatTensor`` of shape ``(batch_size,)`` + Log probabilities for the ``is_impossible`` label of the answers. + """ + def __init__(self, config): + super(SQuADHead, self).__init__() + self.start_n_top = config.start_n_top + self.end_n_top = config.end_n_top + + self.start_logits = PoolerStartLogits(config) + self.end_logits = PoolerEndLogits(config) + self.answer_class = PoolerAnswerClass(config) + + def forward(self, hidden_states, start_positions=None, end_positions=None, + cls_index=None, is_impossible=None, p_mask=None): + outputs = () + + start_logits = self.start_logits(hidden_states, p_mask=p_mask) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, let's remove the dimension added by batch splitting + for x in (start_positions, end_positions, cls_index, is_impossible): + if x is not None and x.dim() > 1: + x.squeeze_(-1) + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + + loss_fct = CrossEntropyLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) + loss_fct_cls = nn.BCEWithLogitsLoss() + cls_loss = loss_fct_cls(cls_logits, is_impossible) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + outputs = (total_loss,) + outputs + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = 
self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = F.softmax(end_logits, dim=1)  # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1)  # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
+
+        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
+        # or (if labels are provided) (total_loss,)
+        return outputs
+
+
+class SequenceSummary(nn.Module):
+    r""" Compute a single vector summary of a sequence's hidden states according to various possibilities.
+        Args of the config class:
+            summary_type:
+                - 'last' => [default] take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'cls_index' => supply a Tensor of classification token positions (GPT/GPT-2)
+                - 'attn' => not implemented for now; would use multi-head attention
+            summary_use_proj: Add a projection after the vector extraction.
+            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+            summary_activation: 'tanh' => add a tanh activation to the output; any other value => no activation (default).
+            summary_first_dropout: Add a dropout before the projection and activation.
+            summary_last_dropout: Add a dropout after the projection and activation.
+    """
+    def __init__(self, config):
+        super(SequenceSummary, self).__init__()
+
+        # default to 'last' when the config does not define summary_type
+        self.summary_type = config.summary_type if hasattr(config, 'summary_type') else 'last'
+        if self.summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        self.summary = Identity()
+        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        self.activation = Identity()
+        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+            self.activation = nn.Tanh()
+
+        self.first_dropout = Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(self, hidden_states, cls_index=None):
+        """ hidden_states: float Tensor of shape [bsz, ..., seq_len, hidden_size], the hidden states of the last layer.
+            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
+                shape (bsz,) or more generally (bsz, ...) where ``...`` are optional leading dimensions of hidden_states.
+            if summary_type == 'cls_index' and cls_index is None:
+                we take the last token of the sequence as the classification token
+        """
+        if self.summary_type == 'last':
+            output = hidden_states[:, -1]
+        elif self.summary_type == 'first':
+            output = hidden_states[:, 0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == 'cls_index':
+            if cls_index is None:
+                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
+            else:
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dims of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2)  # shape (bsz, XX, hidden_size)
+        elif self.summary_type == 'attn':
+            raise NotImplementedError
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
+def prune_linear_layer(layer, index, dim=0):
+    """ Prune a linear layer (a model parameter) to keep only entries in index.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    index = index.to(layer.weight.device)
+    W = layer.weight.index_select(dim, index).clone().detach()
+    if layer.bias is not None:
+        if dim == 1:
+            b = layer.bias.clone().detach()
+        else:
+            b = layer.bias[index].clone().detach()
+    new_size = list(layer.weight.size())
+    new_size[dim] = len(index)
+    new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
+    new_layer.weight.requires_grad = False
+    new_layer.weight.copy_(W.contiguous())
+    new_layer.weight.requires_grad = True
+    if layer.bias is not None:
+        new_layer.bias.requires_grad = False
+        new_layer.bias.copy_(b.contiguous())
+        new_layer.bias.requires_grad = True
+    return new_layer
+
+
+def prune_conv1d_layer(layer, index, dim=1):
+    """ Prune a Conv1D layer (a model parameter) to keep only entries in index.
+        A Conv1D works like a Linear layer (see e.g. BERT) but the weights are transposed.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    index = index.to(layer.weight.device)
+    W = layer.weight.index_select(dim, index).clone().detach()
+    if dim == 0:
+        b = layer.bias.clone().detach()
+    else:
+        b = layer.bias[index].clone().detach()
+    new_size = list(layer.weight.size())
+    new_size[dim] = len(index)
+    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
+    new_layer.weight.requires_grad = False
+    new_layer.weight.copy_(W.contiguous())
+    new_layer.weight.requires_grad = True
+    new_layer.bias.requires_grad = False
+    new_layer.bias.copy_(b.contiguous())
+    new_layer.bias.requires_grad = True
+    return new_layer
+
+
+def prune_layer(layer, index, dim=None):
+    """ Prune a Conv1D or nn.Linear layer (a model parameter) to keep only entries in index.
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
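+
+    Example (a minimal sketch, illustrative only; assumes ``torch`` and ``torch.nn as nn`` are in scope)::
+
+        layer = nn.Linear(4, 3)             # weight shape (3, 4)
+        index = torch.tensor([0, 2])        # output units to keep
+        pruned = prune_layer(layer, index)  # -> nn.Linear(4, 2), weights copied from rows 0 and 2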
+ """ + if isinstance(layer, nn.Linear): + return prune_linear_layer(layer, index, dim=0 if dim is None else dim) + elif isinstance(layer, Conv1D): + return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) + else: + raise ValueError("Can't prune layer of class {}".format(layer.__class__)) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlm.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlm.py new file mode 100644 index 0000000..b29e721 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlm.py @@ -0,0 +1,886 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch XLM model. +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import math + +import itertools +import numpy as np + +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, SQuADHead +from .configuration_xlm import XLMConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-pytorch_model.bin", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-pytorch_model.bin", +} + + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + out.detach_() + out.requires_grad = False + + +def gelu(x): + """ + GELU activation + https://arxiv.org/abs/1606.08415 + https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14 + 
https://github.com/huggingface/transformers/blob/master/modeling.py
+    """
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Generate hidden states mask, and optionally an attention mask.
+    """
+    bs = lengths.size(0)
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        assert lengths.max().item() <= slen
+        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
+        mask = alen < lengths[:, None]
+
+    # attention mask is the same as mask, or a lower-triangular (causal) attention mask
+    if causal:
+        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
+    else:
+        attn_mask = mask
+
+    # sanity check
+    assert mask.size() == (bs, slen)
+    assert causal is False or attn_mask.size() == (bs, slen, slen)
+
+    return mask, attn_mask
+
+
+class MultiHeadAttention(nn.Module):
+
+    NEW_ID = itertools.count()
+
+    def __init__(self, n_heads, dim, config):
+        super(MultiHeadAttention, self).__init__()
+        self.layer_id = next(MultiHeadAttention.NEW_ID)
+        self.output_attentions = config.output_attentions
+        self.dim = dim
+        self.n_heads = n_heads
+        self.dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = nn.Linear(dim, dim)
+        self.k_lin = nn.Linear(dim, dim)
+        self.v_lin = nn.Linear(dim, dim)
+        self.out_lin = nn.Linear(dim, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_heads, attention_head_size)
+        heads = set(heads) - self.pruned_heads
+        for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, input, mask, kv=None, cache=None, head_mask=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
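+        Expected shapes (see the inline comments below): `input` is `(bs, qlen, dim)` and `mask` is `(bs, klen)` or `(bs, klen, klen)`; the returned tuple holds the context of shape `(bs, qlen, dim)`, followed by the attention weights of shape `(bs, n_heads, qlen, klen)` when ``config.output_attentions`` is set.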
+ """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.size() + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = kv.size(1) + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) + + def shape(x): + """ projection """ + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + outputs = (self.out_lin(context),) + if self.output_attentions: + outputs = outputs + (weights,) + return outputs + + +class TransformerFFN(nn.Module): + + def __init__(self, in_dim, dim_hidden, out_dim, config): + super(TransformerFFN, self).__init__() + self.dropout = config.dropout + self.lin1 = nn.Linear(in_dim, dim_hidden) + self.lin2 = nn.Linear(dim_hidden, out_dim) + self.act = gelu if config.gelu_activation else F.relu + + def forward(self, input): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = F.dropout(x, p=self.dropout, training=self.training) + return x + + +class XLMPreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = XLMConfig + pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = None + base_model_prefix = "transformer" + + def __init__(self, *inputs, **kwargs): + super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) + + def _init_weights(self, module): + """ Initialize the weights. 
""" + if isinstance(module, nn.Embedding): + if self.config is not None and self.config.embed_init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std) + if isinstance(module, nn.Linear): + if self.config is not None and self.config.init_std is not None: + nn.init.normal_(module.weight, mean=0, std=self.config.init_std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, 0.) + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +XLM_START_DOCSTRING = r""" The XLM model was proposed in + `Cross-lingual Language Model Pretraining`_ + by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives: + + - a causal language modeling (CLM) objective (next token prediction), + - a masked language modeling (MLM) objective (Bert-like), or + - a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs) + + Original code can be found `here`_. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Cross-lingual Language Model Pretraining`: + https://arxiv.org/abs/1901.07291 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + .. _`here`: + https://github.com/facebookresearch/XLM + + Parameters: + config (:class:`~transformers.XLMConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XLM_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + + XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`transformers.XLMTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens to be used to indicate the language of each token in the input. + Indices are languages ids which can be obtained from the language names by using two conversion mappings + provided in the configuration of the model (only provided for multilingual models). + More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and + the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). 
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Length of each sentence that can be used to avoid performing attention on padding token indices.
+            You can also use `attention_mask` for the same result (see above); kept here for compatibility.
+            Indices selected in ``[0, ..., input_ids.size(-1)]``.
+        **cache**:
+            dictionary with ``torch.FloatTensor`` that contains pre-computed
+            hidden-states (key and values in the attention blocks) as computed by the model
+            (see `cache` output below). Can be used to speed up sequential decoding.
+            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare XLM Model transformer outputting raw hidden-states without any specific head on top.",
+                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMModel(XLMPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+ + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): #, dico, is_encoder, with_output): + super(XLMModel, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently XLM can only be used as an encoder") + # self.with_output = with_output + self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + self.dropout = config.dropout + self.attention_dropout = config.attention_dropout + assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + + # embeddings + self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim) + if config.sinusoidal_embeddings: + create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = nn.Embedding(self.n_langs, self.dim) + self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index) + self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps) + + # transformer layers + self.attentions = nn.ModuleList() + self.layer_norm1 = nn.ModuleList() + self.ffns = nn.ModuleList() + self.layer_norm2 = nn.ModuleList() + # if self.is_decoder: + # self.layer_norm15 = nn.ModuleList() + # self.encoder_attn = nn.ModuleList() + + for _ in range(self.n_layers): + self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config)) + self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config)) + self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + self.init_weights() + + def _resize_token_embeddings(self, new_num_tokens): + self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens) + return self.embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. 
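+            Delegates to ``MultiHeadAttention.prune_heads`` for each selected layer.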
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.attentions[layer].prune_heads(heads)
+
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None):  # removed: src_enc=None, src_len=None
+        if lengths is None:
+            lengths = (input_ids != self.pad_index).sum(dim=1).long()
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        bs, slen = input_ids.size()
+        assert lengths.size(0) == bs
+        assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # position_ids
+        if position_ids is None:
+            position_ids = input_ids.new((slen,)).long()
+            position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
+        else:
+            assert position_ids.size() == (bs, slen)  # (slen, bs)
+            # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            assert langs.size() == (bs, slen)  # (slen, bs)
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicates we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.n_layers
+
+        # do not recompute cached elements
+        if cache is not None:
+            _slen = slen - cache['slen']
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        tensor = self.embeddings(input_ids)
+        tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
+        if langs is not None and self.use_lang_emb:
+            tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            tensor = tensor + self.embeddings(token_type_ids)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
+        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # transformer layers
+        hidden_states = ()
+        attentions = ()
+        for i in range(self.n_layers):
+            if self.output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
+            attn = attn_outputs[0]
+            if self.output_attentions:
+                attentions = attentions + (attn_outputs[1],)
+            attn = F.dropout(attn, p=self.dropout, training=self.training)
+            tensor = tensor + attn
+            tensor = self.layer_norm1[i](tensor)
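+            # the self-attention block above is post-norm (residual connection, then LayerNorm);
+            # the FFN block below follows the same pattern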
# encoder attention (for decoder only) + # if self.is_decoder and src_enc is not None: + # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) + # attn = F.dropout(attn, p=self.dropout, training=self.training) + # tensor = tensor + attn + # tensor = self.layer_norm15[i](tensor) + + # FFN + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + tensor *= mask.unsqueeze(-1).to(tensor.dtype) + + # Add last hidden state + if self.output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache['slen'] += tensor.size(1) + + # move back sequence length to dimension 0 + # tensor = tensor.transpose(0, 1) + + outputs = (tensor,) + if self.output_hidden_states: + outputs = outputs + (hidden_states,) + if self.output_attentions: + outputs = outputs + (attentions,) + return outputs # outputs, (hidden_states), (attentions) + + +class XLMPredLayer(nn.Module): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + def __init__(self, config): + super(XLMPredLayer, self).__init__() + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + dim = config.emb_dim + + if config.asm is False: + self.proj = nn.Linear(dim, config.n_words, bias=True) + else: + self.proj = nn.AdaptiveLogSoftmaxWithLoss( + in_features=dim, + n_classes=config.n_words, + cutoffs=config.asm_cutoffs, + div_value=config.asm_div_value, + head_bias=True, # default is False + ) + + def forward(self, x, y=None): + """ Compute the loss, and optionally the scores. + """ + outputs = () + if self.asm is False: + scores = self.proj(x) + outputs = (scores,) + outputs + if y is not None: + loss = F.cross_entropy(scores.view(-1, self.n_words), y.view(-1), reduction='elementwise_mean') + outputs = (loss,) + outputs + else: + scores = self.proj.log_prob(x) + outputs = (scores,) + outputs + if y is not None: + _, loss = self.proj(x, y) + outputs = (loss,) + outputs + + return outputs + + +@add_start_docstrings("""The XLM Model transformer with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMWithLMHeadModel(XLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(XLMWithLMHeadModel, self).__init__(config) + self.transformer = XLMModel(config) + self.pred_layer = XLMPredLayer(config) + + self.init_weights() + self.tie_weights() + + def tie_weights(self): + """ Make sure we are sharing the embeddings + """ + self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings) + + def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, + lengths=None, cache=None, head_mask=None, labels=None): + transformer_outputs = self.transformer(input_ids, + attention_mask=attention_mask, + langs=langs, + token_type_ids=token_type_ids, + position_ids=position_ids, + lengths=lengths, + cache=cache, + head_mask=head_mask) + + output = transformer_outputs[0] + outputs = self.pred_layer(output, labels) + outputs = outputs + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs + + +@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMForSequenceClassification(XLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+
+    Examples::
+
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XLMForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = XLMModel(config)
+        self.sequence_summary = SequenceSummary(config)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
+
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs
+
+
+@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMForQuestionAnsweringSimple(XLMPreTrainedModel):
+    r"""
+    **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the start of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+    **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the end of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+    **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+        Span-end scores (before SoftMax).
+    **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+        list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+        of shape ``(batch_size, sequence_length, hidden_size)``:
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+    **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+        list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+
+    """
+    def __init__(self, config):
+        super(XLMForQuestionAnsweringSimple, self).__init__(config)
+
+        self.transformer = XLMModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
+
+        sequence_output = transformer_outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,)
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, splitting adds a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
+        return outputs
+
+
+@add_start_docstrings("""XLM Model with a beam-search span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). 
""", + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class XLMForQuestionAnswering(XLMPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels whether a question has an answer or no answer (SQuAD 2.0) + **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the classification token to use as input for computing plausibility of the answer. + **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+
+    Examples::
+
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss = outputs[0]  # when positions are provided, the first (and only head) output is the total loss
+
+    """
+    def __init__(self, config):
+        super(XLMForQuestionAnswering, self).__init__(config)
+
+        self.transformer = XLMModel(config)
+        self.qa_outputs = SQuADHead(config)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
+                lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
+                is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               langs=langs,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               lengths=lengths,
+                                               cache=cache,
+                                               head_mask=head_mask)
+
+        output = transformer_outputs[0]
+
+        outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions,
+                                  cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask)
+
+        outputs = outputs + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
+        return outputs
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlnet.py b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlnet.py
new file mode 100644
index 0000000..064e9d9
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/modeling_xlnet.py
@@ -0,0 +1,1368 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XLNet model.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import PreTrainedModel, prune_linear_layer, SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits
+from .configuration_xlnet import XLNetConfig
+from .file_utils import add_start_docstrings
+
+
+logger = logging.getLogger(__name__)
+
+XLNET_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin",
+    'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-pytorch_model.bin",
+}
+
+
+def build_tf_xlnet_to_pytorch_map(model, config, tf_weights=None):
+    """ A map of modules from TF to PyTorch.
+        I use a map to keep the PyTorch model as
+        identical to the original TensorFlow model as possible.
+    """
+
+    tf_to_pt_map = {}
+
+    if hasattr(model, 'transformer'):
+        if hasattr(model, 'lm_loss'):
+            # We will load also the output bias
+            tf_to_pt_map['model/lm_loss/bias'] = model.lm_loss.bias
+        if hasattr(model, 'sequence_summary') and 'model/sequnece_summary/summary/kernel' in tf_weights:
+            # We will load also the sequence summary
+            # (note: the "sequnece" misspelling matches the variable names in the original TF checkpoints)
+            tf_to_pt_map['model/sequnece_summary/summary/kernel'] = model.sequence_summary.summary.weight
+            tf_to_pt_map['model/sequnece_summary/summary/bias'] = model.sequence_summary.summary.bias
+        if hasattr(model, 'logits_proj') and config.finetuning_task is not None \
+                and 'model/regression_{}/logit/kernel'.format(config.finetuning_task) in tf_weights:
+            tf_to_pt_map['model/regression_{}/logit/kernel'.format(config.finetuning_task)] = model.logits_proj.weight
+            tf_to_pt_map['model/regression_{}/logit/bias'.format(config.finetuning_task)] = model.logits_proj.bias
+
+        # Now load the rest of the transformer
+        model = model.transformer
+
+    # Embeddings and output
+    tf_to_pt_map.update({'model/transformer/word_embedding/lookup_table': model.word_embedding.weight,
+                         'model/transformer/mask_emb/mask_emb': model.mask_emb})
+
+    # Transformer blocks
+    for i, b in enumerate(model.layer):
+        layer_str = "model/transformer/layer_%d/" % i
+        tf_to_pt_map.update({
+            layer_str + "rel_attn/LayerNorm/gamma": b.rel_attn.layer_norm.weight,
+            layer_str + "rel_attn/LayerNorm/beta": b.rel_attn.layer_norm.bias,
+            layer_str + "rel_attn/o/kernel": b.rel_attn.o,
+            layer_str + "rel_attn/q/kernel": b.rel_attn.q,
+            layer_str + "rel_attn/k/kernel": b.rel_attn.k,
+            layer_str + "rel_attn/r/kernel": b.rel_attn.r,
+            layer_str + "rel_attn/v/kernel": b.rel_attn.v,
+            layer_str + "ff/LayerNorm/gamma": b.ff.layer_norm.weight,
+            layer_str + "ff/LayerNorm/beta": b.ff.layer_norm.bias,
+            layer_str + "ff/layer_1/kernel": b.ff.layer_1.weight,
+            layer_str + "ff/layer_1/bias": b.ff.layer_1.bias,
+            layer_str + "ff/layer_2/kernel": b.ff.layer_2.weight,
+            layer_str + "ff/layer_2/bias": b.ff.layer_2.bias,
+        })
+
+    # Relative positioning biases
+    if config.untie_r:
+        r_r_list = []
+        r_w_list = []
+        r_s_list = []
+        seg_embed_list = []
+        for b in model.layer:
+            r_r_list.append(b.rel_attn.r_r_bias)
+            r_w_list.append(b.rel_attn.r_w_bias)
+            r_s_list.append(b.rel_attn.r_s_bias)
+            seg_embed_list.append(b.rel_attn.seg_embed)
+    else:
+        r_r_list = [model.r_r_bias]
+        r_w_list = [model.r_w_bias]
+        r_s_list = [model.r_s_bias]
+        seg_embed_list = [model.seg_embed]
+    tf_to_pt_map.update({
+        'model/transformer/r_r_bias': r_r_list,
+        'model/transformer/r_w_bias': r_w_list,
+        'model/transformer/r_s_bias': r_s_list,
+        'model/transformer/seg_embed': seg_embed_list})
+    return tf_to_pt_map
+
+def load_tf_weights_in_xlnet(model, config, tf_path):
+    """ Load tf checkpoints in a pytorch model
+    """
+    try:
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch requires TensorFlow to be installed. Please see "
                     "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_xlnet_to_pytorch_map(model, config, tf_weights)
+
+    for name, pointer in tf_to_pt_map.items():
+        logger.info("Importing {}".format(name))
+        if name not in tf_weights:
+            logger.info("{} not in tf pre-trained weights, skipping".format(name))
+            continue
+        array = tf_weights[name]
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+        # which are not required for using the pretrained model
+        if 'kernel' in name and ('ff' in name or 'summary' in name or 'logit' in name):
+            logger.info("Transposing")
+            array = np.transpose(array)
+        if isinstance(pointer, list):
+            # Here we will split the TF weights
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                logger.info("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            logger.info("Initialize PyTorch weight {}".format(name))
+            pointer.data = torch.from_numpy(array)
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + '/Adam', None)
+        tf_weights.pop(name + '/Adam_1', None)
+
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    return model
+
+
+def gelu(x):
+    """ Implementation of the gelu activation function.
+ XLNet is using OpenAI GPT's gelu (not exactly the same as BERT) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + return x * cdf + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm +except (ImportError, AttributeError) as e: + logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .") + from torch.nn import LayerNorm as XLNetLayerNorm + +class XLNetRelativeAttention(nn.Module): + def __init__(self, config): + super(XLNetRelativeAttention, self).__init__() + self.output_attentions = config.output_attentions + + if config.d_model % config.n_head != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.d_model, config.n_head)) + + self.n_head = config.n_head + self.d_head = config.d_head + self.d_model = config.d_model + self.scale = 1 / (config.d_head ** 0.5) + + self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head)) + + self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head)) + self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head)) + + self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.dropout) + + def prune_heads(self, heads): + raise NotImplementedError + + @staticmethod + def rel_shift(x, klen=-1): + """perform relative shift to form the relative attention score.""" + x_size = x.shape + + x = x.reshape(x_size[1], x_size[0], x_size[2], x_size[3]) + x = x[1:, ...] + x = x.reshape(x_size[0], x_size[1] - 1, x_size[2], x_size[3]) + # x = x[:, 0:klen, :, :] + x = torch.index_select(x, 1, torch.arange(klen, device=x.device, dtype=torch.long)) + + return x + + @staticmethod + def rel_shift_bnij(x, klen=-1): + x_size = x.shape + + x = x.reshape(x_size[0], x_size[1], x_size[3], x_size[2]) + x = x[:, :, 1:, :] + x = x.reshape(x_size[0], x_size[1], x_size[2], x_size[3]-1) + # Note: the tensor-slice form was faster in my testing than torch.index_select + # However, tracing doesn't like the nature of the slice, and if klen changes + # during the run then it'll fail, whereas index_select will be fine. 
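+        # Worked example (single batch/head): the reshape / drop-first-row / reshape sequence
+        # realigns the scores so that out[b, n, i, k] == x[b, n, i, k + qlen - i], i.e. row i
+        # is shifted left by (qlen - i) columns; keeping only the first klen columns then
+        # leaves exactly one score per absolute key position for every query row.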
+ x = torch.index_select(x, 3, torch.arange(klen, device=x.device, dtype=torch.long)) + # x = x[:, :, :, :klen] + + return x + + def rel_attn_core(self, q_head, k_head_h, v_head_h, k_head_r, seg_mat=None, attn_mask=None, head_mask=None): + """Core relative positional attention operations.""" + + # content based attention score + ac = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_w_bias, k_head_h) + + # position based attention score + bd = torch.einsum('ibnd,jbnd->bnij', q_head + self.r_r_bias, k_head_r) + bd = self.rel_shift_bnij(bd, klen=ac.shape[3]) + + # segment based attention score + if seg_mat is None: + ef = 0 + else: + ef = torch.einsum('ibnd,snd->ibns', q_head + self.r_s_bias, self.seg_embed) + ef = torch.einsum('ijbs,ibns->bnij', seg_mat, ef) + + # merge attention scores and perform masking + attn_score = (ac + bd + ef) * self.scale + if attn_mask is not None: + # attn_score = attn_score * (1 - attn_mask) - 1e30 * attn_mask + if attn_mask.dtype == torch.float16: + attn_score = attn_score - 65500 * torch.einsum('ijbn->bnij', attn_mask) + else: + attn_score = attn_score - 1e30 * torch.einsum('ijbn->bnij', attn_mask) + + # attention probability + attn_prob = F.softmax(attn_score, dim=3) + attn_prob = self.dropout(attn_prob) + + # Mask heads if we want to + if head_mask is not None: + attn_prob = attn_prob * torch.einsum('ijbn->bnij', head_mask) + + # attention output + attn_vec = torch.einsum('bnij,jbnd->ibnd', attn_prob, v_head_h) + + if self.output_attentions: + return attn_vec, torch.einsum('bnij->ijbn', attn_prob) + + return attn_vec + + def post_attention(self, h, attn_vec, residual=True): + """Post-attention processing.""" + # post-attention projection (back to `d_model`) + attn_out = torch.einsum('ibnd,hnd->ibh', attn_vec, self.o) + + attn_out = self.dropout(attn_out) + if residual: + attn_out = attn_out + h + output = self.layer_norm(attn_out) + + return output + + def forward(self, h, g, + attn_mask_h, attn_mask_g, + r, seg_mat, + mems=None, target_mapping=None, head_mask=None): + if g is not None: + ###### Two-stream attention with relative positional encoding. 
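+            # h ("content") stream: each position attends with access to its own content.
+            # g ("query") stream: a target position attends using only its position information
+            # plus the other tokens' content, never its own content. Both streams share the
+            # key/value projections computed from the h stream below.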
+ # content based attention score + if mems is not None and mems.dim() > 1: + cat = torch.cat([mems, h], dim=0) + else: + cat = h + + # content-based key head + k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) + + # content-based value head + v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + + # position-based key head + k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + + ##### h-stream + # content-stream query head + q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) + + # core attention ops + attn_vec_h = self.rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + + if self.output_attentions: + attn_vec_h, attn_prob_h = attn_vec_h + + # post processing + output_h = self.post_attention(h, attn_vec_h) + + ##### g-stream + # query-stream query head + q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q) + + # core attention ops + if target_mapping is not None: + q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + attn_vec_g = self.rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + else: + attn_vec_g = self.rel_attn_core( + q_head_g, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_g, head_mask=head_mask) + + if self.output_attentions: + attn_vec_g, attn_prob_g = attn_vec_g + + # post processing + output_g = self.post_attention(g, attn_vec_g) + + if self.output_attentions: + attn_prob = attn_prob_h, attn_prob_g + + else: + ###### Multi-head attention with relative positional encoding + if mems is not None and mems.dim() > 1: + cat = torch.cat([mems, h], dim=0) + else: + cat = h + + # content heads + q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) + k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) + v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + + # positional heads + k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + + # core attention ops + attn_vec = self.rel_attn_core( + q_head_h, k_head_h, v_head_h, k_head_r, seg_mat=seg_mat, attn_mask=attn_mask_h, head_mask=head_mask) + + if self.output_attentions: + attn_vec, attn_prob = attn_vec + + # post processing + output_h = self.post_attention(h, attn_vec) + output_g = None + + outputs = (output_h, output_g) + if self.output_attentions: + outputs = outputs + (attn_prob,) + return outputs + +class XLNetFeedForward(nn.Module): + def __init__(self, config): + super(XLNetFeedForward, self).__init__() + self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps) + self.layer_1 = nn.Linear(config.d_model, config.d_inner) + self.layer_2 = nn.Linear(config.d_inner, config.d_model) + self.dropout = nn.Dropout(config.dropout) + if isinstance(config.ff_activation, str) or \ + (sys.version_info[0] == 2 and isinstance(config.ff_activation, unicode)): + self.activation_function = ACT2FN[config.ff_activation] + else: + self.activation_function = config.ff_activation + + def forward(self, inp): + output = inp + output = self.layer_1(output) + output = self.activation_function(output) + output = self.dropout(output) + output = self.layer_2(output) + output = self.dropout(output) + output = self.layer_norm(output + inp) + return output + +class XLNetLayer(nn.Module): + def __init__(self, config): + super(XLNetLayer, self).__init__() + self.rel_attn = XLNetRelativeAttention(config) + self.ff = 
XLNetFeedForward(config)
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, output_h, output_g,
+                attn_mask_h, attn_mask_g,
+                r, seg_mat, mems=None, target_mapping=None, head_mask=None):
+        outputs = self.rel_attn(output_h, output_g, attn_mask_h, attn_mask_g,
+                                r, seg_mat, mems=mems, target_mapping=target_mapping,
+                                head_mask=head_mask)
+        output_h, output_g = outputs[:2]
+
+        if output_g is not None:
+            output_g = self.ff(output_g)
+        output_h = self.ff(output_h)
+
+        outputs = (output_h, output_g) + outputs[2:]  # Add the attentions again if they are there
+        return outputs
+
+
+class XLNetPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = XLNetConfig
+    pretrained_model_archive_map = XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_xlnet
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, XLNetLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, XLNetRelativeAttention):
+            for param in [module.q, module.k, module.v, module.o, module.r,
+                          module.r_r_bias, module.r_s_bias, module.r_w_bias,
+                          module.seg_embed]:
+                param.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, XLNetModel):
+            module.mask_emb.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+
+XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
+    `XLNet: Generalized Autoregressive Pretraining for Language Understanding`_
+    by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+    XLNet is an extension of the Transformer-XL model pre-trained using an autoregressive method
+    to learn bidirectional contexts by maximizing the expected likelihood over all permutations
+    of the input sequence factorization order.
+
+    The specific attention pattern can be controlled at training and test time using the `perm_mask` input.
+
+    Due to the difficulty of training a fully auto-regressive model over the various factorization orders,
+    XLNet is pretrained using only a sub-set of the output tokens as targets, which are selected
+    with the `target_mapping` input.
+
+    To use XLNet for sequential decoding (i.e. not in the fully bi-directional setting), use the `perm_mask` and
+    `target_mapping` inputs to control the attention span and outputs (see examples in `examples/run_generation.py`)
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`XLNet: Generalized Autoregressive Pretraining for Language Understanding`:
+        http://arxiv.org/abs/1906.08237
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLNET_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            XLNet is a model with relative position embeddings so you can either pad the inputs on
+            the right or on the left.
+            Indices can be obtained using :class:`transformers.XLNetTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The type indices in XLNet are NOT selected in the vocabulary, they can be arbitrary numbers and
+            the important thing is that they should be different for tokens which belong to different segments.
+            The model will compute relative segment differences from the given type indices:
+            0 if the segment ids of two tokens are the same, 1 if not.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **mems**: (`optional`)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+            To activate mems you need to set up config.mem_len to a positive value which will be the max number of tokens in
+            the memory output by the model. E.g. `model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)` will
+            instantiate a model which can use up to 1024 tokens of memory (in addition to the input itself).
+        **perm_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, sequence_length)``:
+            Mask to indicate the attention pattern for each input token with values selected in ``[0, 1]``:
+            If ``perm_mask[k, i, j] = 0``, i attends to j in batch k;
+            if ``perm_mask[k, i, j] = 1``, i does not attend to j in batch k.
+            If None, each token attends to all the others (full bidirectional attention).
+            Only used during pretraining (to define factorization order) or for sequential decoding (generation).
+        **target_mapping**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_predict, sequence_length)``:
+            Mask to indicate the output tokens to use.
+            If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
+            Only used during pretraining for partial prediction or for sequential decoding (generation).
+        **input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
+            Kept for compatibility with the original code base.
+            You can only use one of `input_mask` and `attention_mask`.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare XLNet Model transformer outputting raw hidden-states without any specific head on top.",
+                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetModel(XLNetPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
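+
+    A minimal sketch of reusing ``mems`` across two segments (assuming ``mem_len`` is set at load
+    time as described for the `mems` input above; ``ids_a`` and ``ids_b`` are placeholder tensors
+    for two consecutive input segments)::
+
+        model = XLNetModel.from_pretrained('xlnet-base-cased', mem_len=1024)
+        out_a = model(ids_a)                 # out_a[1] holds the new mems
+        out_b = model(ids_b, mems=out_a[1])  # second segment attends to cached states of the first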
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetModel.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(XLNetModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.output_past = config.output_past
+
+        self.mem_len = config.mem_len
+        self.reuse_len = config.reuse_len
+        self.d_model = config.d_model
+        self.same_length = config.same_length
+        self.attn_type = config.attn_type
+        self.bi_data = config.bi_data
+        self.clamp_len = config.clamp_len
+        self.n_layer = config.n_layer
+
+        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
+        self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
+        self.dropout = nn.Dropout(config.dropout)
+
+        self.init_weights()
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
+        return self.word_embedding
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError
+
+    def create_mask(self, qlen, mlen):
+        """
+        Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
+
+        Args:
+            qlen: length of the current query segment (number of query tokens)
+            mlen: length of the cached memory prepended to the current segment
+
+        ::
+
+                  same_length=False:      same_length=True:
+                  <mlen > <  qlen >      <mlen > <  qlen >
+               ^ [0 0 0 0 0 1 1 1 1]    [0 0 0 0 0 1 1 1 1]
+                 [0 0 0 0 0 0 1 1 1]    [1 0 0 0 0 0 1 1 1]
+          qlen   [0 0 0 0 0 0 0 1 1]    [1 1 0 0 0 0 0 1 1]
+                 [0 0 0 0 0 0 0 0 1]    [1 1 1 0 0 0 0 0 1]
+               v [0 0 0 0 0 0 0 0 0]    [1 1 1 1 0 0 0 0 0]
+
+        """
+        attn_mask = torch.ones([qlen, qlen])
+        mask_up = torch.triu(attn_mask, diagonal=1)
+        attn_mask_pad = torch.zeros([qlen, mlen])
+        ret = torch.cat([attn_mask_pad, mask_up], dim=1)
+        if self.same_length:
+            mask_lo = torch.tril(attn_mask, diagonal=-1)
+            ret = torch.cat([ret[:, :qlen] + mask_lo, ret[:, qlen:]], dim=1)
+
+        ret = ret.to(next(self.parameters()))
+        return ret
+
+    def cache_mem(self, curr_out, prev_mem):
+        """cache hidden states into memory."""
+        if self.reuse_len is not None and self.reuse_len > 0:
+            curr_out = curr_out[:self.reuse_len]
+
+        if prev_mem is None:
+            new_mem = curr_out[-self.mem_len:]
+        else:
+            new_mem = torch.cat([prev_mem, curr_out], dim=0)[-self.mem_len:]
+
+        return new_mem.detach()
+
+    @staticmethod
+    def positional_embedding(pos_seq, inv_freq, bsz=None):
+        sinusoid_inp = torch.einsum('i,d->id', pos_seq, inv_freq)
+        pos_emb = torch.cat([torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)], dim=-1)
+        pos_emb = pos_emb[:, None, :]
+
+        if bsz is not None:
+            pos_emb = pos_emb.expand(-1, bsz, -1)
+
+        return pos_emb
+
+    def relative_positional_encoding(self, qlen, klen, bsz=None):
+        """create relative positional encoding."""
+        freq_seq = torch.arange(0, self.d_model, 2.0, dtype=torch.float)
+        inv_freq = 1 / torch.pow(10000, (freq_seq / self.d_model))
+
+        if self.attn_type == 'bi':
+            # beg, end = klen - 1, -qlen
+            beg, end = klen, -qlen
+        elif self.attn_type == 'uni':
+            # beg, end = klen - 1, -1
+            beg, end = klen, -1
+        else:
+            raise ValueError('Unknown `attn_type` {}.'.format(self.attn_type))
+
+        if self.bi_data:
+            fwd_pos_seq = torch.arange(beg, end, -1.0, dtype=torch.float)
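+            # with bi_data, a mirrored backward stream of positions is built next to the forward one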
+            bwd_pos_seq = torch.arange(-beg, -end, 1.0, dtype=torch.float)
+
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+                bwd_pos_seq = bwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+
+            if bsz is not None:
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz//2)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq, bsz//2)
+            else:
+                fwd_pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq)
+                bwd_pos_emb = self.positional_embedding(bwd_pos_seq, inv_freq)
+
+            pos_emb = torch.cat([fwd_pos_emb, bwd_pos_emb], dim=1)
+        else:
+            fwd_pos_seq = torch.arange(beg, end, -1.0)
+            if self.clamp_len > 0:
+                fwd_pos_seq = fwd_pos_seq.clamp(-self.clamp_len, self.clamp_len)
+            pos_emb = self.positional_embedding(fwd_pos_seq, inv_freq, bsz)
+
+        pos_emb = pos_emb.to(next(self.parameters()))
+        return pos_emb
+
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None):
+        # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end,
+        # but we want a unified interface in the library with the batch size on the first dimension,
+        # so here we move the first dimension (batch) to the end
+        input_ids = input_ids.transpose(0, 1).contiguous()
+        token_type_ids = token_type_ids.transpose(0, 1).contiguous() if token_type_ids is not None else None
+        input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
+        attention_mask = attention_mask.transpose(0, 1).contiguous() if attention_mask is not None else None
+        perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
+        target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
+
+        qlen, bsz = input_ids.shape[0], input_ids.shape[1]
+        mlen = mems[0].shape[0] if mems is not None and mems[0] is not None else 0
+        klen = mlen + qlen
+
+        dtype_float = next(self.parameters()).dtype
+        device = next(self.parameters()).device
+
+        ##### Attention mask
+        # causal attention mask
+        if self.attn_type == 'uni':
+            attn_mask = self.create_mask(qlen, mlen)
+            attn_mask = attn_mask[:, :, None, None]
+        elif self.attn_type == 'bi':
+            attn_mask = None
+        else:
+            raise ValueError('Unsupported attention type: {}'.format(self.attn_type))
+
+        # data mask: input mask & perm mask
+        assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \
+            "or attention_mask (uses 0 for padding, added for compatibility with BERT). Please choose one."
+        if input_mask is None and attention_mask is not None:
+            input_mask = 1.0 - attention_mask
+        if input_mask is not None and perm_mask is not None:
+            data_mask = input_mask[None] + perm_mask
+        elif input_mask is not None and perm_mask is None:
+            data_mask = input_mask[None]
+        elif input_mask is None and perm_mask is not None:
+            data_mask = perm_mask
+        else:
+            data_mask = None
+
+        if data_mask is not None:
+            # all mems can be attended to
+            if mlen > 0:
+                mems_mask = torch.zeros([data_mask.shape[0], mlen, bsz]).to(data_mask)
+                data_mask = torch.cat([mems_mask, data_mask], dim=1)
+            if attn_mask is None:
+                attn_mask = data_mask[:, :, :, None]
+            else:
+                attn_mask += data_mask[:, :, :, None]
+
+        if attn_mask is not None:
+            attn_mask = (attn_mask > 0).to(dtype_float)
+
+        if attn_mask is not None:
+            non_tgt_mask = -torch.eye(qlen).to(attn_mask)
+            if mlen > 0:
+                non_tgt_mask = torch.cat([torch.zeros([qlen, mlen]).to(attn_mask), non_tgt_mask], dim=-1)
+            non_tgt_mask = ((attn_mask + non_tgt_mask[:, :, None, None]) > 0).to(attn_mask)
+        else:
+            non_tgt_mask = None
+
+        ##### Word embeddings and prepare h & g hidden states
+        word_emb_k = self.word_embedding(input_ids)
+        output_h = self.dropout(word_emb_k)
+        if target_mapping is not None:
+            word_emb_q = self.mask_emb.expand(target_mapping.shape[0], bsz, -1)
+            # else:  # We removed the inp_q input which was same as target mapping
+            #     inp_q_ext = inp_q[:, :, None]
+            #     word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k
+            output_g = self.dropout(word_emb_q)
+        else:
+            output_g = None
+
+        ##### Segment embedding
+        if token_type_ids is not None:
+            # Convert `token_type_ids` to one-hot `seg_mat`
+            if mlen > 0:
+                mem_pad = torch.zeros([mlen, bsz], dtype=torch.long, device=device)
+                cat_ids = torch.cat([mem_pad, token_type_ids], dim=0)
+            else:
+                cat_ids = token_type_ids
+
+            # `1` indicates not in the same segment [qlen x klen x bsz]
+            seg_mat = (token_type_ids[:, None] != cat_ids[None, :]).long()
+            seg_mat = F.one_hot(seg_mat, num_classes=2).to(dtype_float)
+        else:
+            seg_mat = None
+
+        ##### Positional encoding
+        pos_emb = self.relative_positional_encoding(qlen, klen, bsz=bsz)
+        pos_emb = self.dropout(pos_emb)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * self.n_layer
+
+        new_mems = ()
+        if mems is None:
+            mems = [None] * len(self.layer)
+
+        attentions = []
+        hidden_states = []
+        for i, layer_module in enumerate(self.layer):
+            if self.mem_len is not None and self.mem_len > 0 and self.output_past:
+                # cache new mems
+                new_mems = new_mems + (self.cache_mem(output_h, mems[i]),)
+            if self.output_hidden_states:
+                hidden_states.append((output_h, output_g) if output_g is not None else output_h)
+
+            outputs = layer_module(output_h, output_g, attn_mask_h=non_tgt_mask, attn_mask_g=attn_mask,
+                                   r=pos_emb, seg_mat=seg_mat, mems=mems[i], target_mapping=target_mapping,
                                   head_mask=head_mask[i])
+            output_h, output_g = outputs[:2]
+            if self.output_attentions:
+                attentions.append(outputs[2])
+
+        # Add last hidden state
+        if self.output_hidden_states:
+            hidden_states.append((output_h, output_g) if output_g is not None else output_h)
+
+        output = self.dropout(output_g if output_g is not None else output_h)
+
+        # Prepare outputs, we transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
+        outputs = (output.permute(1, 0, 2).contiguous(),)
+
+        if self.mem_len is not None and self.mem_len > 0 and self.output_past:
+            outputs = outputs + (new_mems,)
+
+        if self.output_hidden_states:
+            if output_g is not None:
+                hidden_states = tuple(h.permute(1, 0, 2).contiguous() for hs in hidden_states for h in hs)
+            else:
+                hidden_states = tuple(hs.permute(1, 0, 2).contiguous() for hs in hidden_states)
+            outputs = outputs + (hidden_states,)
+        if self.output_attentions:
+            attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+            outputs = outputs + (attentions,)
+
+        return outputs  # outputs, (new_mems), (hidden_states), (attentions)
+
+
+@add_start_docstrings("""XLNet Model with a language modeling head on top
+    (linear layer with weights tied to the input embeddings). """,
+                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetLMHeadModel(XLNetPreTrainedModel):
+    r"""
+    **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        Labels for language modeling.
+        Note that the labels are **not** shifted inside the model: they are matched one-to-one with
+        the prediction scores (use ``target_mapping``/``perm_mask`` to select the prediction targets).
+        Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+        All labels set to ``-1`` are ignored (masked), the loss is only
+        computed for labels in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+        # We show how to set up inputs to predict a next token using a bi-directional context.
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ")).unsqueeze(0)  # We will predict the masked token
+        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+
+    """
+    def __init__(self, config):
+        super(XLNetLMHeadModel, self).__init__(config)
+        self.attn_type = config.attn_type
+        self.same_length = config.same_length
+
+        self.transformer = XLNetModel(config)
+        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the embeddings
+        """
+        self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
+
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
+
+        logits = self.lm_loss(transformer_outputs[0])
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are there
+
+        if labels is not None:
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(logits.view(-1, logits.size(-1)),
+                            labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+
+
+@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetForSequenceClassification(XLNetPreTrainedModel):
+    r"""
+    **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for computing the sequence classification/regression loss.
+        Indices should be in ``[0, ..., config.num_labels - 1]``.
+        If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+        If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XLNetForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = XLNetModel(config)
+        self.sequence_summary = SequenceSummary(config)
+        self.logits_proj = nn.Linear(config.d_model, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None, labels=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
+        output = transformer_outputs[0]
+
+        output = self.sequence_summary(output)
+        logits = self.logits_proj(output)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are there
+
+        if labels is not None:
+            if self.num_labels == 1:
+                # We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+
+
+@add_start_docstrings("""XLNet Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RACE/SWAG tasks. """,
+                      XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetForMultipleChoice(XLNetPreTrainedModel):
+    r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
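+
+    Internally the ``(batch_size, num_choices, sequence_length)`` inputs are flattened to
+    ``(batch_size * num_choices, sequence_length)`` before running the transformer, and the single
+    logit produced per choice is reshaped back to ``(batch_size, num_choices)`` (see ``forward`` below).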
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
+        model = XLNetForMultipleChoice.from_pretrained('xlnet-base-cased')
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XLNetForMultipleChoice, self).__init__(config)
+
+        self.transformer = XLNetModel(config)
+        self.sequence_summary = SequenceSummary(config)
+        self.logits_proj = nn.Linear(config.d_model, 1)
+
+        self.init_weights()
+
+    def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
+                mems=None, perm_mask=None, target_mapping=None,
+                labels=None, head_mask=None):
+        num_choices = input_ids.shape[1]
+
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1))
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_input_mask = input_mask.view(-1, input_mask.size(-1)) if input_mask is not None else None
+
+        transformer_outputs = self.transformer(flat_input_ids, token_type_ids=flat_token_type_ids,
+                                               input_mask=flat_input_mask, attention_mask=flat_attention_mask,
+                                               mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
+                                               head_mask=head_mask)
+
+        output = transformer_outputs[0]
+
+        output = self.sequence_summary(output)
+        logits = self.logits_proj(output)
+        reshaped_logits = logits.view(-1, num_choices)
+        outputs = (reshaped_logits,) + transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are present
+
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # return (loss), logits, (mems), (hidden states), (attentions)
+
+
+@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel):
+    r"""
+    **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the start of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+    **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the end of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss as the sum of start token and end token classification losses.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:3]
+
+    """
+    def __init__(self, config):
+        super(XLNetForQuestionAnsweringSimple, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = XLNetModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None):
+
+        outputs = self.transformer(input_ids,
+                                   attention_mask=attention_mask,
+                                   mems=mems,
+                                   perm_mask=perm_mask,
+                                   target_mapping=target_mapping,
+                                   token_type_ids=token_type_ids,
+                                   input_mask=input_mask,
+                                   head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, splitting adds a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions.clamp_(0, ignored_index)
+            end_positions.clamp_(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (mems), (hidden_states), (attentions)
+
+
+@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING)
+class XLNetForQuestionAnswering(XLNetPreTrainedModel):
+    r"""
+    **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the start of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+    **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the end of the labelled span for computing the token classification loss.
+        Positions are clamped to the length of the sequence (`sequence_length`).
+        Positions outside of the sequence are not taken into account for computing the loss.
+    **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels whether a question has an answer or no answer (SQuAD 2.0).
+    **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+        Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+    **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+        Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...).
+        1.0 means the token should be masked, 0.0 means the token is not masked.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
+        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
+            Log probabilities for the top config.start_n_top start token possibilities (beam-search).
+        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
+            Indices for the top config.start_n_top start token possibilities (beam-search).
+        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size,)``
+            Log probabilities for the ``is_impossible`` label of the answers.
+        **mems**: (`optional`, returned when ``config.mem_len > 0``)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context.
+            See details in the docstring of the `mems` input above.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForQuestionAnswering.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss = outputs[0]  # when both positions are provided, only the loss (plus mems etc.) is returned
+
+    """
+    def __init__(self, config):
+        super(XLNetForQuestionAnswering, self).__init__(config)
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.transformer = XLNetModel(config)
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
+                token_type_ids=None, input_mask=None, head_mask=None,
+                start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None):
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               mems=mems,
+                                               perm_mask=perm_mask,
+                                               target_mapping=target_mapping,
+                                               token_type_ids=token_type_ids,
+                                               input_mask=input_mask,
+                                               head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        outputs = transformer_outputs[1:]  # Keep mems, hidden states, attentions if they are present
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting
+            for x in (start_positions, end_positions, cls_index, is_impossible):
+                if x is not None and x.dim() > 1:
+                    x.squeeze_(-1)
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                cls_loss = loss_fct_cls(cls_logits, is_impossible)
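+                # BCEWithLogitsLoss compares the raw cls_logits against 0./1. targets,
+                # so is_impossible is expected to be a float tensor at this point.
+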
+ # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + outputs = (total_loss,) + outputs + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) # get the representation of START as weighted sum of hidden states + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) # Shape (batch size,): one single `cls_logits` for each sample + + outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs + + # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits + # or (if labels are provided) (total_loss,) + return outputs diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/optimization.py b/baselines/models_pytorch/classifier_pytorch/transformers/optimization.py new file mode 100644 index 0000000..39dc7a5 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/optimization.py @@ -0,0 +1,189 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import logging +import math + +import torch +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LambdaLR + +logger = logging.getLogger(__name__) + +class ConstantLRSchedule(LambdaLR): + """ Constant learning rate schedule. + """ + def __init__(self, optimizer, last_epoch=-1): + super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) + + +class WarmupConstantSchedule(LambdaLR): + """ Linear warmup and then constant. 
+    Linearly increases the learning rate multiplier from 0. to 1. over `warmup_steps` training steps.
+    Keeps the multiplier equal to 1. after warmup_steps.
+    """
+    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
+        self.warmup_steps = warmup_steps
+        super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
+
+    def lr_lambda(self, step):
+        if step < self.warmup_steps:
+            return float(step) / float(max(1.0, self.warmup_steps))
+        return 1.
+
+
+class WarmupLinearSchedule(LambdaLR):
+    """ Linear warmup and then linear decay.
+    Linearly increases the learning rate multiplier from 0. to 1. over `warmup_steps` training steps.
+    Linearly decreases the multiplier from 1. to 0. over the remaining `t_total - warmup_steps` steps.
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
+        self.warmup_steps = warmup_steps
+        self.t_total = t_total
+        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
+
+    def lr_lambda(self, step):
+        if step < self.warmup_steps:
+            return float(step) / float(max(1, self.warmup_steps))
+        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
+
+
+class WarmupCosineSchedule(LambdaLR):
+    """ Linear warmup and then cosine decay.
+    Linearly increases the learning rate multiplier from 0. to 1. over `warmup_steps` training steps.
+    Decreases the multiplier from 1. to 0. over the remaining `t_total - warmup_steps` steps following a cosine curve.
+    With a non-default `cycles` (default=0.5), the multiplier instead follows `cycles` full cosine periods after warmup.
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
+        self.warmup_steps = warmup_steps
+        self.t_total = t_total
+        self.cycles = cycles
+        super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
+
+    def lr_lambda(self, step):
+        if step < self.warmup_steps:
+            return float(step) / float(max(1.0, self.warmup_steps))
+        # progress after warmup
+        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
+        return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress)))
+
+
+class WarmupCosineWithHardRestartsSchedule(LambdaLR):
+    """ Linear warmup and then cosine cycles with hard restarts.
+    Linearly increases the learning rate multiplier from 0. to 1. over `warmup_steps` training steps.
+    After warmup, the multiplier follows `cycles` (default=1.) cosine decays from 1. to 0.,
+    jumping back to 1. at the start of each cycle (hard restarts).
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
+        self.warmup_steps = warmup_steps
+        self.t_total = t_total
+        self.cycles = cycles
+        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)
+
+    def lr_lambda(self, step):
+        if step < self.warmup_steps:
+            return float(step) / float(max(1, self.warmup_steps))
+        # progress after warmup
+        progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps))
+        if progress >= 1.0:
+            return 0.0
+        return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0))))
+
+
+class AdamW(Optimizer):
+    """ Implements Adam algorithm with weight decay fix.
+
+    Parameters:
+        lr (float): learning rate. Default 1e-3.
+        betas (tuple of 2 floats): Adam's beta parameters (b1, b2). Default: (0.9, 0.999)
+        eps (float): Adam's epsilon. Default: 1e-6
+        weight_decay (float): Weight decay.
Default: 0.0 + correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True. + """ + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True): + if lr < 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1])) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps)) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + correct_bias=correct_bias) + super(AdamW, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p.data) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + exp_avg.mul_(beta1).add_(1.0 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) + denom = exp_avg_sq.sqrt().add_(group['eps']) + + step_size = group['lr'] + if group['correct_bias']: # No bias correction for Bert + bias_correction1 = 1.0 - beta1 ** state['step'] + bias_correction2 = 1.0 - beta2 ** state['step'] + step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 + + p.data.addcdiv_(-step_size, exp_avg, denom) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want to decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + # Add weight decay at the end (fixed version) + if group['weight_decay'] > 0.0: + p.data.add_(-group['lr'] * group['weight_decay'], p.data) + + return loss diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_auto.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_auto.py new file mode 100644 index 0000000..ec056de --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_auto.py @@ -0,0 +1,124 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Tokenizer class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .tokenization_bert import BertTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_ctrl import CTRLTokenizer
+from .tokenization_transfo_xl import TransfoXLTokenizer
+from .tokenization_xlnet import XLNetTokenizer
+from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
+from .tokenization_distilbert import DistilBertTokenizer
+
+logger = logging.getLogger(__name__)
+
+class AutoTokenizer(object):
+    r""":class:`~transformers.AutoTokenizer` is a generic tokenizer class
+        that will be instantiated as one of the tokenizer classes of the library
+        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct tokenizer class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The tokenizer class to instantiate is selected by the first pattern that matches
+        the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert`: BertTokenizer (Bert model)
+            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
+            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+            - contains `xlnet`: XLNetTokenizer (XLNet model)
+            - contains `xlm`: XLMTokenizer (XLM model)
+
+        This class cannot be instantiated using `__init__()` (it throws an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoTokenizer is designed to be instantiated "
+            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        r""" Instantiate one of the tokenizer classes of the library
+        from a pre-trained model vocabulary.
+
+        The tokenizer class to instantiate is selected by the first pattern that matches
+        the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertTokenizer (DistilBert model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert`: BertTokenizer (Bert model)
+            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `ctrl`: CTRLTokenizer (Salesforce CTRL model)
+            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+            - contains `xlnet`: XLNetTokenizer (XLNet model)
+            - contains `xlm`: XLMTokenizer (XLM model)
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the vocabulary files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
+
+        Examples::
+
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+
+        """
+        if 'distilbert' in pretrained_model_name_or_path:
+            return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
+            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'ctrl' in pretrained_model_name_or_path:
+            return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        raise ValueError("Unrecognized model identifier in {}. Should contain one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path))
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_bert.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_bert.py
new file mode 100644
index 0000000..8affdd9
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_bert.py
@@ -0,0 +1,502 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+        'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+        'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+        'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+        'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+        'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+        'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+        'bert-base-german-cased': "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
+        'bert-large-uncased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
+        'bert-large-cased-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
+        'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
+        'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
+        'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
+        'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
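+    # every checkpoint above was trained with 512 learned position embeddings,
+    # so 512 is the hard cap on input length for all of them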
+    'bert-base-uncased': 512,
+    'bert-large-uncased': 512,
+    'bert-base-cased': 512,
+    'bert-large-cased': 512,
+    'bert-base-multilingual-uncased': 512,
+    'bert-base-multilingual-cased': 512,
+    'bert-base-chinese': 512,
+    'bert-base-german-cased': 512,
+    'bert-large-uncased-whole-word-masking': 512,
+    'bert-large-cased-whole-word-masking': 512,
+    'bert-large-uncased-whole-word-masking-finetuned-squad': 512,
+    'bert-large-cased-whole-word-masking-finetuned-squad': 512,
+    'bert-base-cased-finetuned-mrpc': 512,
+    'bert-base-german-dbmdz-cased': 512,
+    'bert-base-german-dbmdz-uncased': 512,
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'bert-base-uncased': {'do_lower_case': True},
+    'bert-large-uncased': {'do_lower_case': True},
+    'bert-base-cased': {'do_lower_case': False},
+    'bert-large-cased': {'do_lower_case': False},
+    'bert-base-multilingual-uncased': {'do_lower_case': True},
+    'bert-base-multilingual-cased': {'do_lower_case': False},
+    'bert-base-chinese': {'do_lower_case': False},
+    'bert-base-german-cased': {'do_lower_case': False},
+    'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
+    'bert-large-cased-whole-word-masking': {'do_lower_case': False},
+    'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
+    'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
+    'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
+    'bert-base-german-dbmdz-cased': {'do_lower_case': False},
+    'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
+}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip('\n')
+        vocab[token] = index
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class BertTokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a BertTokenizer.
+    :class:`~transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
+        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
+        max_len: An artificial maximum length to truncate tokenized sequences to; the effective maximum length is always the
+            minimum of this value (if specified) and the underlying BERT model's sequence length.
+        never_split: List of tokens which will never be split during tokenization. Only has an effect when
+            do_basic_tokenize=True
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", tokenize_chinese_chars=True, **kwargs):
+        """Constructs a BertTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input.
+                Only has an effect when do_basic_tokenize=True
+            **do_basic_tokenize**: (`optional`) boolean (default True)
+                Whether to do basic tokenization before wordpiece.
+            **never_split**: (`optional`) list of string
+                List of tokens which will never be split during tokenization.
+                Only has an effect when do_basic_tokenize=True
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be deactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
+        """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                  never_split=never_split,
+                                                  tokenize_chinese_chars=tokenize_chinese_chars)
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def _tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                    split_tokens.append(sub_token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) to an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) to a single string. """
+        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added.
+        This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formatted with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+            0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+            | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        index = 0
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        else:
+            vocab_file = vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+        return (vocab_file,)
+
+
+class BasicTokenizer(object):
+    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
+
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
+        """ Constructs a BasicTokenizer.
+
+        Args:
+            **do_lower_case**: Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of tokens not to split.
+            **tokenize_chinese_chars**: (`optional`) boolean (default True)
+                Whether to tokenize Chinese characters.
+                This should likely be deactivated for Japanese:
+                see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
+        """
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+
+    def tokenize(self, text, never_split=None):
+        """ Basic Tokenization of a piece of text.
+        Splits on "white spaces" only; for sub-word tokenization, see WordpieceTokenizer.
+
+        Args:
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of tokens not to split.
+        """
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        text = self._clean_text(text)
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+                token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if never_split is not None and text in never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
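+        # For reference, the ranges tested below are: CJK Unified Ideographs (4E00-9FFF),
+        # Extension A (3400-4DBF), Extensions B-E in the supplementary planes (20000-2CEAF),
+        # and the CJK Compatibility Ideographs blocks (F900-FAFF, 2F800-2FA1F).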
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xfffd or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer(object):
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into its word pieces.
+
+        This uses a greedy longest-match-first algorithm to perform tokenization
+        using the given vocabulary.
+
+        For example:
+            input = "unaffable"
+            output = ["un", "##aff", "##able"]
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through `BasicTokenizer`.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `chars` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char == " " or char == "\t" or char == "\n" or char == "\r":
+        return True
+    cat = unicodedata.category(char)
+    if cat == "Zs":
+        return True
+    return False
+
+
+def _is_control(char):
+    """Checks whether `chars` is a control character."""
+    # These are technically control characters but we count them as whitespace
+    # characters.
+    if char == "\t" or char == "\n" or char == "\r":
+        return False
+    cat = unicodedata.category(char)
+    if cat.startswith("C"):
+        return True
+    return False
+
+
+def _is_punctuation(char):
+    """Checks whether `chars` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
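+    # For reference: ASCII 33-47 is !"#$%&'()*+,-./ , 58-64 is :;<=>?@ ,
+    # 91-96 is [\]^_` and 123-126 is {|}~ ; together these are exactly the
+    # printable non-alphanumeric ASCII characters.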
+    if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
+            (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
+        return True
+    cat = unicodedata.category(char)
+    if cat.startswith("P"):
+        return True
+    return False
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_ctrl.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_ctrl.py
new file mode 100644
index 0000000..2406fa2
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_ctrl.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Salesforce CTRL."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import json
+import logging
+import os
+import regex as re
+from io import open
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-vocab.json",
+    },
+    'merges_file':
+    {
+        'ctrl': "https://raw.githubusercontent.com/salesforce/ctrl/master/ctrl-merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'ctrl': 256,
+}
+
+def get_pairs(word):
+    """Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+class CTRLTokenizer(PreTrainedTokenizer):
+    """
+    CTRL BPE tokenizer. Peculiarities:
+        - Word-level Byte-Pair-Encoding (not byte-level), using '@@' as the continuation marker
+        - Requires a space to start the input string => the encoding methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, this tokenizer's ``encode`` and ``decode`` methods will not conserve
+          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)
+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        merges = [tuple(merge.split()) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        word = tuple(list(word[:-1]) + [word[-1] + '</w>'])  # '</w>' marks the end of a word
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = '@@ '.join(word)
+        word = word[:-4]  # strip the trailing '</w>'
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """ Tokenize a string.
+        """
+        split_tokens = []
+
+        text = text.split(' ')
+
+        for token in text:
+            split_tokens.extend([t for t in self.bpe(token).split(' ')])
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) to an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) to a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) to a single string.
""" + out_string = ' '.join(tokens).replace('@@ ', '').strip() + return out_string + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file + + # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) + # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) + # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) + # return ''.join(tokens_generated_so_far) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py new file mode 100644 index 0000000..dfa0292 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_distilbert.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for DistilBERT.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_bert import BertTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt", + 'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'distilbert-base-uncased': 512, + 'distilbert-base-uncased-distilled-squad': 512, +} + + +class DistilBertTokenizer(BertTokenizer): + r""" + Constructs a DistilBertTokenizer. 
+ :class:`~transformers.DistilBertTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_basic_tokenize: Whether to do basic tokenization before wordpiece. + max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the + minimum of this value (if specified) and the underlying BERT model's sequence length. + never_split: List of tokens which will never be split during tokenization. Only has an effect when + do_wordpiece_only=False + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_gpt2.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_gpt2.py new file mode 100644 index 0000000..6a7f75a --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_gpt2.py @@ -0,0 +1,234 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+    def lru_cache():
+        return lambda func: func
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
+    },
+    'merges_file':
+    {
+        'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
+        'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'gpt2': 1024,
+    'gpt2-medium': 1024,
+    'gpt2-large': 1024,
+    'distilgpt2': 1024,
+}
+
+@lru_cache()
+def bytes_to_unicode():
+    """
+    Returns a mapping between utf-8 bytes and unicode strings.
+    We specifically avoid mapping to whitespace/control characters that the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    """
+    _chr = unichr if sys.version_info[0] == 2 else chr
+    bs = list(range(ord("!"), ord("~")+1)) + list(range(ord("¡"), ord("¬")+1)) + list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [_chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+def get_pairs(word):
+    """Return the set of symbol pairs in a word.
+
+    Word is represented as a tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+class GPT2Tokenizer(PreTrainedTokenizer):
+    """
+    GPT-2 BPE tokenizer. Peculiarities:
+        - Byte-level Byte-Pair-Encoding
+        - Requires a space to start the input string => the encoding methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, the ``encode`` and ``decode`` methods will not preserve the absence
+          of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
+                 bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
+        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text, add_prefix_space=False):
+        """ Tokenize a string.
+            Args:
+                - add_prefix_space (boolean, default False):
+                    Begin the sentence with at least one space to get invariance to word order in GPT-2 (and RoBERTa) tokenizers.
+        """
+        if add_prefix_space:
+            text = ' ' + text
+
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            if sys.version_info[0] == 2:
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            else:
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) to an id using the vocab.
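+
+            Unknown tokens fall back to the id of ``unk_token`` ("<|endoftext|>" by default).
+            Note the byte-level vocab marks a leading space with 'Ġ'; e.g. (illustrative,
+            the actual id depends on the loaded vocab.json)::
+
+                tokenizer._convert_token_to_id('Ġhello')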
""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + text = ''.join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors) + return text + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file \ No newline at end of file diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_openai.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_openai.py new file mode 100644 index 0000000..0efbdb3 --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_openai.py @@ -0,0 +1,208 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +from io import open + +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_bert import BasicTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json", + }, + 'merges_file': + { + 'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'openai-gpt': 512, +} + +def get_pairs(word): + """ + Return set of symbol pairs in a word. 
+    word is represented as a tuple of symbols (symbols being variable-length strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+def text_standardize(text):
+    """
+    Fixes some issues the spacy tokenizer has on book corpora,
+    and standardizes some whitespace.
+    """
+    text = text.replace('—', '-')
+    text = text.replace('–', '-')
+    text = text.replace('―', '-')
+    text = text.replace('…', '...')
+    text = text.replace('´', "'")
+    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
+    text = re.sub(r'\s*\n\s*', ' \n ', text)
+    text = re.sub(r'[^\S\n]+', ' ', text)
+    return text.strip()
+
+class OpenAIGPTTokenizer(PreTrainedTokenizer):
+    """
+    BPE tokenizer. Peculiarities:
+        - lower-cases all inputs
+        - uses the SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, and falls back to BERT's BasicTokenizer if not.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
+        super(OpenAIGPTTokenizer, self).__init__(unk_token=unk_token, **kwargs)
+
+        self.max_len_single_sentence = self.max_len  # no default special tokens - you can update this value if you add special tokens
+        self.max_len_sentences_pair = self.max_len  # no default special tokens - you can update this value if you add special tokens
+
+        try:
+            import ftfy
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
+            self.fix_text = ftfy.fix_text
+        except ImportError:
+            logger.warning("ftfy or spacy is not installed; using BERT BasicTokenizer instead of SpaCy & ftfy.")
+            self.nlp = BasicTokenizer(do_lower_case=True)
+            self.fix_text = None
+
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        merges = [tuple(merge.split()) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """ Tokenize a string.
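+
+            Uses SpaCy + ftfy when both are installed (the original OpenAI GPT pipeline)
+            and falls back to BERT's BasicTokenizer otherwise; both paths lower-case the
+            input and split words with BPE, e.g. (hypothetical pieces; the real splits
+            depend on the merges file)::
+
+                tokenizer._tokenize("Hello there")  # -> ['hello</w>', 'there</w>']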
""" + split_tokens = [] + if self.fix_text is None: + # Using BERT's BasicTokenizer + text = self.nlp.tokenize(text) + for token in text: + split_tokens.extend([t for t in self.bpe(token).split(' ')]) + else: + # Using SpaCy & ftfy (original tokenization process of OpenAI GPT) + text = self.nlp(text_standardize(self.fix_text(text))) + for token in text: + split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')]) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an id in a token (BPE) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = ''.join(tokens).replace('', ' ').strip() + return out_string + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write(u'#version: 0.2\n') + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py new file mode 100644 index 0000000..9cc8a9a --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_roberta.py @@ -0,0 +1,140 @@ +# coding=utf-8 +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for RoBERTa.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import sys +import json +import logging +import os +import regex as re +from io import open + +from .tokenization_gpt2 import GPT2Tokenizer + +try: + from functools import lru_cache +except ImportError: + # Just a dummy decorator to get the checks to run on python2 + # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now. 
+    def lru_cache():
+        return lambda func: func
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
+        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
+    },
+    'merges_file':
+    {
+        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
+        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'roberta-base': 512,
+    'roberta-large': 512,
+    'roberta-large-mnli': 512,
+}
+
+
+class RobertaTokenizer(GPT2Tokenizer):
+    """
+    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
+        - Byte-level Byte-Pair-Encoding
+        - Requires a space to start the input string => the encoding methods should be called with the
+          ``add_prefix_space`` flag set to ``True``.
+          Otherwise, the ``encode`` and ``decode`` methods will not preserve the absence
+          of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
+                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
+        super(RobertaTokenizer, self).__init__(vocab_file=vocab_file, merges_file=merges_file, errors=errors,
+                                               bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
+                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
+                                               mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
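+
+            For instance (sketch; the ids are arbitrary), a single sequence of three ids
+            yields ``[1, 0, 0, 0, 1]`` once ``<s>`` and ``</s>`` are accounted for::
+
+                tokenizer.get_special_tokens_mask([5, 6, 7])  # -> [1, 0, 0, 0, 1]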
+ """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A RoBERTa sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_transfo_xl.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_transfo_xl.py new file mode 100644 index 0000000..8d5a0ce --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_transfo_xl.py @@ -0,0 +1,579 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for Transformer XL model. + Adapted from https://github.com/kimiyoung/transformer-xl. 
+""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import glob +import logging +import os +import sys +from collections import Counter, OrderedDict +from io import open + +import numpy as np + +from .file_utils import cached_path +from .tokenization_utils import PreTrainedTokenizer + +try: + import torch +except ImportError: + pass + +# if sys.version_info[0] == 2: +# import cPickle as pickle +# else: +# import pickle + + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'pretrained_vocab_file': 'vocab.bin', 'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'pretrained_vocab_file': + { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-vocab.bin", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'transfo-xl-wt103': None, +} + +PRETRAINED_CORPUS_ARCHIVE_MAP = { + 'transfo-xl-wt103': "https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-corpus.bin", +} +CORPUS_NAME = 'corpus.bin' + +class TransfoXLTokenizer(PreTrainedTokenizer): + """ + Transformer-XL tokenizer adapted from Vocab class in https://github.com/kimiyoung/transformer-xl + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, special=None, min_freq=0, max_size=None, lower_case=False, + delimiter=None, vocab_file=None, pretrained_vocab_file=None, + never_split=None, unk_token="", eos_token="", + additional_special_tokens=[""], **kwargs): + super(TransfoXLTokenizer, self).__init__(unk_token=unk_token, eos_token=eos_token, + additional_special_tokens=additional_special_tokens, + **kwargs) + + self.max_len_single_sentence = self.max_len # no default special tokens - you can update this value if you add special tokens + self.max_len_sentences_pair = self.max_len # no default special tokens - you can update this value if you add special tokens + + if never_split is None: + never_split = self.all_special_tokens + if special is None: + special = [] + self.counter = Counter() + self.special = special + self.min_freq = min_freq + self.max_size = max_size + self.lower_case = lower_case + self.delimiter = delimiter + self.vocab_file = vocab_file + self.never_split = never_split + + if pretrained_vocab_file is not None: + # Hack because, honestly this tokenizer was not made to be used + # in a library like ours, at all. 
+            vocab_dict = torch.load(pretrained_vocab_file)
+            for key, value in vocab_dict.items():
+                if key not in self.__dict__:
+                    self.__dict__[key] = value
+
+        if vocab_file is not None:
+            self.build_vocab()
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose: logger.info('counting file {} ...'.format(path))
+        assert os.path.exists(path)
+
+        sents = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    logger.info('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+    def count_sents(self, sents, verbose=False):
+        """
+        sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose: logger.info('counting {} sents ...'.format(len(sents)))
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                logger.info('    line {}'.format(idx))
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, 'r', encoding='utf-8') as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        if '<UNK>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<UNK>']
+        elif '<unk>' in self.sym2idx:
+            self.unk_idx = self.sym2idx['<unk>']
+        else:
+            raise ValueError('No <unk> token in vocabulary')
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['pretrained_vocab_file'])
+        torch.save(self.__dict__, vocab_file)
+        return (vocab_file,)
+
+    def build_vocab(self):
+        if self.vocab_file:
+            logger.info('building vocab from {}'.format(self.vocab_file))
+            self._build_from_file(self.vocab_file)
+            logger.info('final vocab size {}'.format(len(self)))
+        else:
+            logger.info('building vocab with min_freq={}, max_size={}'.format(
+                self.min_freq, self.max_size))
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq: break
+                self.add_symbol(sym)
+
+            logger.info('final vocab size {} from {} unique tokens'.format(
+                len(self), len(self.counter)))
+
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True,
+                    add_double_eos=False):
+        if verbose: logger.info('encoding file {} ...'.format(path))
+        assert os.path.exists(path)
+        encoded = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    logger.info('    line {}'.format(idx))
+                symbols = self.tokenize(line, add_eos=add_eos,
+                                        add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose: logger.info('encoding {} sents ...'.format(len(sents)))
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                logger.info('    line {}'.format(idx))
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, '{}_idx'.format(sym.strip('<>')), self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def _convert_id_to_token(self, idx):
+        """Converts an index (integer) to a token (string/unicode) using the vocab."""
+        assert 0 <= idx < len(self), 'Index {} out of vocabulary range'.format(idx)
+        return self.idx2sym[idx]
+
+    def _convert_token_to_id(self, sym):
+        """ Converts a token (str/unicode) to an id using the vocab. """
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            # logger.info('encounter unk {}'.format(sym))
+            # assert '<eos>' not in sym
+            if hasattr(self, 'unk_idx'):
+                return self.sym2idx.get(sym, self.unk_idx)
+            # Backward compatibility with pre-trained models
+            elif '<unk>' in self.sym2idx:
+                return self.sym2idx['<unk>']
+            elif '<UNK>' in self.sym2idx:
+                return self.sym2idx['<UNK>']
+            else:
+                raise ValueError('Token not in vocabulary and no <unk> token in vocabulary for replacement')
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) into a single string. """
+        out_string = ' '.join(tokens).strip()
+        return out_string
+
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+    @property
+    def vocab_size(self):
+        return len(self.idx2sym)
+
+    def _tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
+
+        # empty delimiter '' will evaluate False
+        if self.delimiter == '':
+            symbols = line
+        else:
+            symbols = line.split(self.delimiter)
+
+        if add_double_eos:  # lm1b
+            return ['<S>'] + symbols + ['<S>']
+        elif add_eos:
+            return symbols + ['<eos>']
+        else:
+            return symbols
+
+
+class LMOrderedIterator(object):
+    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+        """
+        data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
+        self.data = data.view(bsz, -1).t().contiguous().to(device)
+
+        # Number of mini-batches
+        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
+
+    def get_batch(self, i, bptt=None):
+        if bptt is None: bptt = self.bptt
+        seq_len = min(bptt, self.data.size(0) - 1 - i)
+
+        end_idx = i + seq_len
+        beg_idx = max(0, i - self.ext_len)
+
+        data = self.data[beg_idx:end_idx]
+        target = self.data[i+1:i+1+seq_len]
+
+        data_out = data.transpose(0, 1).contiguous().to(self.device)
+        target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+        return data_out, target_out, seq_len
+
+    def get_fixlen_iter(self, start=0):
+        for i in range(start, self.data.size(0) - 1, self.bptt):
+            yield self.get_batch(i)
+
+    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
+        max_len = self.bptt + max_deviation * std
+        i = start
+        while True:
+            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.
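+            # Jitter the working bptt with Gaussian noise and clamp it to
+            # [min_len, max_len]; this variable-length scheme is ported from the
+            # original Transformer-XL training code.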
+ bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std)))) + data, target, seq_len = self.get_batch(i, bptt) + i += seq_len + yield data, target, seq_len + if i >= self.data.size(0) - 2: + break + + def __iter__(self): + return self.get_fixlen_iter() + + +class LMShuffledIterator(object): + def __init__(self, data, bsz, bptt, device='cpu', ext_len=None, shuffle=False): + """ + data -- list[LongTensor] -- there is no order among the LongTensors + """ + self.data = data + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self): + # index iterator + epoch_indices = np.random.permutation(len(self.data)) if self.shuffle \ + else np.array(range(len(self.data))) + + # sentence iterator + for idx in epoch_indices: + yield self.data[idx] + + def stream_iterator(self, sent_stream): + # streams for each data in the batch + streams = [None] * self.bsz + + data = torch.LongTensor(self.bptt, self.bsz) + target = torch.LongTensor(self.bptt, self.bsz) + + n_retain = 0 + + while True: + # data : [n_retain+bptt x bsz] + # target : [bptt x bsz] + data[n_retain:].fill_(-1) + target.fill_(-1) + + valid_batch = True + + for i in range(self.bsz): + n_filled = 0 + try: + while n_filled < self.bptt: + if streams[i] is None or len(streams[i]) <= 1: + streams[i] = next(sent_stream) + # number of new tokens to fill in + n_new = min(len(streams[i]) - 1, self.bptt - n_filled) + # first n_retain tokens are retained from last batch + data[n_retain+n_filled:n_retain+n_filled+n_new, i] = \ + streams[i][:n_new] + target[n_filled:n_filled+n_new, i] = \ + streams[i][1:n_new+1] + streams[i] = streams[i][n_new:] + n_filled += n_new + except StopIteration: + valid_batch = False + break + + if not valid_batch: + return + + data_out = data.transpose(0, 1).contiguous().to(self.device) + target_out = target.transpose(0, 1).contiguous().to(self.device) + + yield data_out, target_out, self.bptt + + n_retain = min(data.size(0), self.ext_len) + if n_retain > 0: + data[:n_retain] = data[-n_retain:] + data.resize_(n_retain + self.bptt, data.size(1)) + + def __iter__(self): + # sent_stream is an iterator + sent_stream = self.get_sent_stream() + + for batch in self.stream_iterator(sent_stream): + yield batch + + +class LMMultiFileIterator(LMShuffledIterator): + def __init__(self, paths, vocab, bsz, bptt, device='cpu', ext_len=None, + shuffle=False): + + self.paths = paths + self.vocab = vocab + + self.bsz = bsz + self.bptt = bptt + self.ext_len = ext_len if ext_len is not None else 0 + + self.device = device + self.shuffle = shuffle + + def get_sent_stream(self, path): + sents = self.vocab.encode_file(path, add_double_eos=True) + if self.shuffle: + np.random.shuffle(sents) + sent_stream = iter(sents) + + return sent_stream + + def __iter__(self): + if self.shuffle: + np.random.shuffle(self.paths) + + for path in self.paths: + # sent_stream is an iterator + sent_stream = self.get_sent_stream(path) + for batch in self.stream_iterator(sent_stream): + yield batch + + +class TransfoXLCorpus(object): + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a pre-processed corpus. 
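+
+        For known shortcut names (only 'transfo-xl-wt103' here) the pre-tokenized
+        ``corpus.bin`` archive is downloaded and cached; any other value is treated as a
+        local directory that already contains ``corpus.bin``.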
+ """ + vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + if pretrained_model_name_or_path in PRETRAINED_CORPUS_ARCHIVE_MAP: + corpus_file = PRETRAINED_CORPUS_ARCHIVE_MAP[pretrained_model_name_or_path] + else: + corpus_file = os.path.join(pretrained_model_name_or_path, CORPUS_NAME) + # redirect to the cache, if necessary + try: + resolved_corpus_file = cached_path(corpus_file, cache_dir=cache_dir) + except EnvironmentError: + logger.error( + "Corpus '{}' was not found in corpus list ({}). " + "We assumed '{}' was a path or url but couldn't find files {} " + "at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys()), + pretrained_model_name_or_path, + corpus_file)) + return None + if resolved_corpus_file == corpus_file: + logger.info("loading corpus file {}".format(corpus_file)) + else: + logger.info("loading corpus file {} from cache at {}".format( + corpus_file, resolved_corpus_file)) + + # Instantiate tokenizer. + corpus = cls(*inputs, **kwargs) + corpus_dict = torch.load(resolved_corpus_file) + for key, value in corpus_dict.items(): + corpus.__dict__[key] = value + corpus.vocab = vocab + if corpus.train is not None: + corpus.train = torch.tensor(corpus.train, dtype=torch.long) + if corpus.valid is not None: + corpus.valid = torch.tensor(corpus.valid, dtype=torch.long) + if corpus.test is not None: + corpus.test = torch.tensor(corpus.test, dtype=torch.long) + return corpus + + def __init__(self, *args, **kwargs): + self.vocab = TransfoXLTokenizer(*args, **kwargs) + self.dataset = None + self.train = None + self.valid = None + self.test = None + + def build_corpus(self, path, dataset): + self.dataset = dataset + + if self.dataset in ['ptb', 'wt2', 'enwik8', 'text8']: + self.vocab.count_file(os.path.join(path, 'train.txt')) + self.vocab.count_file(os.path.join(path, 'valid.txt')) + self.vocab.count_file(os.path.join(path, 'test.txt')) + elif self.dataset == 'wt103': + self.vocab.count_file(os.path.join(path, 'train.txt')) + elif self.dataset == 'lm1b': + train_path_pattern = os.path.join( + path, '1-billion-word-language-modeling-benchmark-r13output', + 'training-monolingual.tokenized.shuffled', 'news.en-*') + train_paths = glob.glob(train_path_pattern) + # the vocab will load from file when build_vocab() is called + + self.vocab.build_vocab() + + if self.dataset in ['ptb', 'wt2', 'wt103']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True) + elif self.dataset in ['enwik8', 'text8']: + self.train = self.vocab.encode_file( + os.path.join(path, 'train.txt'), ordered=True, add_eos=False) + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=True, add_eos=False) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=True, add_eos=False) + elif self.dataset == 'lm1b': + self.train = train_paths + self.valid = self.vocab.encode_file( + os.path.join(path, 'valid.txt'), ordered=False, add_double_eos=True) + self.test = self.vocab.encode_file( + os.path.join(path, 'test.txt'), ordered=False, add_double_eos=True) + + def get_iterator(self, split, *args, **kwargs): + if split == 'train': + if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']: + data_iter = LMOrderedIterator(self.train, *args, **kwargs) + elif self.dataset == 'lm1b': + 
                kwargs['shuffle'] = True
+                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+        elif split in ['valid', 'test']:
+            data = self.valid if split == 'valid' else self.test
+            if self.dataset in ['ptb', 'wt2', 'wt103', 'enwik8', 'text8']:
+                data_iter = LMOrderedIterator(data, *args, **kwargs)
+            elif self.dataset == 'lm1b':
+                data_iter = LMShuffledIterator(data, *args, **kwargs)
+
+        return data_iter
+
+
+def get_lm_corpus(datadir, dataset):
+    fn = os.path.join(datadir, 'cache.pt')
+    fn_pickle = os.path.join(datadir, 'cache.pkl')
+    if os.path.exists(fn):
+        logger.info('Loading cached dataset...')
+        corpus = torch.load(fn)
+    elif os.path.exists(fn_pickle):
+        logger.info('Loading cached dataset from pickle...')
+        with open(fn_pickle, "rb") as fp:
+            corpus = pickle.load(fp)
+    else:
+        logger.info('Producing dataset {}...'.format(dataset))
+        kwargs = {}
+        if dataset in ['wt103', 'wt2']:
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = False
+        elif dataset == 'ptb':
+            kwargs['special'] = ['<eos>']
+            kwargs['lower_case'] = True
+        elif dataset == 'lm1b':
+            kwargs['special'] = []
+            kwargs['lower_case'] = False
+            kwargs['vocab_file'] = os.path.join(datadir, '1b_word_vocab.txt')
+        elif dataset in ['enwik8', 'text8']:
+            pass
+
+        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
+        torch.save(corpus, fn)
+
+    return corpus
diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_utils.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_utils.py
new file mode 100644
index 0000000..5e5be87
--- /dev/null
+++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_utils.py
@@ -0,0 +1,1068 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Base tokenization classes, shared by all concrete tokenizers."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+import os
+import json
+import six
+import copy
+from io import open
+
+from .file_utils import cached_path, is_tf_available, is_torch_available
+
+if is_tf_available():
+    import tensorflow as tf
+if is_torch_available():
+    import torch
+
+logger = logging.getLogger(__name__)
+
+SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
+ADDED_TOKENS_FILE = 'added_tokens.json'
+TOKENIZER_CONFIG_FILE = 'tokenizer_config.json'
+
+class PreTrainedTokenizer(object):
+    """ Base class for all tokenizers.
+    Handles all the shared methods for tokenization and special tokens, as well as methods for downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.
+
+    This class also contains the added tokens in a unified way on top of all tokenizers, so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
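+
+    A minimal derived-class sketch (hypothetical names; ``load_vocab`` is an assumed
+    helper, and the concrete tokenizers in this package follow the same shape)::
+
+        class MyTokenizer(PreTrainedTokenizer):
+            vocab_files_names = {'vocab_file': 'vocab.txt'}
+
+            def __init__(self, vocab_file, **kwargs):
+                super(MyTokenizer, self).__init__(**kwargs)
+                self.vocab = load_vocab(vocab_file)  # hypothetical helper
+
+            @property
+            def vocab_size(self):
+                return len(self.vocab)
+
+            def _tokenize(self, text):
+                return text.split()
+
+            def _convert_token_to_id(self, token):
+                return self.vocab.get(token, self.vocab.get(self.unk_token))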
+
+    Class attributes (overridden by derived classes):
+
+        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
+        - ``pretrained_vocab_files_map``: a python ``dict of dict``, the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
+        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
+        - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, a dictionary of specific arguments to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the tokenizer with the ``from_pretrained()`` method.
+
+    Parameters:
+
+        - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token`` and ``self.bos_token_id``
+
+        - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token`` and ``self.eos_token_id``
+
+        - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token`` and ``self.unk_token_id``
+
+        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token`` and ``self.sep_token_id``
+
+        - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token`` and ``self.pad_token_id``
+
+        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token`` and ``self.cls_token_id``
+
+        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
+
+        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensures they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
+    """
+    vocab_files_names = {}
+    pretrained_vocab_files_map = {}
+    pretrained_init_configuration = {}
+    max_model_input_sizes = {}
+
+    SPECIAL_TOKENS_ATTRIBUTES = ["bos_token", "eos_token", "unk_token", "sep_token",
+                                 "pad_token", "cls_token", "mask_token",
+                                 "additional_special_tokens"]
+
+    @property
+    def bos_token(self):
+        """ Beginning of sentence token (string). Log an error if used while not having been set. """
+        if self._bos_token is None:
+            logger.error("Using bos_token, but it is not set yet.")
+        return self._bos_token
+
+    @property
+    def eos_token(self):
+        """ End of sentence token (string). Log an error if used while not having been set. """
+        if self._eos_token is None:
+            logger.error("Using eos_token, but it is not set yet.")
+        return self._eos_token
+
+    @property
+    def unk_token(self):
+        """ Unknown token (string). Log an error if used while not having been set.
""" + if self._unk_token is None: + logger.error("Using unk_token, but it is not set yet.") + return self._unk_token + + @property + def sep_token(self): + """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + if self._sep_token is None: + logger.error("Using sep_token, but it is not set yet.") + return self._sep_token + + @property + def pad_token(self): + """ Padding token (string). Log an error if used while not having been set. """ + if self._pad_token is None: + logger.error("Using pad_token, but it is not set yet.") + return self._pad_token + + @property + def cls_token(self): + """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ + if self._cls_token is None: + logger.error("Using cls_token, but it is not set yet.") + return self._cls_token + + @property + def mask_token(self): + """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """ + if self._mask_token is None: + logger.error("Using mask_token, but it is not set yet.") + return self._mask_token + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """ + if self._additional_special_tokens is None: + logger.error("Using additional_special_tokens, but it is not set yet.") + return self._additional_special_tokens + + @bos_token.setter + def bos_token(self, value): + self._bos_token = value + + @eos_token.setter + def eos_token(self, value): + self._eos_token = value + + @unk_token.setter + def unk_token(self, value): + self._unk_token = value + + @sep_token.setter + def sep_token(self, value): + self._sep_token = value + + @pad_token.setter + def pad_token(self, value): + self._pad_token = value + + @cls_token.setter + def cls_token(self, value): + self._cls_token = value + + @mask_token.setter + def mask_token(self, value): + self._mask_token = value + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value + + @property + def bos_token_id(self): + """ Id of the beginning of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.bos_token) + + @property + def eos_token_id(self): + """ Id of the end of sentence token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.eos_token) + + @property + def unk_token_id(self): + """ Id of the unknown token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.unk_token) + + @property + def sep_token_id(self): + """ Id of the separation token in the vocabulary. E.g. separate context and query in an input sequence. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.sep_token) + + @property + def pad_token_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self.convert_tokens_to_ids(self.pad_token) + + @property + def cls_token_id(self): + """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. 
Log an error if used while not having been set. """
+        return self.convert_tokens_to_ids(self.cls_token)
+
+    @property
+    def mask_token_id(self):
+        """ Id of the mask token in the vocabulary. E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
+        return self.convert_tokens_to_ids(self.mask_token)
+
+    @property
+    def additional_special_tokens_ids(self):
+        """ Ids of all the additional special tokens in the vocabulary (list of integers). Log an error if used while not having been set. """
+        return self.convert_tokens_to_ids(self.additional_special_tokens)
+
+    def __init__(self, max_len=None, **kwargs):
+        self._bos_token = None
+        self._eos_token = None
+        self._unk_token = None
+        self._sep_token = None
+        self._pad_token = None
+        self._cls_token = None
+        self._mask_token = None
+        self._additional_special_tokens = []
+
+        self.max_len = max_len if max_len is not None else int(1e12)
+
+        # Added tokens
+        self.added_tokens_encoder = {}
+        self.added_tokens_decoder = {}
+
+        # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``)
+        self.init_inputs = ()
+        self.init_kwargs = {}
+
+        for key, value in kwargs.items():
+            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
+                if key == 'additional_special_tokens':
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+                else:
+                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
+                setattr(self, key, value)
+
+
+    @classmethod
+    def from_pretrained(cls, *inputs, **kwargs):
+        r"""
+        Instantiate a :class:`~transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+
+        Args:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force a (re-)download of the vocabulary files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~transformers.PreTrainedTokenizer` for details.
+
+        Examples::
+
+            # We can't directly instantiate the base class `PreTrainedTokenizer`, so let's show our examples on a derived class: BertTokenizer
+
+            # Download vocabulary from S3 and cache.
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt') + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='') + # You should be sure '' is in the vocabulary when doing that. + # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == '' + + """ + return cls._from_pretrained(*inputs, **kwargs) + + + @classmethod + def _from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs): + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + proxies = kwargs.pop('proxies', None) + + s3_models = list(cls.max_model_input_sizes.keys()) + vocab_files = {} + init_configuration = {} + if pretrained_model_name_or_path in s3_models: + # Get the vocabulary from AWS S3 bucket + for file_id, map_list in cls.pretrained_vocab_files_map.items(): + vocab_files[file_id] = map_list[pretrained_model_name_or_path] + if cls.pretrained_init_configuration and pretrained_model_name_or_path in cls.pretrained_init_configuration: + init_configuration = cls.pretrained_init_configuration[pretrained_model_name_or_path] + else: + # Get the vocabulary from local files + logger.info( + "Model name '{}' not found in model shortcut name list ({}). " + "Assuming '{}' is a path or url to a directory containing tokenizer files.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path)) + + # Look for the tokenizer main vocabulary files + for file_id, file_name in cls.vocab_files_names.items(): + if os.path.isdir(pretrained_model_name_or_path): + # If a directory is provided we look for the standard filenames + full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + else: + # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) + full_file_name = pretrained_model_name_or_path + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + vocab_files[file_id] = full_file_name + + # Look for the additional tokens files + additional_files_names = {'added_tokens_file': ADDED_TOKENS_FILE, + 'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE, + 'tokenizer_config_file': TOKENIZER_CONFIG_FILE, + } + + # If a path to a file was provided, get the parent directory + saved_directory = pretrained_model_name_or_path + if os.path.exists(saved_directory) and not os.path.isdir(saved_directory): + saved_directory = os.path.dirname(saved_directory) + + for file_id, file_name in additional_files_names.items(): + full_file_name = os.path.join(saved_directory, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. We won't load it.".format(full_file_name)) + full_file_name = None + vocab_files[file_id] = full_file_name + + if all(full_file_name is None for full_file_name in vocab_files.values()): + raise EnvironmentError( + "Model name '{}' was not found in tokenizers model name list ({}). 
" + "We assumed '{}' was a path or url to a directory containing vocabulary files " + "named {} but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values()))) + + # Get files from url, cache, or disk depending on the case + try: + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + else: + resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies) + except EnvironmentError: + if pretrained_model_name_or_path in s3_models: + msg = "Couldn't reach server at '{}' to download vocabulary files." + else: + msg = "Model name '{}' was not found in tokenizers model name list ({}). " \ + "We assumed '{}' was a path or url to a directory containing vocabulary files " \ + "named {}, but couldn't find such vocabulary files at this path or url.".format( + pretrained_model_name_or_path, ', '.join(s3_models), + pretrained_model_name_or_path, + list(cls.vocab_files_names.values())) + + raise EnvironmentError(msg) + + for file_id, file_path in vocab_files.items(): + if file_path == resolved_vocab_files[file_id]: + logger.info("loading file {}".format(file_path)) + else: + logger.info("loading file {} from cache at {}".format( + file_path, resolved_vocab_files[file_id])) + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop('tokenizer_config_file', None) + if tokenizer_config_file is not None: + init_kwargs = json.load(open(tokenizer_config_file, encoding="utf-8")) + saved_init_inputs = init_kwargs.pop('init_inputs', ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + init_kwargs = init_configuration + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Set max length if needed + if pretrained_model_name_or_path in cls.max_model_input_sizes: + # if we're using a pretrained model, ensure the tokenizer + # wont index sequences longer than the number of positional embeddings + max_len = cls.max_model_input_sizes[pretrained_model_name_or_path] + if max_len is not None and isinstance(max_len, (int, float)): + init_kwargs['max_len'] = min(init_kwargs.get('max_len', int(1e12)), max_len) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop('added_tokens_file', None) + special_tokens_map_file = resolved_vocab_files.pop('special_tokens_map_file', None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + if special_tokens_map_file is not None: + special_tokens_map = json.load(open(special_tokens_map_file, encoding="utf-8")) + for key, value in special_tokens_map.items(): + if key not in init_kwargs: + init_kwargs[key] = value + + # Instantiate tokenizer. + tokenizer = cls(*init_inputs, **init_kwargs) + + # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` + tokenizer.init_inputs = init_inputs + tokenizer.init_kwargs = init_kwargs + + # Add supplementary tokens. 
+ if added_tokens_file is not None: + added_tok_encoder = json.load(open(added_tokens_file, encoding="utf-8")) + added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} + tokenizer.added_tokens_encoder.update(added_tok_encoder) + tokenizer.added_tokens_decoder.update(added_tok_decoder) + + return tokenizer + + + def save_pretrained(self, save_directory): + """ Save the tokenizer vocabulary files together with: + - added tokens, + - special-tokens-to-class-attributes-mapping, + - tokenizer instantiation positional and keywords inputs (e.g. do_lower_case for Bert). + + This won't save modifications other than (added tokens and special token mapping) you may have + applied to the tokenizer after the instantiation (e.g. modifying tokenizer.do_lower_case after creation). + + This method make sure the full tokenizer can then be re-loaded using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + if not os.path.isdir(save_directory): + logger.error("Saving directory ({}) should be a directory".format(save_directory)) + return + + special_tokens_map_file = os.path.join(save_directory, SPECIAL_TOKENS_MAP_FILE) + added_tokens_file = os.path.join(save_directory, ADDED_TOKENS_FILE) + tokenizer_config_file = os.path.join(save_directory, TOKENIZER_CONFIG_FILE) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + tokenizer_config['init_inputs'] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + with open(tokenizer_config_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(tokenizer_config, ensure_ascii=False)) + + with open(special_tokens_map_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.special_tokens_map, ensure_ascii=False)) + + with open(added_tokens_file, 'w', encoding='utf-8') as f: + if self.added_tokens_encoder: + out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False) + else: + out_str = u"{}" + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory) + + return vocab_files + (special_tokens_map_file, added_tokens_file) + + + def save_vocabulary(self, save_directory): + """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens + and special token mappings. + + Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained` class method. + """ + raise NotImplementedError + + + def vocab_size(self): + """ Size of the base vocabulary (without the added tokens) """ + raise NotImplementedError + + + def __len__(self): + """ Size of the full vocabulary with the added tokens """ + return self.vocab_size + len(self.added_tokens_encoder) + + + def add_tokens(self, new_tokens): + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the + vocabulary, they are added to it with indices starting from length of the current vocabulary. + + Args: + new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them). + + Returns: + Number of tokens added to the vocabulary. 
+
+        Examples::
+
+            # Let's see how to increase the vocabulary of Bert model and tokenizer
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = BertModel.from_pretrained('bert-base-uncased')
+
+            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+            print('We have added', num_added_toks, 'tokens')
+            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        """
+        if not new_tokens:
+            return 0
+
+        to_add_tokens = []
+        for token in new_tokens:
+            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
+            if token != self.unk_token and \
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
+                    token not in to_add_tokens:
+                to_add_tokens.append(token)
+                logger.info("Adding %s to the vocabulary", token)
+
+        added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens))
+        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
+        self.added_tokens_encoder.update(added_tok_encoder)
+        self.added_tokens_decoder.update(added_tok_decoder)
+
+        return len(to_add_tokens)
+
+    def num_added_tokens(self, pair=False):
+        """
+        Returns the number of added tokens when encoding a sequence with special tokens.
+
+        Note:
+            This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
+            inside your training loop.
+
+        Args:
+            pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
+                number of added tokens in the case of a single sequence if set to False.
+
+        Returns:
+            Number of tokens added to sequences
+        """
+        token_ids_0 = []
+        token_ids_1 = []
+        return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
+
+    def add_special_tokens(self, special_tokens_dict):
+        """
+        Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
+        to class attributes. If special tokens are NOT in the vocabulary, they are added
+        to it (indexed starting from the last index of the current vocabulary).
+
+        Using `add_special_tokens` will ensure your special tokens can be used in several ways:
+
+        - special tokens are carefully handled by the tokenizer (they are never split)
+        - you can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This makes it easy to develop model-agnostic training and fine-tuning scripts.
+
+        When possible, special tokens are already registered for provided pretrained models (ex: BertTokenizer cls_token is already registered to be '[CLS]' and XLM's one is also registered to be '</s>')
+
+        Args:
+            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
+                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
+                ``additional_special_tokens``].
+
+                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
+
+        Returns:
+            Number of tokens added to the vocabulary.
+
+        Examples::
+
+            # Let's see how to add a new classification token to GPT-2
+            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            model = GPT2Model.from_pretrained('gpt2')
+
+            special_tokens_dict = {'cls_token': '<CLS>'}
+
+            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
+            print('We have added', num_added_toks, 'tokens')
+            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+
+            assert tokenizer.cls_token == '<CLS>'
+        """
+        if not special_tokens_dict:
+            return 0
+
+        added_tokens = 0
+        for key, value in special_tokens_dict.items():
+            assert key in self.SPECIAL_TOKENS_ATTRIBUTES
+            if key == 'additional_special_tokens':
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+                added_tokens += self.add_tokens(value)
+            else:
+                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
+                added_tokens += self.add_tokens([value])
+            logger.info("Assigning %s to the %s key of the tokenizer", value, key)
+            setattr(self, key, value)
+
+        return added_tokens
+
+    def tokenize(self, text, **kwargs):
+        """ Converts a string into a sequence of tokens (strings), using the tokenizer.
+            Splits into words for word-based vocabularies or sub-words for sub-word-based
+            vocabularies (BPE/SentencePiece/WordPiece).
+
+            Takes care of added tokens.
+        """
+        def split_on_token(tok, text):
+            result = []
+            split_text = text.split(tok)
+            for i, sub_text in enumerate(split_text):
+                sub_text = sub_text.strip()
+                if i == 0 and not sub_text:
+                    result += [tok]
+                elif i == len(split_text) - 1:
+                    if sub_text:
+                        result += [sub_text]
+                    else:
+                        pass
+                else:
+                    if sub_text:
+                        result += [sub_text]
+                    result += [tok]
+            return result
+
+        def split_on_tokens(tok_list, text):
+            if not text:
+                return []
+            if not tok_list:
+                return self._tokenize(text, **kwargs)
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self.added_tokens_encoder \
+                            and sub_text not in self.all_special_tokens:
+                        tokenized_text += split_on_token(tok, sub_text)
+                    else:
+                        tokenized_text += [sub_text]
+                text_list = tokenized_text
+
+            return sum((self._tokenize(token, **kwargs) if token not \
+                in self.added_tokens_encoder and token not in self.all_special_tokens \
+                else [token] for token in tokenized_text), [])
+
+        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
+        tokenized_text = split_on_tokens(added_tokens, text)
+        return tokenized_text
+
+    def _tokenize(self, text, **kwargs):
+        """ Converts a string into a sequence of tokens (strings), using the tokenizer.
+            Splits into words for word-based vocabularies or sub-words for sub-word-based
+            vocabularies (BPE/SentencePiece/WordPiece).
+
+            Does NOT take care of added tokens.
+        """
+        raise NotImplementedError
+
+    def convert_tokens_to_ids(self, tokens):
+        """ Converts a single token, or a sequence of tokens, (str/unicode) into a single integer id
+            (resp. a sequence of ids), using the vocabulary.
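+
+            Example (illustrative; actual tokens and ids depend on the loaded vocabulary)::
+
+                tokens = tokenizer.tokenize("unaffable")       # e.g. ['un', '##aff', '##able'] with a WordPiece vocab
+                ids = tokenizer.convert_tokens_to_ids(tokens)  # e.g. [8279, 11095, 3085] -- placeholder ids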
+ """ + if tokens is None: + return None + + if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + if len(ids) > self.max_len: + logger.warning("Token indices sequence length is longer than the specified maximum sequence length " + "for this model ({} > {}). Running this sequence through the model will result in " + "indexing errors".format(len(ids), self.max_len)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self.added_tokens_encoder: + return self.added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def encode(self, + text, + text_pair=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncation_strategy='longest_first', + return_tensors=None, + **kwargs): + """ + Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``. + + Args: + text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant + or PyTorch torch.Tensor instead of a list of python integers. + **kwargs: passed to the `self.tokenize()` method + """ + encoded_inputs = self.encode_plus(text, + text_pair=text_pair, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors, + **kwargs) + + return encoded_inputs["input_ids"] + + def encode_plus(self, + text, + text_pair=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncation_strategy='longest_first', + return_tensors=None, + **kwargs): + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional informations: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. 
+ + Args: + text: The first sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method) + text_pair: Optional second sequence to be encoded. This can be a string, a list of strings (tokenized + string using the `tokenize` method) or a list of integers (tokenized string ids using the + `convert_tokens_to_ids` method) + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant + or PyTorch torch.Tensor instead of a list of python integers. + **kwargs: passed to the `self.tokenize()` method + """ + + def get_input_ids(text): + if isinstance(text, six.string_types): + return self.convert_tokens_to_ids(self.tokenize(text, **kwargs)) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], six.string_types): + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError("Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers.") + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model(first_ids, + pair_ids=second_ids, + max_length=max_length, + add_special_tokens=add_special_tokens, + stride=stride, + truncation_strategy=truncation_strategy, + return_tensors=return_tensors) + + def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0, + truncation_strategy='longest_first', return_tensors=None): + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. + It adds special tokens, truncates + sequences if overflowing while taking into account the special tokens and manages a window stride for + overflowing tokens + + Args: + ids: list of tokenized input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + pair_ids: Optional second list of input ids. Can be obtained from a string by chaining the + `tokenize` and `convert_tokens_to_ids` methods. + max_length: maximum length of the returned list. Will truncate by taking into account the special tokens. + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + stride: window stride for overflowing tokens. 
Can be useful for edge effect removal when using sequential + list of inputs. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant + or PyTorch torch.Tensor instead of a list of python integers. + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + overflowing_tokens: list[int] if a ``max_length`` is specified, else None + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` + } + + With the fields: + ``input_ids``: list of tokens to be fed to a model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. + """ + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + encoded_inputs = {} + total_len = len_ids + len_pair_ids + (self.num_added_tokens(pair=pair) if add_special_tokens else 0) + if max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences(ids, pair_ids=pair_ids, + num_tokens_to_remove=total_len-max_length, + truncation_strategy=truncation_strategy, + stride=stride) + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) + + if return_tensors == 'tf' and is_tf_available(): + sequence = tf.constant([sequence]) + token_type_ids = tf.constant([token_type_ids]) + elif return_tensors == 'pt' and is_torch_available(): + sequence = torch.tensor([sequence]) + token_type_ids = torch.tensor([token_type_ids]) + elif return_tensors is not None: + logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) + + encoded_inputs["input_ids"] = sequence + encoded_inputs["token_type_ids"] = token_type_ids + + if max_length and len(encoded_inputs["input_ids"]) > max_length: + encoded_inputs["input_ids"] = encoded_inputs["input_ids"][:max_length] + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"][:max_length] + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"][:max_length] + + return encoded_inputs + + def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): + """Truncates a sequence pair in place to the maximum length. 
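+        Example (schematic, placeholder ids): with ids=[1, 2, 3, 4], pair_ids=[5, 6] and
+        num_tokens_to_remove=2, 'longest_first' pops from whichever list is currently longer,
+        leaving ids=[1, 2], pair_ids=[5, 6] and overflowing_tokens=[3, 4] (for stride=0).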
+            truncation_strategy: string selected in the following options:
+                - 'longest_first' (default) Iteratively reduce the inputs' lengths, removing a token from the
+                    longer sequence at each step, until the total is under max_length (when there is a pair of
+                    input sequences). Overflowing tokens only contain overflow from the first sequence.
+                - 'only_first': Only truncate the first sequence. Raises an error if the first sequence is
+                    shorter than or equal to num_tokens_to_remove.
+                - 'only_second': Only truncate the second sequence
+                - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length)
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, pair_ids, []
+
+        if truncation_strategy == 'longest_first':
+            overflowing_tokens = []
+            for _ in range(num_tokens_to_remove):
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    overflowing_tokens = [ids[-1]] + overflowing_tokens
+                    ids = ids[:-1]
+                else:
+                    pair_ids = pair_ids[:-1]
+            window_len = min(len(ids), stride)
+            if window_len > 0:
+                overflowing_tokens = ids[-window_len:] + overflowing_tokens
+        elif truncation_strategy == 'only_first':
+            assert len(ids) > num_tokens_to_remove
+            window_len = min(len(ids), stride + num_tokens_to_remove)
+            overflowing_tokens = ids[-window_len:]
+            ids = ids[:-num_tokens_to_remove]
+        elif truncation_strategy == 'only_second':
+            assert pair_ids is not None and len(pair_ids) > num_tokens_to_remove
+            window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+            overflowing_tokens = pair_ids[-window_len:]
+            pair_ids = pair_ids[:-num_tokens_to_remove]
+        elif truncation_strategy == 'do_not_truncate':
+            raise ValueError("Input sequence is too long for max_length. Please select a truncation strategy.")
+        else:
+            raise ValueError("Truncation_strategy should be selected in ['longest_first', 'only_first', 'only_second', 'do_not_truncate']")
+        return (ids, pair_ids, overflowing_tokens)
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        logger.warning("This tokenizer does not make use of special tokens.")
+        if token_ids_1 is None:
+            return len(token_ids_0) * [0]
+        return [0] * len(token_ids_0) + [1] * len(token_ids_1)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens. This base implementation returns the input
+        unchanged; subclasses override it with their model-specific format (e.g. a RoBERTa sequence
+        is <s> X </s> for a single sequence and <s> A </s></s> B </s> for a pair).
+        """
+        logger.warning("This tokenizer does not make use of special tokens. Input is returned with no modification.")
+        if token_ids_1 is None:
+            return token_ids_0
+        return token_ids_0 + token_ids_1
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
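+
+            Example (base class; since no special tokens are ever added here, every position is 0)::
+
+                tokenizer.get_special_tokens_mask([5, 6], [7, 8])  # -> [0, 0, 0, 0] (placeholder ids)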
+ """ + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + def convert_ids_to_tokens(self, ids, skip_special_tokens=False): + """ Converts a single index or a sequence of indices (integers) in a token " + (resp.) a sequence of tokens (str/unicode), using the vocabulary and added tokens. + + Args: + skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False + """ + if isinstance(ids, int): + if ids in self.added_tokens_decoder: + return self.added_tokens_decoder[ids] + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self.added_tokens_decoder: + tokens.append(self.added_tokens_decoder[index]) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index): + raise NotImplementedError + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. + The most simple way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)) + but we often want to remove sub-word tokenization artifacts at the same time. + """ + return ' '.join(self.convert_ids_to_tokens(tokens)) + + def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + """ + Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary + with options to remove special tokens and clean up tokenization spaces. + Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``. + + Args: + token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods. + skip_special_tokens: if set to True, will replace special tokens. + clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces. + """ + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separatly for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(" " + token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + text = ''.join(sub_texts) + + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + @property + def special_tokens_map(self): + """ A dictionary mapping special token class attribute (cls_token, unk_token...) to their + values ('', ''...) + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, "_" + attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens(self): + """ List all the special tokens ('', ''...) mapped to class attributes + (cls_token, unk_token...). 
+ """ + all_toks = [] + set_attr = self.special_tokens_map + for attr_value in set_attr.values(): + all_toks = all_toks + (list(attr_value) if isinstance(attr_value, (list, tuple)) else [attr_value]) + all_toks = list(set(all_toks)) + return all_toks + + @property + def all_special_ids(self): + """ List the vocabulary indices of the special tokens ('', ''...) mapped to + class attributes (cls_token, unk_token...). + """ + all_toks = self.all_special_tokens + all_ids = list(self._convert_token_to_id(t) for t in all_toks) + return all_ids + + @staticmethod + def clean_up_tokenization(out_string): + """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms. + """ + out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ',' + ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't" + ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re") + return out_string diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlm.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlm.py new file mode 100644 index 0000000..d09ce6b --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlm.py @@ -0,0 +1,833 @@ +# coding=utf-8 +# Copyright 2019 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes for OpenAI GPT.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import json +import logging +import os +import re +import sys +import unicodedata +from io import open + +import sacremoses as sm + +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_bert import BasicTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = { + 'vocab_file': 'vocab.json', + 'merges_file': 'merges.txt', +} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-vocab.json", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-vocab.json", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-vocab.json", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-vocab.json", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-vocab.json", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-vocab.json", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-vocab.json", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-vocab.json", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-vocab.json", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-vocab.json", + }, + 'merges_file': + { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-merges.txt", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-merges.txt", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-merges.txt", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-merges.txt", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-merges.txt", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-merges.txt", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-merges.txt", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-merges.txt", + }, +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'xlm-mlm-en-2048': 512, + 'xlm-mlm-ende-1024': 512, + 'xlm-mlm-enfr-1024': 512, + 'xlm-mlm-enro-1024': 512, + 'xlm-mlm-tlm-xnli15-1024': 512, + 'xlm-mlm-xnli15-1024': 512, + 'xlm-clm-enfr-1024': 512, + 'xlm-clm-ende-1024': 512, + 'xlm-mlm-17-1280': 512, + 'xlm-mlm-100-1280': 512, +} + +PRETRAINED_INIT_CONFIGURATION = { + 'xlm-mlm-en-2048': {"do_lowercase_and_remove_accent": True}, + 'xlm-mlm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-mlm-enro-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": 
"ro"}, + "lang2id": { "en": 0, + "ro": 1 }}, + 'xlm-mlm-tlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-mlm-xnli15-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "ar", + "1": "bg", + "2": "de", + "3": "el", + "4": "en", + "5": "es", + "6": "fr", + "7": "hi", + "8": "ru", + "9": "sw", + "10": "th", + "11": "tr", + "12": "ur", + "13": "vi", + "14": "zh"}, + "lang2id": { "ar": 0, + "bg": 1, + "de": 2, + "el": 3, + "en": 4, + "es": 5, + "fr": 6, + "hi": 7, + "ru": 8, + "sw": 9, + "th": 10, + "tr": 11, + "ur": 12, + "vi": 13, + "zh": 14 }}, + 'xlm-clm-enfr-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "en", + "1": "fr"}, + "lang2id": { "en": 0, + "fr": 1 }}, + 'xlm-clm-ende-1024': { "do_lowercase_and_remove_accent": True, + "id2lang": { "0": "de", + "1": "en"}, + "lang2id": { "de": 0, + "en": 1 }}, + 'xlm-mlm-17-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "ar", + "1": "de", + "2": "en", + "3": "es", + "4": "fr", + "5": "hi", + "6": "it", + "7": "ja", + "8": "ko", + "9": "nl", + "10": "pl", + "11": "pt", + "12": "ru", + "13": "sv", + "14": "tr", + "15": "vi", + "16": "zh" + }, + "lang2id": { + "ar": 0, + "de": 1, + "en": 2, + "es": 3, + "fr": 4, + "hi": 5, + "it": 6, + "ja": 7, + "ko": 8, + "nl": 9, + "pl": 10, + "pt": 11, + "ru": 12, + "sv": 13, + "tr": 14, + "vi": 15, + "zh": 16}}, + 'xlm-mlm-100-1280': {"do_lowercase_and_remove_accent": False, + "id2lang": { + "0": "af", + "1": "als", + "2": "am", + "3": "an", + "4": "ang", + "5": "ar", + "6": "arz", + "7": "ast", + "8": "az", + "9": "bar", + "10": "be", + "11": "bg", + "12": "bn", + "13": "br", + "14": "bs", + "15": "ca", + "16": "ceb", + "17": "ckb", + "18": "cs", + "19": "cy", + "20": "da", + "21": "de", + "22": "el", + "23": "en", + "24": "eo", + "25": "es", + "26": "et", + "27": "eu", + "28": "fa", + "29": "fi", + "30": "fr", + "31": "fy", + "32": "ga", + "33": "gan", + "34": "gl", + "35": "gu", + "36": "he", + "37": "hi", + "38": "hr", + "39": "hu", + "40": "hy", + "41": "ia", + "42": "id", + "43": "is", + "44": "it", + "45": "ja", + "46": "jv", + "47": "ka", + "48": "kk", + "49": "kn", + "50": "ko", + "51": "ku", + "52": "la", + "53": "lb", + "54": "lt", + "55": "lv", + "56": "mk", + "57": "ml", + "58": "mn", + "59": "mr", + "60": "ms", + "61": "my", + "62": "nds", + "63": "ne", + "64": "nl", + "65": "nn", + "66": "no", + "67": "oc", + "68": "pl", + "69": "pt", + "70": "ro", + "71": "ru", + "72": "scn", + "73": "sco", + "74": "sh", + "75": "si", + "76": "simple", + "77": "sk", + "78": "sl", + "79": "sq", + "80": "sr", + "81": "sv", + "82": "sw", + "83": "ta", + "84": "te", + "85": "th", + "86": "tl", + "87": "tr", + "88": "tt", + "89": "uk", + "90": "ur", + "91": "uz", + "92": "vi", + "93": "war", + "94": "wuu", + "95": "yi", + "96": "zh", + "97": "zh_classical", + "98": "zh_min_nan", + "99": "zh_yue" + }, + "lang2id": { + "af": 0, + "als": 1, + "am": 2, + "an": 3, + "ang": 4, + "ar": 5, + "arz": 6, + "ast": 7, + "az": 8, + "bar": 9, + "be": 10, + "bg": 11, + "bn": 12, + "br": 13, + "bs": 14, + "ca": 15, + "ceb": 16, + "ckb": 17, + "cs": 18, + "cy": 19, 
+ "da": 20, + "de": 21, + "el": 22, + "en": 23, + "eo": 24, + "es": 25, + "et": 26, + "eu": 27, + "fa": 28, + "fi": 29, + "fr": 30, + "fy": 31, + "ga": 32, + "gan": 33, + "gl": 34, + "gu": 35, + "he": 36, + "hi": 37, + "hr": 38, + "hu": 39, + "hy": 40, + "ia": 41, + "id": 42, + "is": 43, + "it": 44, + "ja": 45, + "jv": 46, + "ka": 47, + "kk": 48, + "kn": 49, + "ko": 50, + "ku": 51, + "la": 52, + "lb": 53, + "lt": 54, + "lv": 55, + "mk": 56, + "ml": 57, + "mn": 58, + "mr": 59, + "ms": 60, + "my": 61, + "nds": 62, + "ne": 63, + "nl": 64, + "nn": 65, + "no": 66, + "oc": 67, + "pl": 68, + "pt": 69, + "ro": 70, + "ru": 71, + "scn": 72, + "sco": 73, + "sh": 74, + "si": 75, + "simple": 76, + "sk": 77, + "sl": 78, + "sq": 79, + "sr": 80, + "sv": 81, + "sw": 82, + "ta": 83, + "te": 84, + "th": 85, + "tl": 86, + "tr": 87, + "tt": 88, + "uk": 89, + "ur": 90, + "uz": 91, + "vi": 92, + "war": 93, + "wuu": 94, + "yi": 95, + "zh": 96, + "zh_classical": 97, + "zh_min_nan": 98, + "zh_yue": 99 + }}, +} + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + word is represented as tuple of symbols (symbols being variable-length strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def lowercase_and_remove_accent(text): + """ + Lowercase and strips accents from a piece of text based on + https://github.com/facebookresearch/XLM/blob/master/tools/lowercase_and_remove_accent.py + """ + text = ' '.join(text) + text = text.lower() + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output).lower().split(' ') + + +def replace_unicode_punct(text): + ''' + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/replace-unicode-punctuation.perl + ''' + text = text.replace(',', ',') + text = re.sub(r'。\s*', '. ', text) + text = text.replace('、', ',') + text = text.replace('”', '"') + text = text.replace('“', '"') + text = text.replace('∶', ':') + text = text.replace(':', ':') + text = text.replace('?', '?') + text = text.replace('《', '"') + text = text.replace('》', '"') + text = text.replace(')', ')') + text = text.replace('!', '!') + text = text.replace('(', '(') + text = text.replace(';', ';') + text = text.replace('1', '"') + text = text.replace('」', '"') + text = text.replace('「', '"') + text = text.replace('0', '0') + text = text.replace('3', '3') + text = text.replace('2', '2') + text = text.replace('5', '5') + text = text.replace('6', '6') + text = text.replace('9', '9') + text = text.replace('7', '7') + text = text.replace('8', '8') + text = text.replace('4', '4') + text = re.sub(r'.\s*', '. 
', text) + text = text.replace('~', '~') + text = text.replace('’', '\'') + text = text.replace('…', '...') + text = text.replace('━', '-') + text = text.replace('〈', '<') + text = text.replace('〉', '>') + text = text.replace('【', '[') + text = text.replace('】', ']') + text = text.replace('%', '%') + return text + + +def remove_non_printing_char(text): + ''' + Port of https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/remove-non-printing-char.perl + ''' + output = [] + for char in text: + cat = unicodedata.category(char) + if cat.startswith('C'): + continue + output.append(char) + return "".join(output) + + +def romanian_preprocessing(text): + '''Sennrich's WMT16 scripts for Romanian preprocessing, used by model `xlm-mlm-enro-1024`''' + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/normalise-romanian.py + text = text.replace("\u015e", "\u0218").replace("\u015f", "\u0219") + text = text.replace("\u0162", "\u021a").replace("\u0163", "\u021b") + # https://github.com/rsennrich/wmt16-scripts/blob/master/preprocess/remove-diacritics.py + text = text.replace("\u0218", "S").replace("\u0219", "s") #s-comma + text = text.replace("\u021a", "T").replace("\u021b", "t") #t-comma + text = text.replace("\u0102", "A").replace("\u0103", "a") + text = text.replace("\u00C2", "A").replace("\u00E2", "a") + text = text.replace("\u00CE", "I").replace("\u00EE", "i") + return text + + +class XLMTokenizer(PreTrainedTokenizer): + """ + BPE tokenizer for XLM + + - Moses preprocessing & tokenization for most supported languages + + - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP) + + - (optionally) lower case & normalize all inputs text + + - argument ``special_tokens`` and function ``set_special_tokens``, can be used to add additional symbols \ + (ex: "__classify__") to a vocabulary + + - `lang2id` attribute maps the languages supported by the model with their ids if provided (automatically set for pretrained vocabularies) + + - `id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies) + + - `do_lowercase_and_remove_accent` controle lower casing and accent (automatically set for pretrained vocabularies) + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, merges_file, unk_token="", bos_token="", + sep_token="", pad_token="", cls_token="", + mask_token="", additional_special_tokens=["", + "", "", "", "", "", + "", "", "", ""], + lang2id=None, id2lang=None, do_lowercase_and_remove_accent=True, + **kwargs): + super(XLMTokenizer, self).__init__(unk_token=unk_token, bos_token=bos_token, + sep_token=sep_token, pad_token=pad_token, + cls_token=cls_token, mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs) + + # cache of sm.MosesPunctNormalizer instance + self.cache_moses_punct_normalizer = dict() + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = dict() + self.lang_with_custom_tokenizer = set(['zh', 'th', 'ja']) + # True for current supported model (v1.2.0), False for XLM-17 & 100 + self.do_lowercase_and_remove_accent = do_lowercase_and_remove_accent + self.lang2id = lang2id + self.id2lang = id2lang + if lang2id is not None and id2lang is not None: + assert len(lang2id) == len(id2lang) + + self.ja_word_tokenizer = None + 
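+        # These language-specific segmenters are optional dependencies, so they are only
+        # instantiated lazily on first use (e.g. KyTea is built in ja_tokenize below).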
+        self.zh_word_tokenizer = None
+
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        merges = open(merges_file, encoding='utf-8').read().split('\n')[:-1]
+        merges = [tuple(merge.split()[:2]) for merge in merges]
+        self.bpe_ranks = dict(zip(merges, range(len(merges))))
+        self.cache = {}
+
+    def moses_punct_norm(self, text, lang):
+        if lang not in self.cache_moses_punct_normalizer:
+            punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
+            self.cache_moses_punct_normalizer[lang] = punct_normalizer
+        else:
+            punct_normalizer = self.cache_moses_punct_normalizer[lang]
+        return punct_normalizer.normalize(text)
+
+    def moses_tokenize(self, text, lang):
+        if lang not in self.cache_moses_tokenizer:
+            moses_tokenizer = sm.MosesTokenizer(lang=lang)
+            self.cache_moses_tokenizer[lang] = moses_tokenizer
+        else:
+            moses_tokenizer = self.cache_moses_tokenizer[lang]
+        return moses_tokenizer.tokenize(text, return_str=False, escape=False)
+
+    def moses_pipeline(self, text, lang):
+        text = replace_unicode_punct(text)
+        text = self.moses_punct_norm(text, lang)
+        text = remove_non_printing_char(text)
+        return text
+
+    def ja_tokenize(self, text):
+        if self.ja_word_tokenizer is None:
+            try:
+                import Mykytea
+                self.ja_word_tokenizer = Mykytea.Mykytea('-model %s/local/share/kytea/model.bin' % os.path.expanduser('~'))
+            except (AttributeError, ImportError) as e:
+                logger.error("Make sure you install KyTea (https://github.com/neubig/kytea) and its Python wrapper (https://github.com/chezou/Mykytea-python) with the following steps")
+                logger.error("1. git clone git@github.com:neubig/kytea.git && cd kytea")
+                logger.error("2. autoreconf -i")
+                logger.error("3. ./configure --prefix=$HOME/local")
+                logger.error("4. make && make install")
+                logger.error("5. pip install kytea")
+                raise e
+        return list(self.ja_word_tokenizer.getWS(text))
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + '</w>',)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token + '</w>'
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        if word == '\n  </w>':
+            word = '\n</w>'
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text, lang='en', bypass_tokenizer=False):
+        """
+        Tokenize a string given a language code. For Chinese, Japanese and Thai, we use a language-specific tokenizer. Otherwise, we use Moses.
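+
+        Example (a sketch; the exact BPE pieces depend on the merges file)::
+
+            tokenizer._tokenize("Ceci est une phrase.", lang='fr')
+            # -> Moses-normalized, Moses-tokenized, lower-cased, then split into BPE units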
+
+        Details of tokenization:
+            - [sacremoses](https://github.com/alvations/sacremoses): port of Moses
+                - Install with `pip install sacremoses`
+            - [pythainlp](https://github.com/PyThaiNLP/pythainlp): Thai tokenizer
+                - Install with `pip install pythainlp`
+            - [kytea](https://github.com/chezou/Mykytea-python): Japanese tokenizer, wrapper of [KyTea](https://github.com/neubig/kytea)
+                - Install with the following steps:
+                ```
+                git clone git@github.com:neubig/kytea.git && cd kytea
+                autoreconf -i
+                ./configure --prefix=$HOME/local
+                make && make install
+                pip install kytea
+                ```
+            - [jieba](https://github.com/fxsjy/jieba): Chinese tokenizer *
+                - Install with `pip install jieba`
+
+        \* The original XLM used [Stanford Segmenter](https://nlp.stanford.edu/software/stanford-segmenter-2018-10-16.zip).
+        However, the wrapper (`nltk.tokenize.stanford_segmenter`) is slow due to JVM overhead, and it will be deprecated.
+        Jieba is a lot faster and pip-installable. Note there is some mismatch with the Stanford Segmenter. It should be fine
+        if you fine-tune the model with Chinese supervision. If you want the exact same behaviour, use the original XLM
+        [preprocessing script](https://github.com/facebookresearch/XLM/tree/master/tools) to tokenize the sentence externally,
+        and set `bypass_tokenizer=True` to bypass the tokenizer.
+
+        Args:
+            - lang: ISO language code (default = 'en') (string). Languages should belong to the model's supported languages. However, we don't enforce it.
+            - bypass_tokenizer: Allow users to preprocess and tokenize the sentences externally (default = False) (bool). If True, we only apply BPE.
+
+        Returns:
+            List of tokens.
+        """
+        if lang and self.lang2id and lang not in self.lang2id:
+            logger.error("Supplied language code not found in lang2id mapping. Please check that your language is supported by the loaded pretrained model.")
+        if bypass_tokenizer:
+            text = text.split()
+        elif lang not in self.lang_with_custom_tokenizer:
+            text = self.moses_pipeline(text, lang=lang)
+            # TODO: make sure we are using `xlm-mlm-enro-1024`, since XLM-100 doesn't have this step
+            if lang == 'ro':
+                text = romanian_preprocessing(text)
+            text = self.moses_tokenize(text, lang=lang)
+        elif lang == 'th':
+            text = self.moses_pipeline(text, lang=lang)
+            try:
+                if 'pythainlp' not in sys.modules:
+                    from pythainlp.tokenize import word_tokenize as th_word_tokenize
+                else:
+                    th_word_tokenize = sys.modules['pythainlp'].word_tokenize
+            except (AttributeError, ImportError) as e:
+                logger.error("Make sure you install PyThaiNLP (https://github.com/PyThaiNLP/pythainlp) with the following steps")
+                logger.error("1. pip install pythainlp")
+                raise e
+            text = th_word_tokenize(text)
+        elif lang == 'zh':
+            try:
+                if 'jieba' not in sys.modules:
+                    import jieba
+                else:
+                    jieba = sys.modules['jieba']
+            except (AttributeError, ImportError) as e:
+                logger.error("Make sure you install Jieba (https://github.com/fxsjy/jieba) with the following steps")
+                logger.error("1. pip install jieba")
+                raise e
+            text = ' '.join(jieba.cut(text))
+            text = self.moses_pipeline(text, lang=lang)
+            text = text.split()
+        elif lang == 'ja':
+            text = self.moses_pipeline(text, lang=lang)
+            text = self.ja_tokenize(text)
+        else:
+            raise ValueError('It should not reach here')
+
+        if self.do_lowercase_and_remove_accent and not bypass_tokenizer:
+            text = lowercase_and_remove_accent(text)
+
+        split_tokens = []
+        for token in text:
+            if token:
+                split_tokens.extend([t for t in self.bpe(token).split(' ')])
+
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) into an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) into a token (string/unicode) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (strings) into a single string. """
+        out_string = ''.join(tokens).replace('</w>', ' ').strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        An XLM sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s> B </s>
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formatted with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLM sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory): + """Save the tokenizer vocabulary and merge files to a directory.""" + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file']) + + with open(vocab_file, 'w', encoding='utf-8') as f: + f.write(json.dumps(self.encoder, ensure_ascii=False)) + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!".format(merge_file)) + index = token_index + writer.write(' '.join(bpe_tokens) + u'\n') + index += 1 + + return vocab_file, merge_file diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlnet.py b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlnet.py new file mode 100644 index 0000000..deae8de --- /dev/null +++ b/baselines/models_pytorch/classifier_pytorch/transformers/tokenization_xlnet.py @@ -0,0 +1,253 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization classes for XLNet model.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os +from shutil import copyfile + +import unicodedata +import six + +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'xlnet-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-spiece.model", + 'xlnet-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-large-cased-spiece.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'xlnet-base-cased': None, + 'xlnet-large-cased': None, +} + +SPIECE_UNDERLINE = u'▁' + +# Segments (not really needed) +SEG_ID_A = 0 +SEG_ID_B = 1 +SEG_ID_CLS = 2 +SEG_ID_SEP = 3 +SEG_ID_PAD = 4 + +class XLNetTokenizer(PreTrainedTokenizer): + """ + SentencePiece based tokenizer. 
Peculiarities:
+
+        - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file,
+                 do_lower_case=False, remove_space=True, keep_accents=False,
+                 bos_token="<s>", eos_token="</s>", unk_token="<unk>", sep_token="<sep>",
+                 pad_token="<pad>", cls_token="<cls>", mask_token="<mask>",
+                 additional_special_tokens=["<eop>", "<eod>"], **kwargs):
+        super(XLNetTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token,
+                                             unk_token=unk_token, sep_token=sep_token,
+                                             pad_token=pad_token, cls_token=cls_token,
+                                             mask_token=mask_token, additional_special_tokens=
+                                             additional_special_tokens, **kwargs)
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece "
+                           "pip install sentencepiece")
+
+        self.do_lower_case = do_lower_case
+        self.remove_space = remove_space
+        self.keep_accents = keep_accents
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece "
+                           "pip install sentencepiece")
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def preprocess_text(self, inputs):
+        if self.remove_space:
+            outputs = ' '.join(inputs.strip().split())
+        else:
+            outputs = inputs
+        outputs = outputs.replace("``", '"').replace("''", '"')
+
+        if six.PY2 and isinstance(outputs, str):
+            outputs = outputs.decode('utf-8')
+
+        if not self.keep_accents:
+            outputs = unicodedata.normalize('NFKD', outputs)
+            outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
+        if self.do_lower_case:
+            outputs = outputs.lower()
+
+        return outputs
+
+    def _tokenize(self, text, return_unicode=True, sample=False):
+        """ Tokenize a string.
+            return_unicode is used only for py2
+        """
+        text = self.preprocess_text(text)
+        # note(zhiliny): in some systems, sentencepiece only accepts str for py2
+        if six.PY2 and isinstance(text, unicode):
+            text = text.encode('utf-8')
+
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        new_pieces = []
+        for piece in pieces:
+            if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
+                cur_pieces = self.sp_model.EncodeAsPieces(
+                    piece[:-1].replace(SPIECE_UNDERLINE, ''))
+                if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
+                    if len(cur_pieces[0]) == 1:
+                        cur_pieces = cur_pieces[1:]
+                    else:
+                        cur_pieces[0] = cur_pieces[0][1:]
+                cur_pieces.append(piece[-1])
+                new_pieces.extend(cur_pieces)
+            else:
+                new_pieces.append(piece)
+
+        # note(zhiliny): convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in new_pieces:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            new_pieces = ret_pieces
+
+        return new_pieces
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) to an id using the vocab. """
+        return self.sp_model.PieceToId(token)
+
+    def _convert_id_to_token(self, index, return_unicode=True):
+        """Converts an index (integer) to a token (string/unicode) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) to a single string."""
+        out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks
+        by concatenating and adding special tokens.
+        An XLNet sequence has the following format:
+            single sequence: X <sep> <cls>
+            pair of sequences: A <sep> B <sep> <cls>
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return token_ids_0 + sep + cls
+        return token_ids_0 + sep + token_ids_1 + sep + cls
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formatted with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1, 1]
+        return ([0] * len(token_ids_0)) + [1, 1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        An XLNet sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 2
+        | first sequence    | second sequence     | CLS segment ID
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        cls_segment_id = [2]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep + cls) * [0]
+        return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + cls_segment_id
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,) diff --git a/baselines/models_pytorch/classifier_pytorch/transformers/convert_tf_checkpoint_to_pytorch.py b/baselines/models_pytorch/mrc_pytorch/convert_tf_checkpoint_to_pytorch.py
diff --git a/baselines/models_pytorch/mrc_pytorch/convert_tf_checkpoint_to_pytorch.py b/baselines/models_pytorch/mrc_pytorch/convert_tf_checkpoint_to_pytorch.py new file mode 100755 index 0000000..b0dc278 --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/convert_tf_checkpoint_to_pytorch.py @@ -0,0 +1,124 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert BERT checkpoint.""" + +from __future__ import print_function + +import argparse +import os +import re + +import numpy as np +import tensorflow as tf +import torch + +from .pytorch_modeling import BertConfig, BertForPreTraining, ALBertConfig, ALBertForPreTraining + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path, is_albert): + config_path = os.path.abspath(bert_config_file) + tf_path = os.path.abspath(tf_checkpoint_path) + print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + print("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + # Initialise PyTorch model + if is_albert: + config = ALBertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = ALBertForPreTraining(config) + else: + config = BertConfig.from_json_file(bert_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = BertForPreTraining(config) + + for name, array in zip(names, arrays): + name = name.split('/') + if name[0] == 'global_step': + continue + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m"] for n in name): + print("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + else: + pointer = getattr(pointer, l[0]) + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name[-13:] == '_embeddings_2': + pointer = getattr(pointer, 'weight') + array = np.transpose(array) + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + print("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default='check_points/pretrain_models/albert_large_zh/albert_model.ckpt', + type=str, + help="Path the TensorFlow checkpoint path.") + parser.add_argument("--bert_config_file", + default='check_points/pretrain_models/albert_large_zh/albert_config_large.json', + type=str, + help="The config json file corresponding to the pre-trained BERT model. 
\n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default='check_points/pretrain_models/albert_large_zh/pytorch_albert_model.pth', + type=str, + help="Path to the output PyTorch model.") + parser.add_argument("--is_albert", + default=True, + type=bool, + help="whether is albert?") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.bert_config_file, + args.pytorch_dump_path, + args.is_albert) diff --git a/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_output.py b/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_output.py new file mode 100644 index 0000000..76b60ff --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_output.py @@ -0,0 +1,469 @@ +import collections +import json +import math + +from tqdm import tqdm + +from ..tools.offical_tokenization import BasicTokenizer + + +def write_predictions_topk(config, all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file): + """Write final predictions to the json file and log-odds of null if needed.""" + print("Writing predictions to: %s" % (output_prediction_file)) + print("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature['example_index']].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + + for (example_index, example) in enumerate(tqdm(all_examples)): + features = example_index_to_features[example_index] + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature['unique_id']] + for i in range(config.start_n_top): + for j in range(config.end_n_top): + start_logit = result.start_top_logits[i] + start_index = result.start_top_index[i] + + j_index = i * config.end_n_top + j + + end_logit = result.end_top_logits[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature['tokens']): + continue + if end_index >= len(feature['tokens']): + continue + if not feature['token_is_max_context'].get(str(start_index), False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=start_logit, + end_logit=end_logit)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + # ipdb.set_trace() + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature['tokens'][pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature['token_to_orig_map'][str(pred.start_index)] + orig_doc_end = feature['token_to_orig_map'][str(pred.end_index)] + orig_tokens = example['ori_doc_tokens'][orig_doc_start:(orig_doc_end + 1)] + tok_text = "".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = "".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+        if not nbest:
+            nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        assert len(nbest) >= 1
+
+        total_scores = []
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = float(probs[i])
+            output["start_logit"] = float(entry.start_logit)
+            output["end_logit"] = float(entry.end_logit)
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+
+        all_predictions[example['qid']] = nbest_json[0]["text"]
+        all_nbest_json[example['qid']] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n")
+
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, version_2_with_negative=False, null_score_diff_threshold=0.):
+    """Write final predictions to the json file and log-odds of null if needed."""
+    print("Writing predictions to: %s" % (output_prediction_file))
+    print("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature['example_index']].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(tqdm(all_examples)):
+        features = example_index_to_features[example_index]
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature['unique_id']]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+ if start_index >= len(feature['tokens']): + continue + if end_index >= len(feature['tokens']): + continue + if str(start_index) not in feature['token_to_orig_map'] and \ + start_index not in feature['token_to_orig_map']: + continue + if str(end_index) not in feature['token_to_orig_map'] and \ + end_index not in feature['token_to_orig_map']: + continue + if not feature['token_is_max_context'].get(str(start_index), False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature['tokens'][pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature['token_to_orig_map'][str(pred.start_index)] + orig_doc_end = feature['token_to_orig_map'][str(pred.end_index)] + orig_tokens = example['ori_doc_tokens'][orig_doc_start:(orig_doc_end + 1)] + tok_text = "".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = "".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = float(probs[i]) + output["start_logit"] = float(entry.start_logit) + output["end_logit"] = float(entry.end_logit) + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example['qid']] = nbest_json[0]["text"] + all_nbest_json[example['qid']] = nbest_json + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) + scores_diff_json[example['qid']] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example['qid']] = "" + else: + all_predictions[example['qid']] = best_non_null_entry.text + all_nbest_json[example['qid']] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
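+    # For reference, _strip_spaces("a b") above returns ("ab", {0: 0, 1: 2}):
+    # the non-space text plus a map from stripped positions back to original positions.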
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = "".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            print("Length not equal after stripping spaces: '%s' vs '%s'" % (orig_ns_text, tok_ns_text))
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            print("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            print("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs diff --git a/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_preprocess.py b/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_preprocess.py new file mode 100644 index 0000000..e5c2838 --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/preprocess/DRCD_preprocess.py @@ -0,0 +1,345 @@
+import collections
+import copy
+import json
+import os
+
+from tqdm import tqdm
+
+from ..tools.langconv import Converter
+
+SPIECE_UNDERLINE = '▁'
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+
+    # The SQuAD annotations are character based. We first project them to
+    # whitespace-tokenized words. But then after WordPiece tokenization, we can
+    # often find a "better match". For example:
+    #
+    #   Question: What year was John Smith born?
+    #   Context: The leader was John Smith (1895-1943).
+    #   Answer: 1895
+    #
+    # The original whitespace-tokenized answer will be "(1895-1943).". However
+    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+    # the exact answer, 1895.
+    #
+    # However, this is not always possible. Consider the following:
+    #
+    #   Question: What country is the top exporter of electronics?
+    #   Context: The Japanese electronics industry is the largest in the world.
+    #   Answer: Japan
+    #
+    # In this case, the annotator chose "Japan" as a character sub-span of
+    # the word "Japanese". Since our WordPiece tokenizer does not split
+    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+    # in SQuAD, but does happen.
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):
+            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+
+    return (input_start, input_end)
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Check if this is the 'max context' doc span for the token."""
+
+    # Because of the sliding window approach taken to scoring documents, a single
+    # token can appear in multiple documents. E.g.
+    #  Doc: the man went to the store and bought a gallon of milk
+    #  Span A: the man went to the
+    #  Span B: to the store and bought
+    #  Span C: and bought a gallon of
+    #  ...
+    #
+    # Now the word 'bought' will have two scores from spans B and C. We only
+    # want to consider the score with "maximum context", which we define as
+    # the *minimum* of its left and right context (the *sum* of left and
+    # right context will always be the same, of course).
+    #
+    # In the example the maximum context for 'bought' would be span C since
+    # it has 1 left context and 3 right context, while span B has 4 left context
+    # and 0 right context.
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length
+        if best_score is None or score > best_score:
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+def Traditional2Simplified(sentence):
+    '''
+    Convert Traditional Chinese characters in the sentence to Simplified Chinese.
+    :param sentence: the sentence to convert
+    :return: the sentence with Traditional characters converted to Simplified
+    '''
+    sentence = Converter('zh-hans').convert(sentence)
+    return sentence
+
+
+def json2features(input_file, output_files, tokenizer, is_training=False, max_query_length=64,
+                  max_seq_length=512, doc_stride=128):
+    with open(input_file, 'r') as f:
+        train_data = json.load(f)
+        train_data = train_data['data']
+
+    def _is_chinese_char(cp):
+        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
+                (cp >= 0x3400 and cp <= 0x4DBF) or  #
+                (cp >= 0x20000 and cp <= 0x2A6DF) or  #
+                (cp >= 0x2A700 and cp <= 0x2B73F) or  #
+                (cp >= 0x2B740 and cp <= 0x2B81F) or  #
+                (cp >= 0x2B820 and cp <= 0x2CEAF) or
+                (cp >= 0xF900 and cp <= 0xFAFF) or  #
+                (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
+            return True
+
+        return False
+
+    def is_fuhao(c):
+        # True if c is a (mostly CJK) punctuation mark
+        return c in {'。', ',', '!', '?', ';', '、', ':', '(', ')', '-', '~', '「', '《', '》',
+                     ',', '」', '"', '“', '”', '$', '『', '』', '—', ';', '(', ')', '-', '~',
+                     '‘', '’', '─', ':'}
+
+    def _tokenize_chinese_chars(text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if _is_chinese_char(cp) or is_fuhao(char):
+                if len(output) > 0 and output[-1] != SPIECE_UNDERLINE:
+                    output.append(SPIECE_UNDERLINE)
+                output.append(char)
+                output.append(SPIECE_UNDERLINE)
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F or c == SPIECE_UNDERLINE:
+            return True
+        return False
+
+    # to examples
+    examples = []
+    mis_match = 0
+    for article in tqdm(train_data):
+        for para in article['paragraphs']:
+            context = copy.deepcopy(para['context'])
+            # convert Traditional Chinese to Simplified
+            context = Traditional2Simplified(context)
+            # add whitespace around Chinese characters in the context
+            context_chs = _tokenize_chinese_chars(context)
+            context_fhs = _tokenize_chinese_chars(para['context'])
+
+            doc_tokens = []
+            ori_doc_tokens = []
+            char_to_word_offset = []
+            prev_is_whitespace = True
+
+            for ic, c in enumerate(context_chs):
+                if is_whitespace(c):
+                    prev_is_whitespace = True
+                else:
+                    if prev_is_whitespace:
+                        doc_tokens.append(c)
+                        ori_doc_tokens.append(context_fhs[ic])
+                    else:
+                        doc_tokens[-1] += c
+                        ori_doc_tokens[-1] += context_fhs[ic]
+                    prev_is_whitespace = False
+                if c != SPIECE_UNDERLINE:
+                    char_to_word_offset.append(len(doc_tokens) - 1)
+
+            assert len(context_chs) == len(context_fhs)
+            for qas in para['qas']:
+                qid = qas['id']
+                ques_text = Traditional2Simplified(qas['question'])
+                ans_text = Traditional2Simplified(qas['answers'][0]['text'])
+                start_position_final = None
+                end_position_final = None
+
+                if is_training:
+                    start_position = qas['answers'][0]['answer_start']
+                    end_position = start_position + len(ans_text) - 1
+
+                    while context[start_position] == " " or context[start_position] == "\t" or \
+                            context[start_position] == "\r" or context[start_position] == "\n":
+                        start_position += 1
+
+                    start_position_final = char_to_word_offset[start_position]
+                    end_position_final = char_to_word_offset[end_position]
+
+                    if doc_tokens[start_position_final] in {"。", ",", ":", ":", ".", ","}:
+                        start_position_final += 1
+                    actual_text = "".join(doc_tokens[start_position_final:(end_position_final + 1)])
+                    cleaned_answer_text = "".join(whitespace_tokenize(ans_text))
+
+                    if actual_text != cleaned_answer_text:
+                        print(actual_text, 'vs.', cleaned_answer_text)
+                        mis_match += 1
+
+                examples.append({'doc_tokens': doc_tokens,
+                                 'ori_doc_tokens': ori_doc_tokens,
+                                 'orig_answer_text': context,
+                                 'qid': qid,
+                                 'question': ques_text,
+                                 'answer': ans_text,
+                                 'start_position': start_position_final,
+                                 'end_position': end_position_final})
+
+    print('examples num:', len(examples))
+    print('mis match:', mis_match)
+    os.makedirs('/'.join(output_files[0].split('/')[0:-1]), exist_ok=True)
+    json.dump(examples, open(output_files[0], 'w'))
+
+    # to features
+    features = []
+    unique_id = 1000000000
+    for (example_index, example) in enumerate(tqdm(examples)):
+        query_tokens = tokenizer.tokenize(example['question'])
+        if len(query_tokens) > max_query_length:
+            query_tokens = query_tokens[0:max_query_length]
+
+        tok_to_orig_index = []
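+        # tok_to_orig_index maps each subword position back to its original token index;
+        # orig_to_tok_index below maps each original token to its first subword.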
+        orig_to_tok_index = []
+        all_doc_tokens = []
+        for (i, token) in enumerate(example['doc_tokens']):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.tokenize(token)
+            for sub_token in sub_tokens:
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+
+        tok_start_position = None
+        tok_end_position = None
+        if is_training:
+            tok_start_position = orig_to_tok_index[example['start_position']]  # map the original token index to the subword index; this is the new start position
+            if example['end_position'] < len(example['doc_tokens']) - 1:
+                tok_end_position = orig_to_tok_index[example['end_position'] + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+                example['orig_answer_text'])
+
+        # The -3 accounts for [CLS], [SEP] and [SEP]
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+        doc_spans = []
+        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = len(all_doc_tokens) - start_offset
+            if length > max_tokens_for_doc:
+                length = max_tokens_for_doc
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            start_offset += min(length, doc_stride)
+
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            tokens = []
+            token_to_orig_map = {}
+            token_is_max_context = {}
+            segment_ids = []
+            tokens.append("[CLS]")
+            segment_ids.append(0)
+            for token in query_tokens:
+                tokens.append(token)
+                segment_ids.append(0)
+            tokens.append("[SEP]")
+            segment_ids.append(0)
+
+            for i in range(doc_span.length):
+                split_token_index = doc_span.start + i
+                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
+                token_is_max_context[len(tokens)] = is_max_context
+                tokens.append(all_doc_tokens[split_token_index])
+                segment_ids.append(1)
+            tokens.append("[SEP]")
+            segment_ids.append(1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            while len(input_ids) < max_seq_length:
+                input_ids.append(0)
+                input_mask.append(0)
+                segment_ids.append(0)
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            start_position = None
+            end_position = None
+            if is_training:
+                # For training, windows that do not contain the annotated answer
+                # are labeled with position 0 ([CLS]), which serves as the
+                # no-answer target.
+                if tok_start_position == -1 and tok_end_position == -1:
+                    start_position = 0  # the question has no answer; position 0 is the [CLS] slot
+                    end_position = 0
+                else:  # the example originally had an answer, so filter out windows that do not contain it
+                    out_of_span = False
+                    doc_start = doc_span.start  # start and end of this window, mapped back to the original text
+                    doc_end = doc_span.start + doc_span.length - 1
+
+                    if not (tok_start_position >= doc_start and tok_end_position <= doc_end):  # a window without the answer is kept as no-answer augmentation
+                        out_of_span = True
+                    if out_of_span:
+                        start_position = 0
+                        end_position = 0
+                    else:
+                        doc_offset = len(query_tokens) + 2
+                        start_position = tok_start_position - doc_start + doc_offset
+                        end_position = tok_end_position - doc_start + doc_offset
+
+            features.append({'unique_id': unique_id,
+                             'example_index': example_index,
+                             'doc_span_index': doc_span_index,
+                             'tokens': tokens,
+                             'token_to_orig_map': token_to_orig_map,
+                             'token_is_max_context': token_is_max_context,
+                             'input_ids': input_ids,
+                             'input_mask': input_mask,
+                             'segment_ids': segment_ids,
+                             'start_position': start_position,
+                             'end_position': end_position})
+            unique_id += 1
+
+    print('features num:', len(features))
+    json.dump(features, open(output_files[1], 'w')) diff --git a/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_evaluate.py b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_evaluate.py new file mode 100644 index 0000000..5bc96ae --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_evaluate.py @@ -0,0 +1,216 @@
+# -*- coding: utf-8 -*-
+'''
+Evaluation script for CMRC 2018
+version: v5 - special
+Note:
+v5 - special: Evaluate on SQuAD-style CMRC 2018 Datasets
+v5: formatted output, add usage description
+v4: fixed segmentation issues
+'''
+from __future__ import print_function
+
+import json
+import re
+from collections import OrderedDict
+
+import nltk
+
+
+# segment mixed Chinese and English text
+def mixed_segmentation(in_str, rm_punc=False):
+    in_str = str(in_str).lower().strip()
+    segs_out = []
+    temp_str = ""
+    sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
+               ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
+               '「', '」', '(', ')', '-', '~', '『', '』']
+    for char in in_str:
+        if rm_punc and char in sp_char:
+            continue
+        if re.search(r'[\u4e00-\u9fa5]', char) or char in sp_char:
+            if temp_str != "":
+                ss = nltk.word_tokenize(temp_str)
+                segs_out.extend(ss)
+                temp_str = ""
+            segs_out.append(char)
+        else:
+            temp_str += char
+
+    # handling last part
+    if temp_str != "":
+        ss = nltk.word_tokenize(temp_str)
+        segs_out.extend(ss)
+
+    return segs_out
+
+
+# remove punctuation
+def remove_punctuation(in_str):
+    in_str = str(in_str).lower().strip()
+    sp_char = ['-', ':', '_', '*', '^', '/', '\\', '~', '`', '+', '=',
+               ',', '。', ':', '?', '!', '“', '”', ';', '’', '《', '》', '……', '·', '、',
+               '「', '」', '(', ')', '-', '~', '『', '』']
+    out_segs = []
+    for char in in_str:
+        if char in sp_char:
+            continue
+        else:
+            out_segs.append(char)
+    return ''.join(out_segs)
+
+
+# find longest common substring
+def find_lcs(s1, s2):
+    m = [[0 for i in range(len(s2) + 1)] for j in range(len(s1) + 1)]
+    mmax = 0
+    p = 0
+    for i in range(len(s1)):
+        for j in range(len(s2)):
+            if s1[i] == s2[j]:
+                m[i + 1][j + 1] = m[i][j] + 1
+                if m[i + 1][j + 1] > mmax:
+                    mmax = m[i + 1][j + 1]
+                    p = i + 1
+    return s1[p - mmax:p], mmax
+
+
+def evaluate(ground_truth_file, prediction_file):
+    f1 = 0
+    em = 0
+    total_count = 0
+    skip_count = 0
+    for instance in ground_truth_file["data"]:
+        # context_id = instance['context_id'].strip()
+        # context_text = instance['context_text'].strip()
+        for para in instance["paragraphs"]:
+            for qas in
para['qas']: + total_count += 1 + query_id = qas['id'].strip() + query_text = qas['question'].strip() + answers = [x["text"] for x in qas['answers']] + + if query_id not in prediction_file: + print('Unanswered question: {}\n'.format(query_id)) + skip_count += 1 + continue + + prediction = str(prediction_file[query_id]) + f1 += calc_f1_score(answers, prediction) + em += calc_em_score(answers, prediction) + + f1_score = 100.0 * f1 / total_count + em_score = 100.0 * em / total_count + return f1_score, em_score, total_count, skip_count + + +def evaluate2(ground_truth_file, prediction_file): + f1 = 0 + em = 0 + total_count = 0 + skip_count = 0 + yes_count = 0 + yes_correct = 0 + no_count = 0 + no_correct = 0 + unk_count = 0 + unk_correct = 0 + + for instance in ground_truth_file["data"]: + for para in instance["paragraphs"]: + for qas in para['qas']: + total_count += 1 + query_id = qas['id'].strip() + if query_id not in prediction_file: + print('Unanswered question: {}\n'.format(query_id)) + skip_count += 1 + continue + + prediction = str(prediction_file[query_id]) + + if len(qas['answers']) == 0: + unk_count += 1 + answers = [""] + if prediction == "": + unk_correct += 1 + else: + answers = [] + for x in qas['answers']: + answers.append(x['text']) + if x['text'] == 'YES': + if prediction == 'YES': + yes_correct += 1 + yes_count += 1 + if x['text'] == 'NO': + if prediction == 'NO': + no_correct += 1 + no_count += 1 + + f1 += calc_f1_score(answers, prediction) + em += calc_em_score(answers, prediction) + + f1_score = 100.0 * f1 / total_count + em_score = 100.0 * em / total_count + yes_acc = 100.0 * yes_correct / yes_count + no_acc = 100.0 * no_correct / no_count + unk_acc = 100.0 * unk_correct / unk_count + return f1_score, em_score, yes_acc, no_acc, unk_acc, total_count, skip_count + + +def calc_f1_score(answers, prediction): + f1_scores = [] + for ans in answers: + ans_segs = mixed_segmentation(ans, rm_punc=True) + prediction_segs = mixed_segmentation(prediction, rm_punc=True) + lcs, lcs_len = find_lcs(ans_segs, prediction_segs) + if lcs_len == 0: + f1_scores.append(0) + continue + precision = 1.0 * lcs_len / len(prediction_segs) + recall = 1.0 * lcs_len / len(ans_segs) + f1 = (2 * precision * recall) / (precision + recall) + f1_scores.append(f1) + return max(f1_scores) + + +def calc_em_score(answers, prediction): + em = 0 + for ans in answers: + ans_ = remove_punctuation(ans) + prediction_ = remove_punctuation(prediction) + if ans_ == prediction_: + em = 1 + break + return em + + +def get_eval(original_file, prediction_file): + ground_truth_file = json.load(open(original_file, 'r')) + prediction_file = json.load(open(prediction_file, 'r')) + F1, EM, TOTAL, SKIP = evaluate(ground_truth_file, prediction_file) + AVG = (EM + F1) * 0.5 + output_result = OrderedDict() + output_result['AVERAGE'] = '%.3f' % AVG + output_result['F1'] = '%.3f' % F1 + output_result['EM'] = '%.3f' % EM + output_result['TOTAL'] = TOTAL + output_result['SKIP'] = SKIP + + return output_result + + +def get_eval_with_neg(original_file, prediction_file): + ground_truth_file = json.load(open(original_file, 'r')) + prediction_file = json.load(open(prediction_file, 'r')) + F1, EM, YES_ACC, NO_ACC, UNK_ACC, TOTAL, SKIP = evaluate2(ground_truth_file, prediction_file) + AVG = (EM + F1) * 0.5 + output_result = OrderedDict() + output_result['AVERAGE'] = '%.3f' % AVG + output_result['F1'] = '%.3f' % F1 + output_result['EM'] = '%.3f' % EM + output_result['YES'] = '%.3f' % YES_ACC + output_result['NO'] = '%.3f' % NO_ACC + 
output_result['UNK'] = '%.3f' % UNK_ACC + output_result['TOTAL'] = TOTAL + output_result['SKIP'] = SKIP + + return output_result diff --git a/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_output.py b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_output.py new file mode 100644 index 0000000..ea5091f --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_output.py @@ -0,0 +1,471 @@ +import collections +import json +import math + +from tqdm import tqdm + +from ..tools.offical_tokenization import BasicTokenizer + + +def write_predictions_topk(FLAGS, all_examples, all_features, all_results, n_best_size, + max_answer_length, output_prediction_file, output_nbest_file): + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + + """Write final predictions to the json file and log-odds of null if needed.""" + print("Writing predictions to: %s" % (output_prediction_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature['example_index']].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature['unique_id']] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(FLAGS.start_n_top): + for j in range(FLAGS.end_n_top): + start_log_prob = result.start_top_log_probs[i] + start_index = result.start_top_index[i] + + j_index = i * FLAGS.end_n_top + j + + end_log_prob = result.end_top_log_probs[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= feature['paragraph_len'] - 1: + continue + if end_index >= feature['paragraph_len'] - 1: + continue + + if not feature['token_is_max_context'].get(start_index, False) and \ + not feature['token_is_max_context'].get(str(start_index), False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + tok_start_to_orig_index = feature['tok_start_to_orig_index'] + tok_end_to_orig_index = feature['tok_end_to_orig_index'] + start_orig_pos = tok_start_to_orig_index[pred.start_index] + end_orig_pos = tok_end_to_orig_index[pred.end_index] + + paragraph_text = example['paragraph_text'] + final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append( + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) + + assert len(nbest_json) >= 1 + assert best_non_null_entry is not None + + score_diff = score_null + scores_diff_json[example['qas_id']] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example['qas_id']] = best_non_null_entry.text + + all_nbest_json[example['qas_id']] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n") + + +def write_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, version_2_with_negative=False, null_score_diff_threshold=0.): + """Write final predictions to the json file and log-odds of null if needed.""" + print("Writing predictions to: %s" % (output_prediction_file)) + print("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature['example_index']].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = 
result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(tqdm(all_examples)): + features = example_index_to_features[example_index] + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature['unique_id']] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= len(feature['tokens']): + continue + if end_index >= len(feature['tokens']): + continue + if str(start_index) not in feature['token_to_orig_map'] and \ + start_index not in feature['token_to_orig_map']: + continue + if str(end_index) not in feature['token_to_orig_map'] and \ + end_index not in feature['token_to_orig_map']: + continue + if not feature['token_is_max_context'].get(str(start_index), False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature['tokens'][pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature['token_to_orig_map'][str(pred.start_index)] + orig_doc_end = feature['token_to_orig_map'][str(pred.end_index)] + orig_tokens = example['doc_tokens'][orig_doc_start:(orig_doc_end + 1)] + tok_text = "".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. 
+ tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = "".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest) == 1: + nbest.insert(0, _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. + if not nbest: + nbest.append(_NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = float(probs[i]) + output["start_logit"] = float(entry.start_logit) + output["end_logit"] = float(entry.end_logit) + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example['qid']] = nbest_json[0]["text"] + all_nbest_json[example['qid']] = nbest_json + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - (best_non_null_entry.end_logit) + scores_diff_json[example['qid']] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example['qid']] = "" + else: + all_predictions[example['qid']] = best_non_null_entry.text + all_nbest_json[example['qid']] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4, ensure_ascii=False) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4, ensure_ascii=False) + "\n") + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". 
+ # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = "".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + print("Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + print("Length not equal after stripping spaces: '%s' vs '%s'" % (orig_ns_text, tok_ns_text)) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. 
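+    # [Editor's note] Illustrative walk-through of the alignment maps, assuming
+    # only whitespace differs between the two strings (values hypothetical):
+    #
+    #   orig_text = "Steve Smith's"   ->  orig_ns_text = "SteveSmith's"
+    #   orig_ns_to_s_map[5] == 6      # the 6th non-space char sits at index 6
+    #
+    # tok_ns_to_s_map is the same kind of map for tok_text; inverting it below
+    # (tok_s_to_ns_map) lets us go: position in tok_text -> position in the
+    # space-free string -> (via orig_ns_to_s_map) position in orig_text.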
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            print("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            print("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs
diff --git a/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_preprocess.py b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_preprocess.py
new file mode 100644
index 0000000..28ecbbe
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/preprocess/cmrc2018_preprocess.py
@@ -0,0 +1,362 @@
+import collections
+import json
+import os
+
+from tqdm import tqdm
+
+from ..tools import offical_tokenization as tokenization
+
+SPIECE_UNDERLINE = '▁'
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+    """Returns tokenized answer spans that better match the annotated answer."""
+
+    # The SQuAD annotations are character based. We first project them to
+    # whitespace-tokenized words. But then after WordPiece tokenization, we can
+    # often find a "better match". For example:
+    #
+    # Question: What year was John Smith born?
+    # Context: The leader was John Smith (1895-1943).
+    # Answer: 1895
+    #
+    # The original whitespace-tokenized answer will be "(1895-1943).". However
+    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+    # the exact answer, 1895.
+    #
+    # However, this is not always possible. Consider the following:
+    #
+    # Question: What country is the top exporter of electronics?
+    # Context: The Japanese electronics industry is the largest in the world.
+    # Answer: Japan
+    #
+    # In this case, the annotator chose "Japan" as a character sub-span of
+    # the word "Japanese". Since our WordPiece tokenizer does not split
+    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+    # in SQuAD, but does happen.
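+    # [Editor's note] Hedged, concrete sketch of the search below (an English
+    # example; the values are hypothetical, not taken from CMRC 2018):
+    #
+    #   doc_tokens (sub-tokens) = ["(", "1895", "-", "1943", ")", "."]
+    #   input_start, input_end  = 0, 5      # the whole whitespace token
+    #   orig_answer_text        = "1895"
+    #
+    # The nested loops return (1, 1), the tightest sub-span whose joined text
+    # equals the tokenized answer.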
+ tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + + # Because of the sliding window approach taken to scoring documents, a single + # token can appear in multiple documents. E.g. + # Doc: the man went to the store and bought a gallon of milk + # Span A: the man went to the + # Span B: to the store and bought + # Span C: and bought a gallon of + # ... + # + # Now the word 'bought' will have two scores from spans B and C. We only + # want to consider the score with "maximum context", which we define as + # the *minimum* of its left and right context (the *sum* of left and + # right context will always be the same, of course). + # + # In the example the maximum context for 'bought' would be span C since + # it has 1 left context and 3 right context, while span B has 4 left context + # and 0 right context. + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def json2features(input_file, output_files, tokenizer, is_training=False, repeat_limit=3, max_query_length=64, + max_seq_length=512, doc_stride=128): + with open(input_file, 'r') as f: + train_data = json.load(f) + train_data = train_data['data'] + + def _is_chinese_char(cp): + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def is_fuhao(c): + if c == '。' or c == ',' or c == '!' or c == '?' 
or c == ';' or c == '、' or c == ':' or c == '(' or c == ')' \ + or c == '-' or c == '~' or c == '「' or c == '《' or c == '》' or c == ',' or c == '」' or c == '"' or c == '“' or c == '”' \ + or c == '$' or c == '『' or c == '』' or c == '—' or c == ';' or c == '。' or c == '(' or c == ')' or c == '-' or c == '~' or c == '。' \ + or c == '‘' or c == '’': + return True + return False + + def _tokenize_chinese_chars(text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if _is_chinese_char(cp) or is_fuhao(char): + if len(output) > 0 and output[-1] != SPIECE_UNDERLINE: + output.append(SPIECE_UNDERLINE) + output.append(char) + output.append(SPIECE_UNDERLINE) + else: + output.append(char) + return "".join(output) + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F or c == SPIECE_UNDERLINE: + return True + return False + + # to examples + examples = [] + mis_match = 0 + for article in tqdm(train_data): + for para in article['paragraphs']: + context = para['context'] + context_chs = _tokenize_chinese_chars(context) + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in context_chs: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + if c != SPIECE_UNDERLINE: + char_to_word_offset.append(len(doc_tokens) - 1) + + for qas in para['qas']: + qid = qas['id'] + ques_text = qas['question'] + ans_text = qas['answers'][0]['text'] + + start_position_final = None + end_position_final = None + if is_training: + count_i = 0 + start_position = qas['answers'][0]['answer_start'] + + end_position = start_position + len(ans_text) - 1 + while context[start_position:end_position + 1] != ans_text and count_i < repeat_limit: + start_position -= 1 + end_position -= 1 + count_i += 1 + + while context[start_position] == " " or context[start_position] == "\t" or \ + context[start_position] == "\r" or context[start_position] == "\n": + start_position += 1 + + start_position_final = char_to_word_offset[start_position] + end_position_final = char_to_word_offset[end_position] + + if doc_tokens[start_position_final] in {"。", ",", ":", ":", ".", ","}: + start_position_final += 1 + + actual_text = "".join(doc_tokens[start_position_final:(end_position_final + 1)]) + cleaned_answer_text = "".join(tokenization.whitespace_tokenize(ans_text)) + + if actual_text != cleaned_answer_text: + print(actual_text, 'V.S', cleaned_answer_text) + mis_match += 1 + # ipdb.set_trace() + + examples.append({'doc_tokens': doc_tokens, + 'orig_answer_text': context, + 'qid': qid, + 'question': ques_text, + 'answer': ans_text, + 'start_position': start_position_final, + 'end_position': end_position_final}) + + print('examples num:', len(examples)) + print('mis_match:', mis_match) + os.makedirs('/'.join(output_files[0].split('/')[0:-1]), exist_ok=True) + json.dump(examples, open(output_files[0], 'w')) + + # to features + features = [] + unique_id = 1000000000 + for (example_index, example) in enumerate(tqdm(examples)): + query_tokens = tokenizer.tokenize(example['question']) + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example['doc_tokens']): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + 
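+                # [Editor's note] These two lists are the token alignment used later to
+                # map predictions back to the original text (illustrative, hypothetical
+                # values):
+                #
+                #   doc_tokens        = ["1895", "年"]
+                #   all_doc_tokens    = ["18", "##95", "年"]
+                #   orig_to_tok_index = [0, 2]     # first sub-token of each original token
+                #   tok_to_orig_index = [0, 0, 1]  # original token owning each sub-token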
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+
+        tok_start_position = None
+        tok_end_position = None
+        if is_training:
+            tok_start_position = orig_to_tok_index[example['start_position']]  # map from original tokens to sub-tokens; this is the new start index
+            if example['end_position'] < len(example['doc_tokens']) - 1:
+                tok_end_position = orig_to_tok_index[example['end_position'] + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+                example['orig_answer_text'])
+
+        # The -3 accounts for [CLS], [SEP] and [SEP]
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+        doc_spans = []
+        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = len(all_doc_tokens) - start_offset
+            if length > max_tokens_for_doc:
+                length = max_tokens_for_doc
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            start_offset += min(length, doc_stride)
+
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            tokens = []
+            token_to_orig_map = {}
+            token_is_max_context = {}
+            segment_ids = []
+            tokens.append("[CLS]")
+            segment_ids.append(0)
+            for token in query_tokens:
+                tokens.append(token)
+                segment_ids.append(0)
+            tokens.append("[SEP]")
+            segment_ids.append(0)
+
+            for i in range(doc_span.length):
+                split_token_index = doc_span.start + i
+                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index, split_token_index)
+                token_is_max_context[len(tokens)] = is_max_context
+                tokens.append(all_doc_tokens[split_token_index])
+                segment_ids.append(1)
+            tokens.append("[SEP]")
+            segment_ids.append(1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            while len(input_ids) < max_seq_length:
+                input_ids.append(0)
+                input_mask.append(0)
+                segment_ids.append(0)
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            start_position = None
+            end_position = None
+            if is_training:
+                # For training, if our document chunk does not contain an annotation
+                # we throw it out, since there is nothing to predict.
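+                # [Editor's note] Sketch of the offset arithmetic below, with
+                # hypothetical numbers: if len(query_tokens) == 10, document tokens
+                # start at index 12 of `tokens` ([CLS] + 10 query tokens + [SEP]),
+                # so doc_offset == 12. An answer at sub-token positions 40..42 in a
+                # span with doc_start == 30 is labeled start = 40 - 30 + 12 = 22 and
+                # end = 42 - 30 + 12 = 24; windows that miss the answer get (0, 0),
+                # i.e. they point at [CLS] and double as no-answer examples.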
+                if tok_start_position == -1 and tok_end_position == -1:
+                    start_position = 0  # the question has no answer; position 0 is [CLS]
+                    end_position = 0
+                else:  # the example does have an answer, so drop features whose window misses it
+                    out_of_span = False
+                    doc_start = doc_span.start  # span start and end mapped back to the original text
+                    doc_end = doc_span.start + doc_span.length - 1
+
+                    if not (tok_start_position >= doc_start and tok_end_position <= doc_end):  # a window without the answer doubles as no-answer augmentation
+                        out_of_span = True
+                    if out_of_span:
+                        start_position = 0
+                        end_position = 0
+                    else:
+                        doc_offset = len(query_tokens) + 2
+                        start_position = tok_start_position - doc_start + doc_offset
+                        end_position = tok_end_position - doc_start + doc_offset
+
+            features.append({'unique_id': unique_id,
+                             'example_index': example_index,
+                             'doc_span_index': doc_span_index,
+                             'tokens': tokens,
+                             'token_to_orig_map': token_to_orig_map,
+                             'token_is_max_context': token_is_max_context,
+                             'input_ids': input_ids,
+                             'input_mask': input_mask,
+                             'segment_ids': segment_ids,
+                             'start_position': start_position,
+                             'end_position': end_position})
+            unique_id += 1
+
+    print('features num:', len(features))
+    json.dump(features, open(output_files[1], 'w'))
+
+
+def _convert_index(index, pos, M=None, is_start=True):
+    if pos >= len(index):
+        pos = len(index) - 1
+    if index[pos] is not None:
+        return index[pos]
+    N = len(index)
+    rear = pos
+    while rear < N - 1 and index[rear] is None:
+        rear += 1
+    front = pos
+    while front > 0 and index[front] is None:
+        front -= 1
+    assert index[front] is not None or index[rear] is not None
+    if index[front] is None:
+        if index[rear] >= 1:
+            if is_start:
+                return 0
+            else:
+                return index[rear] - 1
+        return index[rear]
+    if index[rear] is None:
+        if M is not None and index[front] < M - 1:
+            if is_start:
+                return index[front] + 1
+            else:
+                return M - 1
+        return index[front]
+    if is_start:
+        if index[rear] > index[front] + 1:
+            return index[front] + 1
+        else:
+            return index[rear]
+    else:
+        if index[rear] > index[front] + 1:
+            return index[rear] - 1
+        else:
+            return index[front]
diff --git a/baselines/models_pytorch/mrc_pytorch/pytorch_modeling.py b/baselines/models_pytorch/mrc_pytorch/pytorch_modeling.py
new file mode 100755
index 0000000..6191f7d
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/pytorch_modeling.py
@@ -0,0 +1,1205 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BERT model.""" +from __future__ import print_function + +import copy +import json +import logging +import math +import os +import shutil +import tarfile +import tempfile + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss + +from .tools.file_utils import cached_path + +logger = logging.getLogger(__name__) + +PRETRAINED_MODEL_ARCHIVE_MAP = { + 'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz", + 'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz", + 'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz", + 'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz", + 'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz", + 'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz", + 'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz", +} +CONFIG_NAME = 'bert_config.json' +WEIGHTS_NAME = 'pytorch_model.bin' + + +def gelu(x): + """Implementation of the gelu activation function. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +def swish(x): + return x * torch.sigmoid(x) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + + +class BertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +class ALBertConfig(object): + """Configuration class to store the configuration of a `BertModel`. + """ + + def __init__(self, + vocab_size_or_config_json_file, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + embedding_size=128, + ln_type="postln", + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02): + """Constructs BertConfig. + + Args: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu" and "swish" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). + type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `BertModel`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. 
+ """ + if isinstance(vocab_size_or_config_json_file, str): + with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif isinstance(vocab_size_or_config_json_file, int): + self.vocab_size = vocab_size_or_config_json_file + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.embedding_size = embedding_size + self.ln_type = ln_type + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + else: + raise ValueError("First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)") + + @classmethod + def from_dict(cls, json_object): + """Constructs a `BertConfig` from a Python dictionary of parameters.""" + config = BertConfig(vocab_size_or_config_json_file=-1) + for key, value in json_object.items(): + config.__dict__[key] = value + return config + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `BertConfig` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + return cls.from_dict(json.loads(text)) + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + +try: + from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm +except ImportError: + print("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.") + + class BertLayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-5): + """Construct a layernorm module in the TF style (epsilon inside the square root). + """ + super(BertLayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + u = x.mean(-1, keepdim=True) + s = (x - u).pow(2).mean(-1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.variance_epsilon) + return self.weight * x + self.bias + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings. 
+ """ + + def __init__(self, config): + super(BertEmbeddings, self).__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0) + # TODO:ROBERTA暂时存在一些问题,必须512才能加载一些模型,但是部分模型却不是用512长度训练的,要注意 + self.position_embeddings = nn.Embedding(512, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None): + seq_length = input_ids.size(1) + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + words_embeddings = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class ALBertEmbeddings(nn.Module): + """ ALBert embeddings. """ + + def __init__(self, config): + super(ALBertEmbeddings, self).__init__() + + # word_embeddings_2: project vector(output_middle) to the hidden space + if config.embedding_size == config.hidden_size: + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0) + self.word_embeddings_2 = None + else: + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0) + self.word_embeddings_2 = nn.Linear(config.embedding_size, config.hidden_size, bias=False) + + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, input_ids, token_type_ids=None, position_ids=None): + seq_length = input_ids.size(1) + if position_ids is None: + position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand_as(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + words_embeddings = self.word_embeddings(input_ids) + if self.word_embeddings_2: + words_embeddings = self.word_embeddings_2(words_embeddings) + position_embeddings = self.position_embeddings(position_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = words_embeddings + position_embeddings + token_type_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config): + super(BertSelfAttention, self).__init__() + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention " + "heads (%d)" % (config.hidden_size, config.num_attention_heads)) + self.num_attention_heads = 
config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward(self, hidden_states, attention_mask): + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super(BertSelfOutput, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.ln_type = 'postln' + if 'ln_type' in config.__dict__: + self.ln_type = config.ln_type + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + if self.ln_type == 'preln': + hidden_states = hidden_states + input_tensor + else: + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertAttention(nn.Module): + def __init__(self, config): + super(BertAttention, self).__init__() + self.self = BertSelfAttention(config) + self.output = BertSelfOutput(config) + self.ln_type = 'postln' + if 'ln_type' in config.__dict__: + self.ln_type = config.ln_type + + def forward(self, input_tensor, attention_mask): + if self.ln_type == 'preln': + hidden_state = self.output.LayerNorm(input_tensor) # pre_ln + self_output = self.self(hidden_state, attention_mask) + else: + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) + return attention_output + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super(BertIntermediate, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + self.intermediate_act_fn = 
ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super(BertOutput, self).__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.ln_type = 'postln' + if 'ln_type' in config.__dict__: + self.ln_type = config.ln_type + + def forward(self, hidden_states, input_tensor): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + if self.ln_type == 'preln': + hidden_states = hidden_states + input_tensor + else: + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super(BertLayer, self).__init__() + self.ln_type = 'postln' + if 'ln_type' in config.__dict__: + self.ln_type = config.ln_type + self.attention = BertAttention(config) + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward(self, hidden_states, attention_mask): + attention_output = self.attention(hidden_states, attention_mask) + if self.ln_type == 'preln': + attention_output_pre = self.output.LayerNorm(attention_output) + else: + attention_output_pre = attention_output + intermediate_output = self.intermediate(attention_output_pre) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super(BertEncoder, self).__init__() + layer = BertLayer(config) + self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + all_encoder_layers = [] + for layer_module in self.layer: + hidden_states = layer_module(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class ALBertEncoder(nn.Module): + def __init__(self, config): + super(ALBertEncoder, self).__init__() + self.num_hidden_layers = config.num_hidden_layers + self.layer_shared = BertLayer(config) + + def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True): + all_encoder_layers = [] + for i in range(self.num_hidden_layers): + hidden_states = self.layer_shared(hidden_states, attention_mask) + if output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + if not output_all_encoded_layers: + all_encoder_layers.append(hidden_states) + return all_encoder_layers + + +class BertPooler(nn.Module): + def __init__(self, config): + super(BertPooler, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states): + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. 
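+        # [Editor's note] Shape sketch (B = batch size, L = sequence length,
+        # H = hidden size): hidden_states [B, L, H] -> first_token_tensor [B, H],
+        # i.e. the [CLS] vector, followed by a dense layer and tanh, matching the
+        # pooler of the original TF BERT.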
+ first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super(BertPredictionHeadTransform, self).__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.transform_act_fn = ACT2FN[config.hidden_act] \ + if isinstance(config.hidden_act, str) else config.hidden_act + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertLMPredictionHead, self).__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(bert_model_embedding_weights.size(1), + bert_model_embedding_weights.size(0), + bias=False) + self.decoder.weight = bert_model_embedding_weights + self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0))) + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertOnlyMLMHead, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + + def forward(self, sequence_output): + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super(BertOnlyNSPHead, self).__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config, bert_model_embedding_weights): + super(BertPreTrainingHeads, self).__init__() + self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class PreTrainedBertModel(nn.Module): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + + def __init__(self, config, *inputs, **kwargs): + super(PreTrainedBertModel, self).__init__() + if not isinstance(config, BertConfig): + raise ValueError( + "Parameter config in `{}(config)` should be an instance of class `BertConfig`. " + "To create a model from a Google pretrained model use " + "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( + self.__class__.__name__, self.__class__.__name__ + )) + self.config = config + + def init_bert_weights(self, module): + """ Initialize the weights. 
+ """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, BertLayerNorm): + module.bias.data.normal_(mean=0.0, std=self.config.initializer_range) + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + @classmethod + def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file or a pytorch state dict. + Download and cache the pre-trained model file if needed. + + Params: + pretrained_model_name: either: + - a str with the name of a pre-trained model to load selected in the list of: + . `bert-base-uncased` + . `bert-large-uncased` + . `bert-base-cased` + . `bert-base-multilingual` + . `bert-base-chinese` + - a path or url to a pretrained model archive containing: + . `bert_config.json` a configuration file for the model + . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance + cache_dir: an optional path to a folder in which the pre-trained models will be cached. + state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + *inputs, **kwargs: additional input for the specific Bert class + (ex: num_labels for BertForSequenceClassification) + """ + if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP: + archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name] + else: + archive_file = pretrained_model_name + # redirect to the cache, if necessary + try: + resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name, + ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), + archive_file)) + return None + if resolved_archive_file == archive_file: + logger.info("loading archive file {}".format(archive_file)) + else: + logger.info("loading archive file {} from cache at {}".format( + archive_file, resolved_archive_file)) + tempdir = None + if os.path.isdir(resolved_archive_file): + serialization_dir = resolved_archive_file + else: + # Extract archive to temp dir + tempdir = tempfile.mkdtemp() + logger.info("extracting archive file {} to temp dir {}".format( + resolved_archive_file, tempdir)) + with tarfile.open(resolved_archive_file, 'r:gz') as archive: + archive.extractall(tempdir) + serialization_dir = tempdir + # Load config + config_file = os.path.join(serialization_dir, CONFIG_NAME) + config = BertConfig.from_json_file(config_file) + logger.info("Model config {}".format(config)) + # Instantiate model. 
+ model = cls(config, *inputs, **kwargs) + if state_dict is None: + weights_path = os.path.join(serialization_dir, WEIGHTS_NAME) + state_dict = torch.load(weights_path) + + old_keys = [] + new_keys = [] + for key in state_dict.keys(): + new_key = None + if 'gamma' in key: + new_key = key.replace('gamma', 'weight') + if 'beta' in key: + new_key = key.replace('beta', 'bias') + if new_key: + old_keys.append(key) + new_keys.append(new_key) + for old_key, new_key in zip(old_keys, new_keys): + state_dict[new_key] = state_dict.pop(old_key) + + missing_keys = [] + unexpected_keys = [] + error_msgs = [] + # copy state_dict so _load_from_state_dict can modify it + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + if len(missing_keys) > 0: + logger.info("Weights of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Weights from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if tempdir: + # Clean up temp dir + shutil.rmtree(tempdir) + return model + + +class BertModel(PreTrainedBertModel): + def __init__(self, config): + super(BertModel, self).__init__(config) + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
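+        # [Editor's note] Tiny numeric sketch of the additive mask built below:
+        #
+        #   attention_mask = [1, 1, 0]  ->  (1.0 - mask) * -10000.0
+        #                               ->  [0.0, 0.0, -10000.0]
+        #
+        # Adding -10000 to a position's attention score drives its softmax
+        # weight to ~0, so padding positions are effectively never attended to.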
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class ALBertModel(PreTrainedBertModel): + def __init__(self, config): + super(ALBertModel, self).__init__(config) + self.embeddings = ALBertEmbeddings(config) + self.encoder = ALBertEncoder(config) + self.pooler = BertPooler(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + embedding_output = self.embeddings(input_ids, token_type_ids) + encoded_layers = self.encoder(embedding_output, + extended_attention_mask, + output_all_encoded_layers=output_all_encoded_layers) + sequence_output = encoded_layers[-1] + pooled_output = self.pooler(sequence_output) + if not output_all_encoded_layers: + encoded_layers = encoded_layers[-1] + return encoded_layers, pooled_output + + +class BertForPreTraining(PreTrainedBertModel): + """BERT model with pre-training heads. + This module comprises the BERT model followed by the two pre-training heads: + - the masked language modeling head, and + - the next sentence classification head. + + Params: + config: a BertConfig class instance with the configuration to build a new model. + + Inputs: + `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] + with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts + `extract_features.py`, `run_classifier.py` and `run_squad.py`) + `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token + types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to + a `sentence B` token (see BERT paper for more details). + `attention_mask`: an optional torch.LongTensor of shape [batch_size, sequence_length] with indices + selected in [0, 1]. 
It's a mask to be used if the input sequence length is smaller than the max
+            input sequence length in the current batch. It's the mask that we typically use for attention when
+            a batch has varying length sentences.
+        `masked_lm_labels`: masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
+            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+            is only computed for the labels set in [0, ..., vocab_size]
+        `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
+            with indices selected in [0, 1].
+            0 => next sentence is the continuation, 1 => next sentence is a random sentence.
+
+    Outputs:
+        if `masked_lm_labels` and `next_sentence_label` are not `None`:
+            Outputs the total_loss which is the sum of the masked language modeling loss and the next
+            sentence classification loss.
+        if `masked_lm_labels` or `next_sentence_label` is `None`:
+            Outputs a tuple comprising
+            - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
+            - the next sentence classification logits of shape [batch_size, 2].
+
+    Example usage:
+    ```python
+    # Already been converted into WordPiece token ids
+    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+    token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+                        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+    model = BertForPreTraining(config)
+    masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
+    ```
+    """
+
+    def __init__(self, config):
+        super(BertForPreTraining, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                next_sentence_label=None):
+        sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
+                                                   output_all_encoded_layers=False)
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        if masked_lm_labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+            return total_loss
+        else:
+            return prediction_scores, seq_relationship_score
+
+
+class ALBertForPreTraining(PreTrainedBertModel):
+    def __init__(self, config):
+        super(ALBertForPreTraining, self).__init__(config)
+        self.bert = ALBertModel(config)
+        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_bert_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                next_sentence_label=None):
+        sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask,
+                                                   output_all_encoded_layers=False)
+        return sequence_output, pooled_output
+        # If we are not pre-training, it is fine to skip the heads below.
+        # prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+        #
+        # if masked_lm_labels is not None and next_sentence_label is not None:
+        #     loss_fct =
CrossEntropyLoss(ignore_index=-1) + # masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + # next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + # total_loss = masked_lm_loss + next_sentence_loss + # return total_loss + # else: + # return prediction_scores, seq_relationship_score + + +class BertForMaskedLM(PreTrainedBertModel): + def __init__(self, config): + super(BertForMaskedLM, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + prediction_scores = self.cls(sequence_output) + + if masked_lm_labels is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + return masked_lm_loss + else: + return prediction_scores + + +class BertForNextSentencePrediction(PreTrainedBertModel): + def __init__(self, config): + super(BertForNextSentencePrediction, self).__init__(config) + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + seq_relationship_score = self.cls(pooled_output) + + if next_sentence_label is not None: + loss_fct = CrossEntropyLoss(ignore_index=-1) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + return next_sentence_loss + else: + return seq_relationship_score + + +class BertForSequenceClassification(PreTrainedBertModel): + + def __init__(self, config, num_labels=2): + super(BertForSequenceClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + _, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForMultipleChoice(PreTrainedBertModel): + + def __init__(self, config, num_choices=2): + super(BertForMultipleChoice, self).__init__(config) + self.num_choices = num_choices + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) + _, pooled_output = self.bert(flat_input_ids, flat_token_type_ids, flat_attention_mask, + 
output_all_encoded_layers=False) + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, self.num_choices) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + return loss + else: + return reshaped_logits + + +class BertForTokenClassification(PreTrainedBertModel): + def __init__(self, config, num_labels=2): + super(BertForTokenClassification, self).__init__(config) + self.num_labels = num_labels + self.bert = BertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_labels) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + return loss + else: + return logits + + +class BertForQuestionAnswering(PreTrainedBertModel): + def __init__(self, config): + super(BertForQuestionAnswering, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits + + +class BertForQA_CLS(PreTrainedBertModel): + def __init__(self, config): + super(BertForQA_CLS, self).__init__(config) + self.bert = BertModel(config) + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.cls_outputs = nn.Linear(config.hidden_size, 3) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, + start_positions=None, end_positions=None, target_labels=None): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + logits = self.qa_outputs(sequence_output) + 
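+        # [Editor's note] Shape sketch for the span head (B = batch, L = seq_len,
+        # H = hidden): sequence_output [B, L, H] -> qa_outputs -> logits [B, L, 2];
+        # split(1, dim=-1) yields two [B, L, 1] tensors, and squeeze(-1) gives the
+        # per-position start and end logits, each of shape [B, L].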
start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + target_logits = self.cls_outputs(pooled_output) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + + # classifier loss + loss_fct_cls = CrossEntropyLoss(ignore_index=-1) # no loss for has answer + cls_loss = loss_fct_cls(target_logits, target_labels) + + total_loss = ((start_loss + end_loss) / 2) + cls_loss + return total_loss + else: + return start_logits, end_logits, target_logits + + +class ALBertForQA(PreTrainedBertModel): + def __init__(self, config, dropout_rate): + super(ALBertForQA, self).__init__(config) + self.bert = ALBertModel(config) + self.ln_type = config.ln_type + if self.ln_type == 'ln_pre': + self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-5) + else: + self.LayerNorm = None + self.dropout = nn.Dropout(dropout_rate) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None, end_positions=None): + sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False) + if self.ln_type == 'ln_pre': + sequence_output = self.LayerNorm(sequence_output) + sequence_output = self.dropout(sequence_output) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + return total_loss + else: + return start_logits, end_logits + + +class ALBertForQA_CLS(PreTrainedBertModel): + def __init__(self, config, dropout_rate): + super(ALBertForQA_CLS, self).__init__(config) + self.bert = ALBertModel(config) + self.ln_type = config.ln_type + if self.ln_type == 'ln_pre': + self.LayerNorm_qa = BertLayerNorm(config.hidden_size, eps=1e-5) + self.LayerNorm_cls = BertLayerNorm(config.hidden_size, eps=1e-5) + else: + self.LayerNorm_qa = None + self.LayerNorm_cls = None + self.dropout_qa = nn.Dropout(dropout_rate) + self.dropout_cls = nn.Dropout(dropout_rate) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + self.cls_outputs = nn.Linear(config.hidden_size, 3) + 
self.apply(self.init_bert_weights) + + def forward(self, input_ids, token_type_ids=None, attention_mask=None, + start_positions=None, end_positions=None, target_labels=None): + sequence_output, pooled_output = self.bert(input_ids, token_type_ids, attention_mask, + output_all_encoded_layers=False) + if self.ln_type == 'ln_pre': + sequence_output = self.LayerNorm_qa(sequence_output) + pooled_output = self.LayerNorm_cls(pooled_output) + sequence_output = self.dropout_qa(sequence_output) + pooled_output = self.dropout_cls(pooled_output) + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + target_logits = self.cls_outputs(pooled_output) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + + # classifier loss + loss_fct_cls = CrossEntropyLoss(ignore_index=-1) # no loss for has answer + cls_loss = loss_fct_cls(target_logits, target_labels) + + total_loss = ((start_loss + end_loss) / 2) + cls_loss + return total_loss + else: + return start_logits, end_logits, target_logits diff --git a/baselines/models_pytorch/mrc_pytorch/run_mrc.py b/baselines/models_pytorch/mrc_pytorch/run_mrc.py new file mode 100644 index 0000000..5ee9a9c --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/run_mrc.py @@ -0,0 +1,287 @@ +import argparse +import collections +import json +import os +import random + +import numpy as np +import torch +from torch import nn +from torch.utils.data import TensorDataset, DataLoader +from tqdm import tqdm + +from .preprocess.cmrc2018_evaluate import get_eval +from .pytorch_modeling import BertConfig, BertForQuestionAnswering, ALBertConfig, ALBertForQA +from .tools import offical_tokenization as tokenization, utils +from .tools.pytorch_optimization import get_optimization, warmup_linear + + +def evaluate(model, args, eval_examples, eval_features, device, global_steps, best_f1, best_em, best_f1_em): + print("***** Eval *****") + RawResult = collections.namedtuple("RawResult", + ["unique_id", "start_logits", "end_logits"]) + output_prediction_file = os.path.join(args.checkpoint_dir, + "predictions_steps" + str(global_steps) + ".json") + output_nbest_file = output_prediction_file.replace('predictions', 'nbest') + + all_input_ids = torch.tensor([f['input_ids'] for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f['input_mask'] for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f['segment_ids'] for f in eval_features], dtype=torch.long) + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index) + eval_dataloader = DataLoader(eval_data, batch_size=args.n_batch, shuffle=False) + + model.eval() + all_results = [] + print("Start evaluating") + for input_ids, input_mask, segment_ids, example_indices in 
tqdm(eval_dataloader, desc="Evaluating"): + input_ids = input_ids.to(device) + input_mask = input_mask.to(device) + segment_ids = segment_ids.to(device) + with torch.no_grad(): + batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask) + + for i, example_index in enumerate(example_indices): + start_logits = batch_start_logits[i].detach().cpu().tolist() + end_logits = batch_end_logits[i].detach().cpu().tolist() + eval_feature = eval_features[example_index.item()] + unique_id = int(eval_feature['unique_id']) + all_results.append(RawResult(unique_id=unique_id, + start_logits=start_logits, + end_logits=end_logits)) + + write_predictions(eval_examples, eval_features, all_results, + n_best_size=args.n_best, max_answer_length=args.max_ans_length, + do_lower_case=True, output_prediction_file=output_prediction_file, + output_nbest_file=output_nbest_file) + + tmp_result = get_eval(args.dev_file, output_prediction_file) + tmp_result['STEP'] = global_steps + with open(args.log_file, 'a') as aw: + aw.write(json.dumps(tmp_result) + '\n') + print(tmp_result) + + if float(tmp_result['F1']) > best_f1: + best_f1 = float(tmp_result['F1']) + + if float(tmp_result['EM']) > best_em: + best_em = float(tmp_result['EM']) + + if float(tmp_result['F1']) + float(tmp_result['EM']) > best_f1_em: + best_f1_em = float(tmp_result['F1']) + float(tmp_result['EM']) + utils.torch_save_model(model, args.checkpoint_dir, + {'f1': float(tmp_result['F1']), 'em': float(tmp_result['EM'])}, max_save_num=1) + + model.train() + + return best_f1, best_em, best_f1_em + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--gpu_ids', type=str, default='0,1,2,3') + + # training parameter + parser.add_argument('--train_epochs', type=int, default=2) + parser.add_argument('--n_batch', type=int, default=32) + parser.add_argument('--lr', type=float, default=3e-5) + parser.add_argument('--dropout', type=float, default=0.1) + parser.add_argument('--clip_norm', type=float, default=1.0) + parser.add_argument('--warmup_rate', type=float, default=0.05) + parser.add_argument("--schedule", default='warmup_linear', type=str, help='schedule') + parser.add_argument("--weight_decay_rate", default=0.01, type=float, help='weight_decay_rate') + parser.add_argument('--seed', type=list, default=[123, 456, 789, 556, 977]) + parser.add_argument('--float16', action='store_true', default=False) # only sm >= 7.0 (tensorcores) + parser.add_argument('--max_ans_length', type=int, default=50) + parser.add_argument('--n_best', type=int, default=20) + parser.add_argument('--eval_epochs', type=float, default=0.5) + parser.add_argument('--save_best', type=bool, default=True) + parser.add_argument('--vocab_size', type=int, default=21128) + parser.add_argument('--max_seq_length', type=int, default=256) + + # data dir + parser.add_argument('--train_dir', type=str, required=True) + parser.add_argument('--dev_dir1', type=str, required=True) + parser.add_argument('--dev_dir2', type=str, required=True) + parser.add_argument('--train_file', type=str, required=True) + parser.add_argument('--dev_file', type=str, required=True) + parser.add_argument('--bert_config_file', type=str, required=True) + parser.add_argument('--vocab_file', type=str, required=True) + parser.add_argument('--init_restore_dir', type=str, required=True) + parser.add_argument('--checkpoint_dir', type=str, required=True) + parser.add_argument('--task_name', type=str, required=True) + parser.add_argument('--setting_file', type=str, default='setting.txt') + 
+    parser.add_argument('--log_file', type=str, default='log.txt')
+
+    # use some global vars for convenience
+    args = parser.parse_args()
+
+    if args.task_name.lower() == 'drcd':
+        from baselines.models_pytorch.mrc_pytorch.preprocess.DRCD_output import write_predictions
+        from baselines.models_pytorch.mrc_pytorch.preprocess.DRCD_preprocess import json2features
+    elif args.task_name.lower() == 'cmrc2018':
+        from baselines.models_pytorch.mrc_pytorch.preprocess.cmrc2018_output import write_predictions
+        from baselines.models_pytorch.mrc_pytorch.preprocess.cmrc2018_preprocess import json2features
+    else:
+        raise NotImplementedError
+
+    args.checkpoint_dir += ('/epoch{}_batch{}_lr{}_warmup{}_anslen{}/'
+                            .format(args.train_epochs, args.n_batch, args.lr, args.warmup_rate, args.max_ans_length))
+    args.train_dir = args.train_dir.replace('features.json', 'features_' + str(args.max_seq_length) + '.json')
+    args.dev_dir1 = args.dev_dir1.replace('examples.json', 'examples_' + str(args.max_seq_length) + '.json')
+    args.dev_dir2 = args.dev_dir2.replace('features.json', 'features_' + str(args.max_seq_length) + '.json')
+    args = utils.check_args(args)
+    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids
+    device = torch.device("cuda")
+    n_gpu = torch.cuda.device_count()
+    print("device %s n_gpu %d" % (device, n_gpu))
+    print("device: {} n_gpu: {} 16-bits training: {}".format(device, n_gpu, args.float16))
+
+    # load the bert setting
+    if 'albert' not in args.bert_config_file:
+        bert_config = BertConfig.from_json_file(args.bert_config_file)
+    else:
+        bert_config = ALBertConfig.from_json_file(args.bert_config_file)
+
+    # load data
+    print('loading data...')
+    tokenizer = tokenization.BertTokenizer(vocab_file=args.vocab_file, do_lower_case=True)
+    assert args.vocab_size == len(tokenizer.vocab)
+    if not os.path.exists(args.train_dir):
+        json2features(args.train_file, [args.train_dir.replace('_features_', '_examples_'), args.train_dir],
+                      tokenizer, is_training=True,
+                      max_seq_length=args.max_seq_length)
+
+    if not os.path.exists(args.dev_dir1) or not os.path.exists(args.dev_dir2):
+        json2features(args.dev_file, [args.dev_dir1, args.dev_dir2], tokenizer, is_training=False,
+                      max_seq_length=args.max_seq_length)
+
+    train_features = json.load(open(args.train_dir, 'r'))
+    dev_examples = json.load(open(args.dev_dir1, 'r'))
+    dev_features = json.load(open(args.dev_dir2, 'r'))
+    if os.path.exists(args.log_file):
+        os.remove(args.log_file)
+
+    steps_per_epoch = len(train_features) // args.n_batch
+    eval_steps = int(steps_per_epoch * args.eval_epochs)
+    dev_steps_per_epoch = len(dev_features) // args.n_batch
+    if len(train_features) % args.n_batch != 0:
+        steps_per_epoch += 1
+    if len(dev_features) % args.n_batch != 0:
+        dev_steps_per_epoch += 1
+    total_steps = steps_per_epoch * args.train_epochs
+
+    print('steps per epoch:', steps_per_epoch)
+    print('total steps:', total_steps)
+    print('warmup steps:', int(args.warmup_rate * total_steps))
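# --- editorial sketch, not part of the patch ---------------------------------
# The step bookkeeping above is plain ceiling division. Assuming the script
# defaults (n_batch=32, train_epochs=2, eval_epochs=0.5) and a made-up count
# of 10000 training features:
import math

n_features, n_batch, train_epochs, eval_epochs = 10000, 32, 2, 0.5
steps_per_epoch = math.ceil(n_features / n_batch)  # 313, same as the "+= 1" branches
total_steps = steps_per_epoch * train_epochs       # 626
eval_steps = int(steps_per_epoch * eval_epochs)    # 156 -> evaluate twice per epoch
print(steps_per_epoch, total_steps, eval_steps)
# ------------------------------------------------------------------------------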
+    F1s = []
+    EMs = []
+    # keep one globally best model across all seeds
+    best_f1_em = 0
+
+    for seed_ in args.seed:
+        best_f1, best_em = 0, 0
+        with open(args.log_file, 'a') as aw:
+            aw.write('===================================' +
+                     'SEED:' + str(seed_) +
+                     '===================================' + '\n')
+        print('SEED:', seed_)
+
+        random.seed(seed_)
+        np.random.seed(seed_)
+        torch.manual_seed(seed_)
+        if n_gpu > 0:
+            torch.cuda.manual_seed_all(seed_)
+
+        # init model
+        print('init model...')
+        if 'albert' not in args.init_restore_dir:
+            model = BertForQuestionAnswering(bert_config)
+        else:
+            model = ALBertForQA(bert_config, dropout_rate=args.dropout)
+        utils.torch_show_all_params(model)
+        utils.torch_init_model(model, args.init_restore_dir)
+        if args.float16:
+            model.half()
+        model.to(device)
+        if n_gpu > 1:
+            model = torch.nn.DataParallel(model)
+        optimizer = get_optimization(model=model,
+                                     float16=args.float16,
+                                     learning_rate=args.lr,
+                                     total_steps=total_steps,
+                                     schedule=args.schedule,
+                                     warmup_rate=args.warmup_rate,
+                                     max_grad_norm=args.clip_norm,
+                                     weight_decay_rate=args.weight_decay_rate)
+
+        all_input_ids = torch.tensor([f['input_ids'] for f in train_features], dtype=torch.long)
+        all_input_mask = torch.tensor([f['input_mask'] for f in train_features], dtype=torch.long)
+        all_segment_ids = torch.tensor([f['segment_ids'] for f in train_features], dtype=torch.long)
+
+        seq_len = all_input_ids.shape[1]
+        # sequence length must not exceed BERT's position-embedding limit
+        assert seq_len <= bert_config.max_position_embeddings
+
+        # true label
+        all_start_positions = torch.tensor([f['start_position'] for f in train_features], dtype=torch.long)
+        all_end_positions = torch.tensor([f['end_position'] for f in train_features], dtype=torch.long)
+
+        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                   all_start_positions, all_end_positions)
+        train_dataloader = DataLoader(train_data, batch_size=args.n_batch, shuffle=True)
+
+        print('***** Training *****')
+        model.train()
+        global_steps = 1
+        best_em = 0
+        best_f1 = 0
+        for i in range(int(args.train_epochs)):
+            print('Starting epoch %d' % (i + 1))
+            total_loss = 0
+            iteration = 1
+            with tqdm(total=steps_per_epoch, desc='Epoch %d' % (i + 1)) as pbar:
+                for step, batch in enumerate(train_dataloader):
+                    batch = tuple(t.to(device) for t in batch)
+                    input_ids, input_mask, segment_ids, start_positions, end_positions = batch
+                    loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
+                    if n_gpu > 1:
+                        loss = loss.mean()  # mean() to average on multi-gpu.
+                    total_loss += loss.item()
+                    pbar.set_postfix({'loss': '{0:1.5f}'.format(total_loss / (iteration + 1e-5))})
+                    pbar.update(1)
+
+                    if args.float16:
+                        optimizer.backward(loss)
+                        # modify learning rate with special warm up BERT uses
+                        # if args.fp16 is False, BertAdam is used and handles this automatically
+                        lr_this_step = args.lr * warmup_linear(global_steps / total_steps, args.warmup_rate)
+                        for param_group in optimizer.param_groups:
+                            param_group['lr'] = lr_this_step
+                    else:
+                        loss.backward()
+
+                    optimizer.step()
+                    model.zero_grad()
+                    global_steps += 1
+                    iteration += 1
+
+                    if global_steps % eval_steps == 0:
+                        best_f1, best_em, best_f1_em = evaluate(model, args, dev_examples, dev_features, device,
+                                                                global_steps, best_f1, best_em, best_f1_em)
+
+        F1s.append(best_f1)
+        EMs.append(best_em)
+
+        # release the memory
+        del model
+        del optimizer
+        torch.cuda.empty_cache()
+
+    print('Mean F1:', np.mean(F1s), 'Mean EM:', np.mean(EMs))
+    print('Best F1:', np.max(F1s), 'Best EM:', np.max(EMs))
+    with open(args.log_file, 'a') as aw:
+        aw.write('Mean(Best) F1:{}({})\n'.format(np.mean(F1s), np.max(F1s)))
+        aw.write('Mean(Best) EM:{}({})\n'.format(np.mean(EMs), np.max(EMs)))
diff --git a/baselines/models_pytorch/mrc_pytorch/run_mrc_cmrc2018.sh b/baselines/models_pytorch/mrc_pytorch/run_mrc_cmrc2018.sh
new file mode 100644
index 0000000..6d85321
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/run_mrc_cmrc2018.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export MODEL_NAME=roberta_wwm_ext_large
+export BERT_DIR=$CURRENT_DIR/prev_trained_model/$MODEL_NAME
+export GLUE_DIR=$CURRENT_DIR/../../../glue/chineseGLUEdatasets/
+TASK_NAME="CMRC2018"
+
+python run_mrc.py \
+  --gpu_ids="0,1,2,3" \
+  --train_epochs=2 \
+  --n_batch=32 \
+  --lr=3e-5 \
+  --warmup_rate=0.1 \
+  --float16 \
+  --max_seq_length=256 \
+  --task_name=$TASK_NAME \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_restore_dir=$BERT_DIR/pytorch_model.pth \
+  --train_dir=$GLUE_DIR/$TASK_NAME/train_features.json \
+  --train_file=$GLUE_DIR/$TASK_NAME/cmrc2018_train.json \
+  --dev_dir1=$GLUE_DIR/$TASK_NAME/dev_examples.json \
+  --dev_dir2=$GLUE_DIR/$TASK_NAME/dev_features.json \
+  --dev_file=$GLUE_DIR/$TASK_NAME/cmrc2018_dev.json \
+  --checkpoint_dir=$GLUE_DIR/$TASK_NAME/$MODEL_NAME/
+
diff --git a/baselines/models_pytorch/mrc_pytorch/run_mrc_drcd.sh b/baselines/models_pytorch/mrc_pytorch/run_mrc_drcd.sh
new file mode 100644
index 0000000..979c4b3
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/run_mrc_drcd.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+CURRENT_DIR=$(cd -P -- "$(dirname -- "$0")" && pwd -P)
+export MODEL_NAME=roberta_wwm_ext_large
+export BERT_DIR=$CURRENT_DIR/prev_trained_model/$MODEL_NAME
+export GLUE_DIR=$CURRENT_DIR/../../../glue/chineseGLUEdatasets/
+TASK_NAME="DRCD"
+
+python run_mrc.py \
+  --gpu_ids="0,1,2,3" \
+  --train_epochs=2 \
+  --n_batch=32 \
+  --lr=3e-5 \
+  --warmup_rate=0.1 \
+  --float16 \
+  --max_seq_length=256 \
+  --task_name=$TASK_NAME \
+  --vocab_file=$BERT_DIR/vocab.txt \
+  --bert_config_file=$BERT_DIR/bert_config.json \
+  --init_restore_dir=$BERT_DIR/pytorch_model.pth \
+  --train_dir=$GLUE_DIR/$TASK_NAME/train_features.json \
+  --train_file=$GLUE_DIR/$TASK_NAME/DRCD_training.json \
+  --dev_dir1=$GLUE_DIR/$TASK_NAME/dev_examples.json \
+  --dev_dir2=$GLUE_DIR/$TASK_NAME/dev_features.json \
+  --dev_file=$GLUE_DIR/$TASK_NAME/DRCD_dev.json \
+  --checkpoint_dir=$GLUE_DIR/$TASK_NAME/$MODEL_NAME/
+
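# --- editorial sketch, not part of the patch ---------------------------------
# In the float16 branch above, run_mrc.py steps the learning rate by hand
# because FP16_Optimizer bypasses BERTAdam's built-in schedule. The helper it
# calls is warmup_linear from tools/pytorch_optimization.py (reproduced here);
# lr and warmup_rate are the script defaults, total_steps is made up:
def warmup_linear(x, warmup=0.002):
    if x < warmup:
        return x / warmup              # linear ramp up during warmup
    return (1.0 - x) / (1.0 - warmup)  # then linear decay down to 0

lr, warmup_rate, total_steps = 3e-5, 0.05, 1000
for step in (1, 25, 50, 500, 1000):
    print(step, lr * warmup_linear(step / total_steps, warmup_rate))
# peaks at step 50 (5% warmup), decays to 0 by the final step
# ------------------------------------------------------------------------------
diff --git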
a/baselines/models_pytorch/mrc_pytorch/tools/file_utils.py b/baselines/models_pytorch/mrc_pytorch/tools/file_utils.py new file mode 100755 index 0000000..0a9041c --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/tools/file_utils.py @@ -0,0 +1,238 @@ +""" +Utilities for working with the local dataset cache. +This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp +Copyright by the AllenNLP authors. +""" + +import json +import logging +import os +import shutil +import tempfile +from functools import wraps +from hashlib import sha256 +from pathlib import Path +from typing import Optional, Tuple, Union, IO, Callable, Set +from urllib.parse import urlparse + +import boto3 +import requests +from botocore.exceptions import ClientError +from tqdm import tqdm + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', + Path.home() / '.pytorch_pretrained_bert')) + + +def url_to_filename(url: str, etag: str = None) -> str: + """ + Convert `url` into a hashed filename in a repeatable way. + If `etag` is specified, append its hash to the url's, delimited + by a period. + """ + url_bytes = url.encode('utf-8') + url_hash = sha256(url_bytes) + filename = url_hash.hexdigest() + + if etag: + etag_bytes = etag.encode('utf-8') + etag_hash = sha256(etag_bytes) + filename += '.' + etag_hash.hexdigest() + + return filename + + +def filename_to_url(filename: str, cache_dir: Union[str, Path] = None) -> Tuple[str, str]: + """ + Return the url and etag (which may be ``None``) stored for `filename`. + Raise ``FileNotFoundError`` if `filename` or its stored metadata do not exist. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + cache_path = os.path.join(cache_dir, filename) + if not os.path.exists(cache_path): + raise FileNotFoundError("file {} not found".format(cache_path)) + + meta_path = cache_path + '.json' + if not os.path.exists(meta_path): + raise FileNotFoundError("file {} not found".format(meta_path)) + + with open(meta_path) as meta_file: + metadata = json.load(meta_file) + url = metadata['url'] + etag = metadata['etag'] + + return url, etag + + +def cached_path(url_or_filename: Union[str, Path], cache_dir: Union[str, Path] = None) -> str: + """ + Given something that might be a URL (or might be a local path), + determine which. If it's a URL, download the file and cache it, and + return the path to the cached file. If it's already a local path, + make sure the file exists and then return the path. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(url_or_filename, Path): + url_or_filename = str(url_or_filename) + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + parsed = urlparse(url_or_filename) + + if parsed.scheme in ('http', 'https', 's3'): + # URL, so get it from the cache (downloading if necessary) + return get_from_cache(url_or_filename, cache_dir) + elif os.path.exists(url_or_filename): + # File, and it exists. + return url_or_filename + elif parsed.scheme == '': + # File, but it doesn't exist. 
+ raise FileNotFoundError("file {} not found".format(url_or_filename)) + else: + # Something unknown + raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) + + +def split_s3_path(url: str) -> Tuple[str, str]: + """Split a full s3 path into the bucket name and path.""" + parsed = urlparse(url) + if not parsed.netloc or not parsed.path: + raise ValueError("bad s3 path {}".format(url)) + bucket_name = parsed.netloc + s3_path = parsed.path + # Remove '/' at beginning of path. + if s3_path.startswith("/"): + s3_path = s3_path[1:] + return bucket_name, s3_path + + +def s3_request(func: Callable): + """ + Wrapper function for s3 requests in order to create more helpful error + messages. + """ + + @wraps(func) + def wrapper(url: str, *args, **kwargs): + try: + return func(url, *args, **kwargs) + except ClientError as exc: + if int(exc.response["Error"]["Code"]) == 404: + raise FileNotFoundError("file {} not found".format(url)) + else: + raise + + return wrapper + + +@s3_request +def s3_etag(url: str) -> Optional[str]: + """Check ETag on S3 object.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_object = s3_resource.Object(bucket_name, s3_path) + return s3_object.e_tag + + +@s3_request +def s3_get(url: str, temp_file: IO) -> None: + """Pull a file directly from S3.""" + s3_resource = boto3.resource("s3") + bucket_name, s3_path = split_s3_path(url) + s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) + + +def http_get(url: str, temp_file: IO) -> None: + req = requests.get(url, stream=True) + content_length = req.headers.get('Content-Length') + total = int(content_length) if content_length is not None else None + progress = tqdm(unit="B", total=total) + for chunk in req.iter_content(chunk_size=1024): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + temp_file.write(chunk) + progress.close() + + +def get_from_cache(url: str, cache_dir: Union[str, Path] = None) -> str: + """ + Given a URL, look for the corresponding dataset in the local cache. + If it's not there, download it. Then return the path to the cached file. + """ + if cache_dir is None: + cache_dir = PYTORCH_PRETRAINED_BERT_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + # Get eTag to add to filename, if it exists. + if url.startswith("s3://"): + etag = s3_etag(url) + else: + response = requests.head(url, allow_redirects=True) + if response.status_code != 200: + raise IOError("HEAD request failed for url {} with status code {}" + .format(url, response.status_code)) + etag = response.headers.get("ETag") + + filename = url_to_filename(url, etag) + + # get cache path to put the file + cache_path = os.path.join(cache_dir, filename) + + if not os.path.exists(cache_path): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. 
+        with tempfile.NamedTemporaryFile() as temp_file:
+            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+
+            # GET file object
+            if url.startswith("s3://"):
+                s3_get(url, temp_file)
+            else:
+                http_get(url, temp_file)
+
+            # we are copying the file before closing it, so flush to avoid truncation
+            temp_file.flush()
+            # shutil.copyfileobj() starts at the current position, so go to the start
+            temp_file.seek(0)
+
+            logger.info("copying %s to cache at %s", temp_file.name, cache_path)
+            with open(cache_path, 'wb') as cache_file:
+                shutil.copyfileobj(temp_file, cache_file)
+
+            logger.info("creating metadata file for %s", cache_path)
+            meta = {'url': url, 'etag': etag}
+            meta_path = cache_path + '.json'
+            with open(meta_path, 'w') as meta_file:
+                json.dump(meta, meta_file)
+
+            logger.info("removing temp file %s", temp_file.name)
+
+    return cache_path
+
+
+def read_set_from_file(filename: str) -> Set[str]:
+    '''
+    Extract a de-duped collection (set) of text from a file.
+    Expected file format is one item per line.
+    '''
+    collection = set()
+    with open(filename, 'r', encoding='utf-8') as file_:
+        for line in file_:
+            collection.add(line.rstrip())
+    return collection
+
+
+def get_file_extension(path: str, dot=True, lower: bool = True):
+    ext = os.path.splitext(path)[1]
+    ext = ext if dot else ext[1:]
+    return ext.lower() if lower else ext
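# --- editorial sketch, not part of the patch ---------------------------------
# How the cache helpers above fit together: cached_path() resolves a URL or a
# local path to a file on disk, naming cache entries sha256(url) plus an ETag
# hash. Assuming the repo root is importable as a package and with a purely
# illustrative URL:
from baselines.models_pytorch.mrc_pytorch.tools.file_utils import cached_path, url_to_filename

url = "https://example.com/vocab.txt"      # hypothetical URL
print(url_to_filename(url))                # sha256 hex digest of the url
print(url_to_filename(url, etag='"abc"'))  # same digest + "." + sha256 of the etag
# local_path = cached_path(url)  # would HEAD/GET the url and cache it under
#                                # ~/.pytorch_pretrained_bert/ (or $PYTORCH_PRETRAINED_BERT_CACHE)
# ------------------------------------------------------------------------------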
diff --git a/baselines/models_pytorch/mrc_pytorch/tools/langconv.py b/baselines/models_pytorch/mrc_pytorch/tools/langconv.py
new file mode 100644
index 0000000..efefa41
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/tools/langconv.py
@@ -0,0 +1,273 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from copy import deepcopy
+
+try:
+    import psyco
+    psyco.full()
+except:
+    pass
+
+from .zh_wiki import zh2Hant, zh2Hans
+
+import sys
+py3k = sys.version_info >= (3, 0, 0)
+
+if py3k:
+    UEMPTY = ''
+else:
+    _zh2Hant, _zh2Hans = {}, {}
+    for old, new in ((zh2Hant, _zh2Hant), (zh2Hans, _zh2Hans)):
+        for k, v in old.items():
+            new[k.decode('utf8')] = v.decode('utf8')
+    zh2Hant = _zh2Hant
+    zh2Hans = _zh2Hans
+    UEMPTY = ''.decode('utf8')
+
+# states
+(START, END, FAIL, WAIT_TAIL) = list(range(4))
+# conditions
+(TAIL, ERROR, MATCHED_SWITCH, UNMATCHED_SWITCH, CONNECTOR) = list(range(5))
+
+MAPS = {}
+
+class Node(object):
+    def __init__(self, from_word, to_word=None, is_tail=True,
+                 have_child=False):
+        self.from_word = from_word
+        if to_word is None:
+            self.to_word = from_word
+            self.data = (is_tail, have_child, from_word)
+            self.is_original = True
+        else:
+            self.to_word = to_word or from_word
+            self.data = (is_tail, have_child, to_word)
+            self.is_original = False
+        self.is_tail = is_tail
+        self.have_child = have_child
+
+    def is_original_long_word(self):
+        return self.is_original and len(self.from_word) > 1
+
+    def is_follow(self, chars):
+        return chars != self.from_word[:-1]
+
+    def __str__(self):
+        return '<Node, %s, %s, %s, %s>' % (repr(self.from_word),
+                                           repr(self.to_word), self.is_tail, self.have_child)
+
+    __repr__ = __str__
+
+class ConvertMap(object):
+    def __init__(self, name, mapping=None):
+        self.name = name
+        self._map = {}
+        if mapping:
+            self.set_convert_map(mapping)
+
+    def set_convert_map(self, mapping):
+        convert_map = {}
+        have_child = {}
+        max_key_length = 0
+        for key in sorted(mapping.keys()):
+            if len(key) > 1:
+                for i in range(1, len(key)):
+                    parent_key = key[:i]
+                    have_child[parent_key] = True
+            have_child[key] = False
+            max_key_length = max(max_key_length, len(key))
+        for key in sorted(have_child.keys()):
+            convert_map[key] = (key in mapping, have_child[key],
+                                mapping.get(key, UEMPTY))
+        self._map = convert_map
+        self.max_key_length = max_key_length
+
+    def __getitem__(self, k):
+        try:
+            is_tail, have_child, to_word = self._map[k]
+            return Node(k, to_word, is_tail, have_child)
+        except:
+            return Node(k)
+
+    def __contains__(self, k):
+        return k in self._map
+
+    def __len__(self):
+        return len(self._map)
+
+class StatesMachineException(Exception): pass
+
+class StatesMachine(object):
+    def __init__(self):
+        self.state = START
+        self.final = UEMPTY
+        self.len = 0
+        self.pool = UEMPTY
+
+    def clone(self, pool):
+        new = deepcopy(self)
+        new.state = WAIT_TAIL
+        new.pool = pool
+        return new
+
+    def feed(self, char, map):
+        node = map[self.pool + char]
+
+        if node.have_child:
+            if node.is_tail:
+                if node.is_original:
+                    cond = UNMATCHED_SWITCH
+                else:
+                    cond = MATCHED_SWITCH
+            else:
+                cond = CONNECTOR
+        else:
+            if node.is_tail:
+                cond = TAIL
+            else:
+                cond = ERROR
+
+        new = None
+        if cond == ERROR:
+            self.state = FAIL
+        elif cond == TAIL:
+            if self.state == WAIT_TAIL and node.is_original_long_word():
+                self.state = FAIL
+            else:
+                self.final += node.to_word
+                self.len += 1
+                self.pool = UEMPTY
+                self.state = END
+        elif self.state == START or self.state == WAIT_TAIL:
+            if cond == MATCHED_SWITCH:
+                new = self.clone(node.from_word)
+                self.final += node.to_word
+                self.len += 1
+                self.state = END
+                self.pool = UEMPTY
+            elif cond == UNMATCHED_SWITCH or cond == CONNECTOR:
+                if self.state == START:
+                    new = self.clone(node.from_word)
+                    self.final += node.to_word
+                    self.len += 1
+                    self.state = END
+                else:
+                    if node.is_follow(self.pool):
+                        self.state = FAIL
+                    else:
+                        self.pool = node.from_word
+        elif self.state == END:
+            # END is a new START
+            self.state = START
+            new = self.feed(char, map)
+        elif self.state == FAIL:
+            raise StatesMachineException('Translate States Machine '
+                                         'have error with input data %s' % node)
+        return new
+
+    def __len__(self):
+        return self.len + 1
+
+    def __str__(self):
+        return '<StatesMachine %s, pool: "%s", state: %s, final: %s>' % (
+            id(self), self.pool, self.state, self.final)
+    __repr__ = __str__
+
+class Converter(object):
+    def __init__(self, to_encoding):
+        self.to_encoding = to_encoding
+        self.map = MAPS[to_encoding]
+        self.start()
+
+    def feed(self, char):
+        branches = []
+        for fsm in self.machines:
+            new = fsm.feed(char, self.map)
+            if new:
+                branches.append(new)
+        if branches:
+            self.machines.extend(branches)
+        self.machines = [fsm for fsm in self.machines if fsm.state != FAIL]
+        all_ok = True
+        for fsm in self.machines:
+            if fsm.state != END:
+                all_ok = False
+        if all_ok:
+            self._clean()
+        return self.get_result()
+
+    def _clean(self):
+        if len(self.machines):
+            self.machines.sort(key=lambda x: len(x))
+            # self.machines.sort(cmp=lambda x,y: cmp(len(x), len(y)))
+            self.final += self.machines[0].final
+        self.machines = [StatesMachine()]
+
+    def start(self):
+        self.machines = [StatesMachine()]
+        self.final = UEMPTY
+
+    def end(self):
+        self.machines = [fsm for fsm in self.machines
+                         if fsm.state == FAIL or fsm.state == END]
+        self._clean()
+
+    def convert(self, string):
+        self.start()
+        for char in string:
+            self.feed(char)
+        self.end()
+        return self.get_result()
+
+    def get_result(self):
+        return self.final
+
+
+def registery(name, mapping):
+    global MAPS
+    MAPS[name] = ConvertMap(name, mapping)
+
+registery('zh-hant', zh2Hant)
+registery('zh-hans', zh2Hans)
+del zh2Hant, zh2Hans
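# --- editorial sketch, not part of the patch ---------------------------------
# Library-style use of the state machine above (the CLI entry point follows).
# 'zh-hant' converts Simplified to Traditional using the zh_wiki tables; the
# first pair below appears verbatim in zh_wiki.py:
print(Converter('zh-hant').convert(u'打印机'))  # -> 印表機
print(Converter('zh-hans').convert(u'印表機'))  # back to Simplified, assuming
                                                # zh2Hans carries the reverse pair
# ------------------------------------------------------------------------------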
+
+
+def run():
+    import sys
+    from optparse import OptionParser
+    parser = OptionParser()
+    parser.add_option('-e', type='string', dest='encoding',
+                      help='encoding')
+    parser.add_option('-f', type='string', dest='file_in',
+                      help='input file (- for stdin)')
+    parser.add_option('-t', type='string', dest='file_out',
+                      help='output file')
+    (options, args) = parser.parse_args()
+    if not options.encoding:
+        parser.error('encoding must be set')
+    if options.file_in:
+        if options.file_in == '-':
+            file_in = sys.stdin
+        else:
+            file_in = open(options.file_in)
+    else:
+        file_in = sys.stdin
+    if options.file_out:
+        if options.file_out == '-':
+            file_out = sys.stdout
+        else:
+            file_out = open(options.file_out, 'wb')
+    else:
+        file_out = sys.stdout
+
+    c = Converter(options.encoding)
+    for line in file_in:
+        # print >> file_out, c.convert(line.rstrip('\n').decode(
+        file_out.write(c.convert(line.rstrip('\n').decode(
+            'utf8')).encode('utf8'))
+
+
+if __name__ == '__main__':
+    run()
+
diff --git a/baselines/models_pytorch/mrc_pytorch/tools/offical_tokenization.py b/baselines/models_pytorch/mrc_pytorch/tools/offical_tokenization.py
new file mode 100755
index 0000000..ba96e42
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/tools/offical_tokenization.py
@@ -0,0 +1,343 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import logging
+import os
+import unicodedata
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+PRETRAINED_VOCAB_ARCHIVE_MAP = {
+    'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
+    'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
+    'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
+    'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
+    'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
+    'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
+    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
+}
+VOCAB_NAME = 'vocab.txt'
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    index = 0
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        while True:
+            token = reader.readline()
+            if not token:
+                break
+            token = token.strip()
+            vocab[token] = index
+            index += 1
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class BertTokenizer(object):
+    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
+
+    def
__init__(self, vocab_file, do_lower_case=True): + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + return split_tokens + + def convert_tokens_to_ids(self, tokens): + """Converts a sequence of tokens into ids using the vocab.""" + ids = [] + for token in tokens: + ids.append(self.vocab[token]) + return ids + + def convert_ids_to_tokens(self, ids): + """Converts a sequence of ids in wordpiece tokens using the vocab.""" + tokens = [] + for i in ids: + tokens.append(self.ids_to_tokens[i]) + return tokens + + @classmethod + def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs): + """ + Instantiate a PreTrainedBertModel from a pre-trained model file. + Download and cache the pre-trained model file if needed. + """ + if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP: + vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name] + else: + vocab_file = pretrained_model_name + if os.path.isdir(vocab_file): + vocab_file = os.path.join(vocab_file, VOCAB_NAME) + # redirect to the cache, if necessary + try: + resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir) + except FileNotFoundError: + logger.error( + "Model name '{}' was not found in model name list ({}). " + "We assumed '{}' was a path or url but couldn't find any file " + "associated to this path or url.".format( + pretrained_model_name, + ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()), + vocab_file)) + return None + if resolved_vocab_file == vocab_file: + logger.info("loading vocabulary file {}".format(vocab_file)) + else: + logger.info("loading vocabulary file {} from cache at {}".format( + vocab_file, resolved_vocab_file)) + # Instantiate tokenizer. + tokenizer = cls(resolved_vocab_file, *inputs, **kwargs) + return tokenizer + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True): + """Constructs a BasicTokenizer. + + Args: + do_lower_case: Whether to lower case the input. + """ + self.do_lower_case = do_lower_case + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = self._clean_text(text) + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). 
+ text = self._tokenize_chinese_chars(text) + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + + Returns: + A list of wordpiece tokens. 
+ """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically contorl characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False diff --git a/baselines/models_pytorch/mrc_pytorch/tools/pytorch_optimization.py b/baselines/models_pytorch/mrc_pytorch/tools/pytorch_optimization.py new file mode 100755 index 0000000..dd44ca4 --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/tools/pytorch_optimization.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""PyTorch optimization for BERT model.""" + +import math + +import torch +from torch.nn.utils import clip_grad_norm_ +from torch.optim.optimizer import Optimizer + + +def warmup_cosine(x, warmup=0.002): + if x < warmup: + return x / warmup + return 0.5 * (1.0 + torch.cos(math.pi * x)) + + +def warmup_constant(x, warmup=0.002): + if x < warmup: + return x / warmup + return 1.0 + + +def warmup_linear(x, warmup=0.002): + if x < warmup: + return x / warmup + return (1.0 - x) / (1.0 - warmup) + + +def warmup_fix(step, warmup_step): + return min(1.0, step / warmup_step) + + +SCHEDULES = { + 'warmup_cosine': warmup_cosine, + 'warmup_constant': warmup_constant, + 'warmup_linear': warmup_linear, + 'warmup_fix': warmup_fix +} + + +class BERTAdam(Optimizer): + """Implements BERT version of Adam algorithm with weight decay fix (and no ). + Params: + lr: learning rate + warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 + t_total: total number of training steps for the learning + rate schedule, -1 means constant learning rate. Default: -1 + schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + b1: Adams b1. Default: 0.9 + b2: Adams b2. Default: 0.999 + e: Adams epsilon. Default: 1e-6 + weight_decay_rate: Weight decay. Default: 0.01 + max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 + """ + + def __init__(self, params, lr, warmup=-1, t_total=-1, schedule='warmup_linear', + b1=0.9, b2=0.999, e=1e-6, weight_decay_rate=0.01, cycle_step=None, + max_grad_norm=1.0): + if lr is not None and not lr >= 0.0: + raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) + if schedule not in SCHEDULES: + raise ValueError("Invalid schedule parameter: {}".format(schedule)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + if not 0.0 <= b1 < 1.0: + raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) + if not 0.0 <= b2 < 1.0: + raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) + if not e >= 0.0: + raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) + defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + b1=b1, b2=b2, e=e, weight_decay_rate=weight_decay_rate, + max_grad_norm=max_grad_norm, cycle_step=cycle_step) + super(BERTAdam, self).__init__(params, defaults) + + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.data + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['next_m'] = torch.zeros_like(p.data) + # Exponential moving average of squared gradient values + state['next_v'] = torch.zeros_like(p.data) + + next_m, next_v = state['next_m'], state['next_v'] + beta1, beta2 = group['b1'], group['b2'] + + # Add grad clipping + if group['max_grad_norm'] > 0: + clip_grad_norm_(p, group['max_grad_norm']) + + # Decay the first and second moment running average coefficient + # In-place operations to update the averages at the same time + next_m.mul_(beta1).add_(1 - beta1, grad) + next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) + update = next_m / (next_v.sqrt() + group['e']) + + # Just adding the square of the weights to the loss function is *not* + # the correct way of using L2 regularization/weight decay with Adam, + # since that will interact with the m and v parameters in strange ways. + # + # Instead we want ot decay the weights in a manner that doesn't interact + # with the m/v parameters. This is equivalent to adding the square + # of the weights to the loss with plain (non-momentum) SGD. + if group['weight_decay_rate'] > 0.0: + update += group['weight_decay_rate'] * p.data + + schedule_fct = SCHEDULES[group['schedule']] + if group['cycle_step'] is not None and state['step'] > group['cycle_step']: + lr_scheduled = group['lr'] * (1 - ((state['step'] % group['cycle_step']) / group['cycle_step'])) + elif group['t_total'] != -1 and group['schedule'] != 'warmup_fix': + lr_scheduled = group['lr'] * schedule_fct(state['step'] / group['t_total'], group['warmup']) + elif group['schedule'] == 'warmup_fix': + lr_scheduled = group['lr'] * schedule_fct(state['step'], group['warmup'] * group['t_total']) + else: + lr_scheduled = group['lr'] + + update_with_lr = lr_scheduled * update + p.data.add_(-update_with_lr) + + state['step'] += 1 + + return loss + + +def get_optimization(model, float16, learning_rate, total_steps, schedule, + warmup_rate, weight_decay_rate, max_grad_norm, opt_pooler=False): + # Prepare optimizer + assert 0.0 <= warmup_rate <= 1.0 + param_optimizer = list(model.named_parameters()) + + # hack to remove pooler, which is not used + # thus it produce None grad that break apex + if opt_pooler is False: + param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_parameters = [ + {'params': [p for n, p in param_optimizer if not any([nd in n for nd in no_decay])], + 'weight_decay_rate': weight_decay_rate}, + {'params': [p for n, p in param_optimizer if any([nd in n for nd in no_decay])], + 'weight_decay_rate': 0.0} + ] + if float16: + try: + from apex.contrib.optimizers import FP16_Optimizer + from apex.contrib.optimizers import FusedAdam + except ImportError: + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + optimizer = FusedAdam(optimizer_parameters, + lr=learning_rate, + bias_correction=False, + max_grad_norm=max_grad_norm) + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = 
+        optimizer = BERTAdam(params=optimizer_parameters,
+                             lr=learning_rate,
+                             warmup=warmup_rate,
+                             max_grad_norm=max_grad_norm,
+                             t_total=total_steps,
+                             schedule=schedule,
+                             weight_decay_rate=weight_decay_rate)
+
+    return optimizer
diff --git a/baselines/models_pytorch/mrc_pytorch/tools/utils.py b/baselines/models_pytorch/mrc_pytorch/tools/utils.py
new file mode 100644
index 0000000..d55d757
--- /dev/null
+++ b/baselines/models_pytorch/mrc_pytorch/tools/utils.py
@@ -0,0 +1,146 @@
+import collections
+import os
+import re
+from glob import glob
+
+import tensorflow as tf
+import tensorflow.contrib.slim as slim
+import torch
+
+
+def check_args(args):
+    args.setting_file = os.path.join(args.checkpoint_dir, args.setting_file)
+    args.log_file = os.path.join(args.checkpoint_dir, args.log_file)
+    os.makedirs(args.checkpoint_dir, exist_ok=True)
+    with open(args.setting_file, 'wt') as opt_file:
+        opt_file.write('------------ Options -------------\n')
+        print('------------ Options -------------')
+        for k in args.__dict__:
+            v = args.__dict__[k]
+            opt_file.write('%s: %s\n' % (str(k), str(v)))
+            print('%s: %s' % (str(k), str(v)))
+        opt_file.write('-------------- End ----------------\n')
+        print('------------ End -------------')
+
+    return args
+
+
+def show_all_variables(rank=0):
+    model_vars = tf.trainable_variables()
+    slim.model_analyzer.analyze_vars(model_vars, print_info=True if rank == 0 else False)
+
+
+def torch_show_all_params(model, rank=0):
+    params = list(model.parameters())
+    k = 0
+    for i in params:
+        l = 1
+        for j in i.size():
+            l *= j
+        k = k + l
+    if rank == 0:
+        print("Total param num:" + str(k))
+
+
+# import ipdb
+def get_assignment_map_from_checkpoint(tvars, init_checkpoint):
+    """Compute the union of the current variables and checkpoint variables."""
+    initialized_variable_names = {}
+    new_variable_names = set()
+    unused_variable_names = set()
+
+    name_to_variable = collections.OrderedDict()
+    for var in tvars:
+        name = var.name
+        m = re.match("^(.*):\\d+$", name)
+        if m is not None:
+            name = m.group(1)
+        name_to_variable[name] = var
+
+    init_vars = tf.train.list_variables(init_checkpoint)
+
+    assignment_map = collections.OrderedDict()
+    for x in init_vars:
+        (name, var) = (x[0], x[1])
+        if name not in name_to_variable:
+            if 'adam' not in name:
+                unused_variable_names.add(name)
+            continue
+        # assignment_map[name] = name
+        assignment_map[name] = name_to_variable[name]
+        initialized_variable_names[name] = 1
+        initialized_variable_names[name + ":0"] = 1
+
+    for name in name_to_variable:
+        if name not in initialized_variable_names:
+            new_variable_names.add(name)
+    return assignment_map, initialized_variable_names, new_variable_names, unused_variable_names
+
+
+# loading weights
+def init_from_checkpoint(init_checkpoint, tvars=None, rank=0):
+    if not tvars:
+        tvars = tf.trainable_variables()
+    assignment_map, initialized_variable_names, new_variable_names, unused_variable_names \
+        = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+    if rank == 0:
+        # show the weights that were loaded successfully
+        for t in initialized_variable_names:
+            if ":0" not in t:
+                print("Loading weights success: " + t)
+
+        # show the newly created parameters
+        print('New parameters:', new_variable_names)
+
+        # show checkpoint parameters that were not used for initialization
+        print('Unused parameters', unused_variable_names)
+
+
+def torch_init_model(model, init_checkpoint):
+    state_dict = torch.load(init_checkpoint, map_location='cpu')
+    missing_keys = []
+    unexpected_keys = []
+    error_msgs = []
+    # copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + def load(module, prefix=''): + local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {}) + + module._load_from_state_dict( + state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(model, prefix='' if hasattr(model, 'bert') else 'bert.') + + print("missing keys:{}".format(missing_keys)) + print('unexpected keys:{}'.format(unexpected_keys)) + print('error msgs:{}'.format(error_msgs)) + + +def torch_save_model(model, output_dir, scores, max_save_num=1): + # Save model checkpoint + if not os.path.exists(output_dir): + os.makedirs(output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + saved_pths = glob(os.path.join(output_dir, '*.pth')) + saved_pths.sort() + while len(saved_pths) >= max_save_num: + if os.path.exists(saved_pths[0].replace('//', '/')): + os.remove(saved_pths[0].replace('//', '/')) + del saved_pths[0] + + save_prex = "checkpoint_score" + for k in scores: + save_prex += ('_' + k + '-' + str(scores[k])[:6]) + save_prex += '.pth' + + torch.save(model_to_save.state_dict(), + os.path.join(output_dir, save_prex)) + print("Saving model checkpoint to %s", output_dir) diff --git a/baselines/models_pytorch/mrc_pytorch/tools/zh_wiki.py b/baselines/models_pytorch/mrc_pytorch/tools/zh_wiki.py new file mode 100644 index 0000000..32acd1f --- /dev/null +++ b/baselines/models_pytorch/mrc_pytorch/tools/zh_wiki.py @@ -0,0 +1,8287 @@ +# -*- coding: utf-8 -*- +# copy fom wikipedia + +zh2Hant = { +'呆': '獃', +"打印机": "印表機", +'帮助文件': '說明檔案', +"画": "畫", +"龙": "竜", +"板": "板", +"表": "表", +"才": "才", +"丑": "醜", +"出": "出", +"淀": "澱", +"冬": "冬", +"范": "範", +"丰": "豐", +"刮": "刮", +"后": "後", +"胡": "胡", +"回": "回", +"伙": "夥", +"姜": "薑", +"借": "借", +"克": "克", +"困": "困", +"漓": "漓", +"里": "里", +"帘": "簾", +"霉": "霉", +"面": "面", +"蔑": "蔑", +"千": "千", +"秋": "秋", +"松": "松", +"咸": "咸", +"向": "向", +"余": "餘", +"郁": "鬱", +"御": "御", +"愿": "願", +"云": "雲", +"芸": "芸", +"沄": "沄", +"致": "致", +"制": "制", +"朱": "朱", +"筑": "築", +"准": "準", +"厂": "廠", +"广": "廣", +"辟": "闢", +"别": "別", +"卜": "卜", +"沈": "沈", +"冲": "沖", +"种": "種", +"虫": "蟲", +"担": "擔", +"党": "黨", +"斗": "鬥", +"儿": "兒", +"干": "乾", +"谷": "谷", +"柜": "櫃", +"合": "合", +"划": "劃", +"坏": "壞", +"几": "幾", +"系": "系", +"家": "家", +"价": "價", +"据": "據", +"卷": "捲", +"适": "適", +"蜡": "蠟", +"腊": "臘", +"了": "了", +"累": "累", +"么": "麽", +"蒙": "蒙", +"万": "萬", +"宁": "寧", +"朴": "樸", +"苹": "蘋", +"仆": "僕", +"曲": "曲", +"确": "確", +"舍": "舍", +"胜": "勝", +"术": "術", +"台": "台", +"体": "體", +"涂": "塗", +"叶": "葉", +"吁": "吁", +"旋": "旋", +"佣": "傭", +"与": "與", +"折": "折", +"征": "徵", +"症": "症", +"恶": "惡", +"发": "發", +"复": "復", +"汇": "匯", +"获": "獲", +"饥": "飢", +"尽": "盡", +"历": "歷", +"卤": "滷", +"弥": "彌", +"签": "簽", +"纤": "纖", +"苏": "蘇", +"坛": "壇", +"团": "團", +"须": "須", +"脏": "臟", +"只": "只", +"钟": "鐘", +"药": "藥", +"同": "同", +"志": "志", +"杯": "杯", +"岳": "岳", +"布": "布", +"当": "當", +"吊": "弔", +"仇": "仇", +"蕴": "蘊", +"线": "線", +"为": "為", +"产": "產", +"众": "眾", +"伪": "偽", +"凫": "鳧", +"厕": "廁", +"启": "啟", +"墙": "牆", +"壳": "殼", +"奖": "獎", +"妫": "媯", +"并": "並", +"录": "錄", +"悫": "愨", +"极": "極", +"沩": "溈", +"瘘": "瘺", +"硷": "鹼", +"竖": "豎", +"绝": "絕", +"绣": "繡", +"绦": "絛", +"绱": "緔", +"绷": "綳", +"绿": "綠", +"缰": "韁", +"苧": "苎", +"莼": "蒓", +"说": "說", +"谣": "謠", +"谫": 
"譾", +"赃": "贓", +"赍": "齎", +"赝": "贗", +"酝": "醞", +"采": "採", +"钩": "鉤", +"钵": "缽", +"锈": "銹", +"锐": "銳", +"锨": "杴", +"镌": "鐫", +"镢": "钁", +"阅": "閱", +"颓": "頹", +"颜": "顏", +"骂": "罵", +"鲇": "鯰", +"鲞": "鯗", +"鳄": "鱷", +"鸡": "雞", +"鹚": "鶿", +"荡": "盪", +"锤": "錘", +"㟆": "㠏", +"㛟": "𡞵", +"专": "專", +"业": "業", +"丛": "叢", +"东": "東", +"丝": "絲", +"丢": "丟", +"两": "兩", +"严": "嚴", +"丧": "喪", +"个": "個", +"临": "臨", +"丽": "麗", +"举": "舉", +"义": "義", +"乌": "烏", +"乐": "樂", +"乔": "喬", +"习": "習", +"乡": "鄉", +"书": "書", +"买": "買", +"乱": "亂", +"争": "爭", +"于": "於", +"亏": "虧", +"亚": "亞", +"亩": "畝", +"亲": "親", +"亵": "褻", +"亸": "嚲", +"亿": "億", +"仅": "僅", +"从": "從", +"仑": "侖", +"仓": "倉", +"仪": "儀", +"们": "們", +"优": "優", +"会": "會", +"伛": "傴", +"伞": "傘", +"伟": "偉", +"传": "傳", +"伣": "俔", +"伤": "傷", +"伥": "倀", +"伦": "倫", +"伧": "傖", +"伫": "佇", +"佥": "僉", +"侠": "俠", +"侣": "侶", +"侥": "僥", +"侦": "偵", +"侧": "側", +"侨": "僑", +"侩": "儈", +"侪": "儕", +"侬": "儂", +"俣": "俁", +"俦": "儔", +"俨": "儼", +"俩": "倆", +"俪": "儷", +"俫": "倈", +"俭": "儉", +"债": "債", +"倾": "傾", +"偬": "傯", +"偻": "僂", +"偾": "僨", +"偿": "償", +"傥": "儻", +"傧": "儐", +"储": "儲", +"傩": "儺", +"㑩": "儸", +"兑": "兌", +"兖": "兗", +"兰": "蘭", +"关": "關", +"兴": "興", +"兹": "茲", +"养": "養", +"兽": "獸", +"冁": "囅", +"内": "內", +"冈": "岡", +"册": "冊", +"写": "寫", +"军": "軍", +"农": "農", +"冯": "馮", +"决": "決", +"况": "況", +"冻": "凍", +"净": "凈", +"凉": "涼", +"减": "減", +"凑": "湊", +"凛": "凜", +"凤": "鳳", +"凭": "憑", +"凯": "凱", +"击": "擊", +"凿": "鑿", +"刍": "芻", +"刘": "劉", +"则": "則", +"刚": "剛", +"创": "創", +"删": "刪", +"刬": "剗", +"刭": "剄", +"刹": "剎", +"刽": "劊", +"刿": "劌", +"剀": "剴", +"剂": "劑", +"剐": "剮", +"剑": "劍", +"剥": "剝", +"剧": "劇", +"㓥": "劏", +"㔉": "劚", +"劝": "勸", +"办": "辦", +"务": "務", +"劢": "勱", +"动": "動", +"励": "勵", +"劲": "勁", +"劳": "勞", +"势": "勢", +"勋": "勛", +"勚": "勩", +"匀": "勻", +"匦": "匭", +"匮": "匱", +"区": "區", +"医": "醫", +"华": "華", +"协": "協", +"单": "單", +"卖": "賣", +"卢": "盧", +"卫": "衛", +"却": "卻", +"厅": "廳", +"厉": "厲", +"压": "壓", +"厌": "厭", +"厍": "厙", +"厐": "龎", +"厘": "釐", +"厢": "廂", +"厣": "厴", +"厦": "廈", +"厨": "廚", +"厩": "廄", +"厮": "廝", +"县": "縣", +"叁": "叄", +"参": "參", +"双": "雙", +"变": "變", +"叙": "敘", +"叠": "疊", +"号": "號", +"叹": "嘆", +"叽": "嘰", +"吓": "嚇", +"吕": "呂", +"吗": "嗎", +"吣": "唚", +"吨": "噸", +"听": "聽", +"吴": "吳", +"呐": "吶", +"呒": "嘸", +"呓": "囈", +"呕": "嘔", +"呖": "嚦", +"呗": "唄", +"员": "員", +"呙": "咼", +"呛": "嗆", +"呜": "嗚", +"咏": "詠", +"咙": "嚨", +"咛": "嚀", +"咝": "噝", +"咤": "吒", +"响": "響", +"哑": "啞", +"哒": "噠", +"哓": "嘵", +"哔": "嗶", +"哕": "噦", +"哗": "嘩", +"哙": "噲", +"哜": "嚌", +"哝": "噥", +"哟": "喲", +"唛": "嘜", +"唝": "嗊", +"唠": "嘮", +"唡": "啢", +"唢": "嗩", +"唤": "喚", +"啧": "嘖", +"啬": "嗇", +"啭": "囀", +"啮": "嚙", +"啴": "嘽", +"啸": "嘯", +"㖞": "喎", +"喷": "噴", +"喽": "嘍", +"喾": "嚳", +"嗫": "囁", +"嗳": "噯", +"嘘": "噓", +"嘤": "嚶", +"嘱": "囑", +"㖊": "噚", +"噜": "嚕", +"嚣": "囂", +"园": "園", +"囱": "囪", +"围": "圍", +"囵": "圇", +"国": "國", +"图": "圖", +"圆": "圓", +"圣": "聖", +"圹": "壙", +"场": "場", +"坂": "阪", +"块": "塊", +"坚": "堅", +"坜": "壢", +"坝": "壩", +"坞": "塢", +"坟": "墳", +"坠": "墜", +"垄": "壟", +"垅": "壠", +"垆": "壚", +"垒": "壘", +"垦": "墾", +"垩": "堊", +"垫": "墊", +"垭": "埡", +"垱": "壋", +"垲": "塏", +"垴": "堖", +"埘": "塒", +"埙": "塤", +"埚": "堝", +"埯": "垵", +"堑": "塹", +"堕": "墮", +"𡒄": "壈", +"壮": "壯", +"声": "聲", +"壶": "壺", +"壸": "壼", +"处": "處", +"备": "備", +"够": "夠", +"头": "頭", +"夸": "誇", +"夹": "夾", +"夺": "奪", +"奁": "奩", +"奂": "奐", +"奋": "奮", +"奥": "奧", +"奸": "姦", +"妆": "妝", +"妇": "婦", +"妈": "媽", +"妩": "嫵", +"妪": "嫗", +"姗": "姍", +"姹": "奼", +"娄": "婁", +"娅": "婭", +"娆": "嬈", +"娇": "嬌", +"娈": "孌", +"娱": "娛", +"娲": "媧", +"娴": "嫻", +"婳": "嫿", +"婴": "嬰", +"婵": 
"嬋", +"婶": "嬸", +"媪": "媼", +"嫒": "嬡", +"嫔": "嬪", +"嫱": "嬙", +"嬷": "嬤", +"孙": "孫", +"学": "學", +"孪": "孿", +"宝": "寶", +"实": "實", +"宠": "寵", +"审": "審", +"宪": "憲", +"宫": "宮", +"宽": "寬", +"宾": "賓", +"寝": "寢", +"对": "對", +"寻": "尋", +"导": "導", +"寿": "壽", +"将": "將", +"尔": "爾", +"尘": "塵", +"尝": "嘗", +"尧": "堯", +"尴": "尷", +"尸": "屍", +"层": "層", +"屃": "屓", +"屉": "屜", +"届": "屆", +"属": "屬", +"屡": "屢", +"屦": "屨", +"屿": "嶼", +"岁": "歲", +"岂": "豈", +"岖": "嶇", +"岗": "崗", +"岘": "峴", +"岙": "嶴", +"岚": "嵐", +"岛": "島", +"岭": "嶺", +"岽": "崬", +"岿": "巋", +"峄": "嶧", +"峡": "峽", +"峣": "嶢", +"峤": "嶠", +"峥": "崢", +"峦": "巒", +"崂": "嶗", +"崃": "崍", +"崄": "嶮", +"崭": "嶄", +"嵘": "嶸", +"嵚": "嶔", +"嵝": "嶁", +"巅": "巔", +"巩": "鞏", +"巯": "巰", +"币": "幣", +"帅": "帥", +"师": "師", +"帏": "幃", +"帐": "帳", +"帜": "幟", +"带": "帶", +"帧": "幀", +"帮": "幫", +"帱": "幬", +"帻": "幘", +"帼": "幗", +"幂": "冪", +"庄": "莊", +"庆": "慶", +"庐": "廬", +"庑": "廡", +"库": "庫", +"应": "應", +"庙": "廟", +"庞": "龐", +"废": "廢", +"廪": "廩", +"开": "開", +"异": "異", +"弃": "棄", +"弑": "弒", +"张": "張", +"弪": "弳", +"弯": "彎", +"弹": "彈", +"强": "強", +"归": "歸", +"彝": "彞", +"彦": "彥", +"彻": "徹", +"径": "徑", +"徕": "徠", +"忆": "憶", +"忏": "懺", +"忧": "憂", +"忾": "愾", +"怀": "懷", +"态": "態", +"怂": "慫", +"怃": "憮", +"怄": "慪", +"怅": "悵", +"怆": "愴", +"怜": "憐", +"总": "總", +"怼": "懟", +"怿": "懌", +"恋": "戀", +"恒": "恆", +"恳": "懇", +"恸": "慟", +"恹": "懨", +"恺": "愷", +"恻": "惻", +"恼": "惱", +"恽": "惲", +"悦": "悅", +"悬": "懸", +"悭": "慳", +"悮": "悞", +"悯": "憫", +"惊": "驚", +"惧": "懼", +"惨": "慘", +"惩": "懲", +"惫": "憊", +"惬": "愜", +"惭": "慚", +"惮": "憚", +"惯": "慣", +"愠": "慍", +"愤": "憤", +"愦": "憒", +"慑": "懾", +"懑": "懣", +"懒": "懶", +"懔": "懍", +"戆": "戇", +"戋": "戔", +"戏": "戲", +"戗": "戧", +"战": "戰", +"戬": "戩", +"戯": "戱", +"户": "戶", +"扑": "撲", +"执": "執", +"扩": "擴", +"扪": "捫", +"扫": "掃", +"扬": "揚", +"扰": "擾", +"抚": "撫", +"抛": "拋", +"抟": "摶", +"抠": "摳", +"抡": "掄", +"抢": "搶", +"护": "護", +"报": "報", +"拟": "擬", +"拢": "攏", +"拣": "揀", +"拥": "擁", +"拦": "攔", +"拧": "擰", +"拨": "撥", +"择": "擇", +"挂": "掛", +"挚": "摯", +"挛": "攣", +"挜": "掗", +"挝": "撾", +"挞": "撻", +"挟": "挾", +"挠": "撓", +"挡": "擋", +"挢": "撟", +"挣": "掙", +"挤": "擠", +"挥": "揮", +"挦": "撏", +"挽": "輓", +"捝": "挩", +"捞": "撈", +"损": "損", +"捡": "撿", +"换": "換", +"捣": "搗", +"掳": "擄", +"掴": "摑", +"掷": "擲", +"掸": "撣", +"掺": "摻", +"掼": "摜", +"揽": "攬", +"揾": "搵", +"揿": "撳", +"搀": "攙", +"搁": "擱", +"搂": "摟", +"搅": "攪", +"携": "攜", +"摄": "攝", +"摅": "攄", +"摆": "擺", +"摇": "搖", +"摈": "擯", +"摊": "攤", +"撄": "攖", +"撑": "撐", +"㧑": "撝", +"撵": "攆", +"撷": "擷", +"撸": "擼", +"撺": "攛", +"㧟": "擓", +"擞": "擻", +"攒": "攢", +"敌": "敵", +"敛": "斂", +"数": "數", +"斋": "齋", +"斓": "斕", +"斩": "斬", +"断": "斷", +"无": "無", +"旧": "舊", +"时": "時", +"旷": "曠", +"旸": "暘", +"昙": "曇", +"昼": "晝", +"昽": "曨", +"显": "顯", +"晋": "晉", +"晒": "曬", +"晓": "曉", +"晔": "曄", +"晕": "暈", +"晖": "暉", +"暂": "暫", +"暧": "曖", +"机": "機", +"杀": "殺", +"杂": "雜", +"权": "權", +"杆": "桿", +"条": "條", +"来": "來", +"杨": "楊", +"杩": "榪", +"杰": "傑", +"构": "構", +"枞": "樅", +"枢": "樞", +"枣": "棗", +"枥": "櫪", +"枧": "梘", +"枨": "棖", +"枪": "槍", +"枫": "楓", +"枭": "梟", +"柠": "檸", +"柽": "檉", +"栀": "梔", +"栅": "柵", +"标": "標", +"栈": "棧", +"栉": "櫛", +"栊": "櫳", +"栋": "棟", +"栌": "櫨", +"栎": "櫟", +"栏": "欄", +"树": "樹", +"栖": "棲", +"栗": "慄", +"样": "樣", +"栾": "欒", +"桠": "椏", +"桡": "橈", +"桢": "楨", +"档": "檔", +"桤": "榿", +"桥": "橋", +"桦": "樺", +"桧": "檜", +"桨": "槳", +"桩": "樁", +"梦": "夢", +"梼": "檮", +"梾": "棶", +"梿": "槤", +"检": "檢", +"棁": "梲", +"棂": "欞", +"椁": "槨", +"椟": "櫝", +"椠": "槧", +"椤": "欏", +"椭": "橢", +"楼": "樓", +"榄": "欖", +"榅": "榲", +"榇": "櫬", +"榈": "櫚", +"榉": "櫸", +"槚": "檟", +"槛": "檻", +"槟": "檳", +"槠": "櫧", +"横": 
"橫", +"樯": "檣", +"樱": "櫻", +"橥": "櫫", +"橱": "櫥", +"橹": "櫓", +"橼": "櫞", +"檩": "檁", +"欢": "歡", +"欤": "歟", +"欧": "歐", +"歼": "殲", +"殁": "歿", +"殇": "殤", +"残": "殘", +"殒": "殞", +"殓": "殮", +"殚": "殫", +"殡": "殯", +"㱮": "殨", +"㱩": "殰", +"殴": "毆", +"毁": "毀", +"毂": "轂", +"毕": "畢", +"毙": "斃", +"毡": "氈", +"毵": "毿", +"氇": "氌", +"气": "氣", +"氢": "氫", +"氩": "氬", +"氲": "氳", +"汉": "漢", +"汤": "湯", +"汹": "洶", +"沟": "溝", +"没": "沒", +"沣": "灃", +"沤": "漚", +"沥": "瀝", +"沦": "淪", +"沧": "滄", +"沪": "滬", +"泞": "濘", +"注": "註", +"泪": "淚", +"泶": "澩", +"泷": "瀧", +"泸": "瀘", +"泺": "濼", +"泻": "瀉", +"泼": "潑", +"泽": "澤", +"泾": "涇", +"洁": "潔", +"洒": "灑", +"洼": "窪", +"浃": "浹", +"浅": "淺", +"浆": "漿", +"浇": "澆", +"浈": "湞", +"浊": "濁", +"测": "測", +"浍": "澮", +"济": "濟", +"浏": "瀏", +"浐": "滻", +"浑": "渾", +"浒": "滸", +"浓": "濃", +"浔": "潯", +"涛": "濤", +"涝": "澇", +"涞": "淶", +"涟": "漣", +"涠": "潿", +"涡": "渦", +"涣": "渙", +"涤": "滌", +"润": "潤", +"涧": "澗", +"涨": "漲", +"涩": "澀", +"渊": "淵", +"渌": "淥", +"渍": "漬", +"渎": "瀆", +"渐": "漸", +"渑": "澠", +"渔": "漁", +"渖": "瀋", +"渗": "滲", +"温": "溫", +"湾": "灣", +"湿": "濕", +"溃": "潰", +"溅": "濺", +"溆": "漵", +"滗": "潷", +"滚": "滾", +"滞": "滯", +"滟": "灧", +"滠": "灄", +"满": "滿", +"滢": "瀅", +"滤": "濾", +"滥": "濫", +"滦": "灤", +"滨": "濱", +"滩": "灘", +"滪": "澦", +"漤": "灠", +"潆": "瀠", +"潇": "瀟", +"潋": "瀲", +"潍": "濰", +"潜": "潛", +"潴": "瀦", +"澜": "瀾", +"濑": "瀨", +"濒": "瀕", +"㲿": "瀇", +"灏": "灝", +"灭": "滅", +"灯": "燈", +"灵": "靈", +"灶": "竈", +"灾": "災", +"灿": "燦", +"炀": "煬", +"炉": "爐", +"炖": "燉", +"炜": "煒", +"炝": "熗", +"点": "點", +"炼": "煉", +"炽": "熾", +"烁": "爍", +"烂": "爛", +"烃": "烴", +"烛": "燭", +"烟": "煙", +"烦": "煩", +"烧": "燒", +"烨": "燁", +"烩": "燴", +"烫": "燙", +"烬": "燼", +"热": "熱", +"焕": "煥", +"焖": "燜", +"焘": "燾", +"㶽": "煱", +"煴": "熅", +"㶶": "燶", +"爱": "愛", +"爷": "爺", +"牍": "牘", +"牦": "氂", +"牵": "牽", +"牺": "犧", +"犊": "犢", +"状": "狀", +"犷": "獷", +"犸": "獁", +"犹": "猶", +"狈": "狽", +"狝": "獮", +"狞": "獰", +"独": "獨", +"狭": "狹", +"狮": "獅", +"狯": "獪", +"狰": "猙", +"狱": "獄", +"狲": "猻", +"猃": "獫", +"猎": "獵", +"猕": "獼", +"猡": "玀", +"猪": "豬", +"猫": "貓", +"猬": "蝟", +"献": "獻", +"獭": "獺", +"㺍": "獱", +"玑": "璣", +"玚": "瑒", +"玛": "瑪", +"玮": "瑋", +"环": "環", +"现": "現", +"玱": "瑲", +"玺": "璽", +"珐": "琺", +"珑": "瓏", +"珰": "璫", +"珲": "琿", +"琏": "璉", +"琐": "瑣", +"琼": "瓊", +"瑶": "瑤", +"瑷": "璦", +"璎": "瓔", +"瓒": "瓚", +"瓯": "甌", +"电": "電", +"画": "畫", +"畅": "暢", +"畴": "疇", +"疖": "癤", +"疗": "療", +"疟": "瘧", +"疠": "癘", +"疡": "瘍", +"疬": "癧", +"疭": "瘲", +"疮": "瘡", +"疯": "瘋", +"疱": "皰", +"疴": "痾", +"痈": "癰", +"痉": "痙", +"痒": "癢", +"痖": "瘂", +"痨": "癆", +"痪": "瘓", +"痫": "癇", +"瘅": "癉", +"瘆": "瘮", +"瘗": "瘞", +"瘪": "癟", +"瘫": "癱", +"瘾": "癮", +"瘿": "癭", +"癞": "癩", +"癣": "癬", +"癫": "癲", +"皑": "皚", +"皱": "皺", +"皲": "皸", +"盏": "盞", +"盐": "鹽", +"监": "監", +"盖": "蓋", +"盗": "盜", +"盘": "盤", +"眍": "瞘", +"眦": "眥", +"眬": "矓", +"着": "著", +"睁": "睜", +"睐": "睞", +"睑": "瞼", +"瞆": "瞶", +"瞒": "瞞", +"䁖": "瞜", +"瞩": "矚", +"矫": "矯", +"矶": "磯", +"矾": "礬", +"矿": "礦", +"砀": "碭", +"码": "碼", +"砖": "磚", +"砗": "硨", +"砚": "硯", +"砜": "碸", +"砺": "礪", +"砻": "礱", +"砾": "礫", +"础": "礎", +"硁": "硜", +"硕": "碩", +"硖": "硤", +"硗": "磽", +"硙": "磑", +"碍": "礙", +"碛": "磧", +"碜": "磣", +"碱": "鹼", +"礼": "禮", +"祃": "禡", +"祎": "禕", +"祢": "禰", +"祯": "禎", +"祷": "禱", +"祸": "禍", +"禀": "稟", +"禄": "祿", +"禅": "禪", +"离": "離", +"秃": "禿", +"秆": "稈", +"积": "積", +"称": "稱", +"秽": "穢", +"秾": "穠", +"稆": "穭", +"税": "稅", +"䅉": "稏", +"稣": "穌", +"稳": "穩", +"穑": "穡", +"穷": "窮", +"窃": "竊", +"窍": "竅", +"窎": "窵", +"窑": "窯", +"窜": "竄", +"窝": "窩", +"窥": "窺", +"窦": "竇", +"窭": "窶", +"竞": "競", +"笃": "篤", +"笋": "筍", +"笔": "筆", +"笕": "筧", +"笺": "箋", +"笼": "籠", +"笾": 
"籩", +"筚": "篳", +"筛": "篩", +"筜": "簹", +"筝": "箏", +"䇲": "筴", +"筹": "籌", +"筼": "篔", +"简": "簡", +"箓": "籙", +"箦": "簀", +"箧": "篋", +"箨": "籜", +"箩": "籮", +"箪": "簞", +"箫": "簫", +"篑": "簣", +"篓": "簍", +"篮": "籃", +"篱": "籬", +"簖": "籪", +"籁": "籟", +"籴": "糴", +"类": "類", +"籼": "秈", +"粜": "糶", +"粝": "糲", +"粤": "粵", +"粪": "糞", +"粮": "糧", +"糁": "糝", +"糇": "餱", +"紧": "緊", +"䌷": "紬", +"䌹": "絅", +"絷": "縶", +"䌼": "綐", +"䌽": "綵", +"䌸": "縳", +"䍁": "繸", +"䍀": "繿", +"纟": "糹", +"纠": "糾", +"纡": "紆", +"红": "紅", +"纣": "紂", +"纥": "紇", +"约": "約", +"级": "級", +"纨": "紈", +"纩": "纊", +"纪": "紀", +"纫": "紉", +"纬": "緯", +"纭": "紜", +"纮": "紘", +"纯": "純", +"纰": "紕", +"纱": "紗", +"纲": "綱", +"纳": "納", +"纴": "紝", +"纵": "縱", +"纶": "綸", +"纷": "紛", +"纸": "紙", +"纹": "紋", +"纺": "紡", +"纻": "紵", +"纼": "紖", +"纽": "紐", +"纾": "紓", +"绀": "紺", +"绁": "紲", +"绂": "紱", +"练": "練", +"组": "組", +"绅": "紳", +"细": "細", +"织": "織", +"终": "終", +"绉": "縐", +"绊": "絆", +"绋": "紼", +"绌": "絀", +"绍": "紹", +"绎": "繹", +"经": "經", +"绐": "紿", +"绑": "綁", +"绒": "絨", +"结": "結", +"绔": "絝", +"绕": "繞", +"绖": "絰", +"绗": "絎", +"绘": "繪", +"给": "給", +"绚": "絢", +"绛": "絳", +"络": "絡", +"绞": "絞", +"统": "統", +"绠": "綆", +"绡": "綃", +"绢": "絹", +"绤": "綌", +"绥": "綏", +"继": "繼", +"绨": "綈", +"绩": "績", +"绪": "緒", +"绫": "綾", +"绬": "緓", +"续": "續", +"绮": "綺", +"绯": "緋", +"绰": "綽", +"绲": "緄", +"绳": "繩", +"维": "維", +"绵": "綿", +"绶": "綬", +"绸": "綢", +"绹": "綯", +"绺": "綹", +"绻": "綣", +"综": "綜", +"绽": "綻", +"绾": "綰", +"缀": "綴", +"缁": "緇", +"缂": "緙", +"缃": "緗", +"缄": "緘", +"缅": "緬", +"缆": "纜", +"缇": "緹", +"缈": "緲", +"缉": "緝", +"缊": "縕", +"缋": "繢", +"缌": "緦", +"缍": "綞", +"缎": "緞", +"缏": "緶", +"缑": "緱", +"缒": "縋", +"缓": "緩", +"缔": "締", +"缕": "縷", +"编": "編", +"缗": "緡", +"缘": "緣", +"缙": "縉", +"缚": "縛", +"缛": "縟", +"缜": "縝", +"缝": "縫", +"缞": "縗", +"缟": "縞", +"缠": "纏", +"缡": "縭", +"缢": "縊", +"缣": "縑", +"缤": "繽", +"缥": "縹", +"缦": "縵", +"缧": "縲", +"缨": "纓", +"缩": "縮", +"缪": "繆", +"缫": "繅", +"缬": "纈", +"缭": "繚", +"缮": "繕", +"缯": "繒", +"缱": "繾", +"缲": "繰", +"缳": "繯", +"缴": "繳", +"缵": "纘", +"罂": "罌", +"网": "網", +"罗": "羅", +"罚": "罰", +"罢": "罷", +"罴": "羆", +"羁": "羈", +"羟": "羥", +"翘": "翹", +"耢": "耮", +"耧": "耬", +"耸": "聳", +"耻": "恥", +"聂": "聶", +"聋": "聾", +"职": "職", +"聍": "聹", +"联": "聯", +"聩": "聵", +"聪": "聰", +"肃": "肅", +"肠": "腸", +"肤": "膚", +"肮": "骯", +"肴": "餚", +"肾": "腎", +"肿": "腫", +"胀": "脹", +"胁": "脅", +"胆": "膽", +"胧": "朧", +"胨": "腖", +"胪": "臚", +"胫": "脛", +"胶": "膠", +"脉": "脈", +"脍": "膾", +"脐": "臍", +"脑": "腦", +"脓": "膿", +"脔": "臠", +"脚": "腳", +"脱": "脫", +"脶": "腡", +"脸": "臉", +"腭": "齶", +"腻": "膩", +"腼": "靦", +"腽": "膃", +"腾": "騰", +"膑": "臏", +"臜": "臢", +"舆": "輿", +"舣": "艤", +"舰": "艦", +"舱": "艙", +"舻": "艫", +"艰": "艱", +"艳": "艷", +"艺": "藝", +"节": "節", +"芈": "羋", +"芗": "薌", +"芜": "蕪", +"芦": "蘆", +"苁": "蓯", +"苇": "葦", +"苈": "藶", +"苋": "莧", +"苌": "萇", +"苍": "蒼", +"苎": "苧", +"茎": "莖", +"茏": "蘢", +"茑": "蔦", +"茔": "塋", +"茕": "煢", +"茧": "繭", +"荆": "荊", +"荐": "薦", +"荙": "薘", +"荚": "莢", +"荛": "蕘", +"荜": "蓽", +"荞": "蕎", +"荟": "薈", +"荠": "薺", +"荣": "榮", +"荤": "葷", +"荥": "滎", +"荦": "犖", +"荧": "熒", +"荨": "蕁", +"荩": "藎", +"荪": "蓀", +"荫": "蔭", +"荬": "蕒", +"荭": "葒", +"荮": "葤", +"莅": "蒞", +"莱": "萊", +"莲": "蓮", +"莳": "蒔", +"莴": "萵", +"莶": "薟", +"莸": "蕕", +"莹": "瑩", +"莺": "鶯", +"萝": "蘿", +"萤": "螢", +"营": "營", +"萦": "縈", +"萧": "蕭", +"萨": "薩", +"葱": "蔥", +"蒇": "蕆", +"蒉": "蕢", +"蒋": "蔣", +"蒌": "蔞", +"蓝": "藍", +"蓟": "薊", +"蓠": "蘺", +"蓣": "蕷", +"蓥": "鎣", +"蓦": "驀", +"蔂": "虆", +"蔷": "薔", +"蔹": "蘞", +"蔺": "藺", +"蔼": "藹", +"蕰": "薀", +"蕲": "蘄", +"薮": "藪", +"䓕": "薳", +"藓": "蘚", +"蘖": "櫱", +"虏": "虜", +"虑": "慮", +"虚": "虛", +"虬": "虯", +"虮": "蟣", +"虽": 
"雖", +"虾": "蝦", +"虿": "蠆", +"蚀": "蝕", +"蚁": "蟻", +"蚂": "螞", +"蚕": "蠶", +"蚬": "蜆", +"蛊": "蠱", +"蛎": "蠣", +"蛏": "蟶", +"蛮": "蠻", +"蛰": "蟄", +"蛱": "蛺", +"蛲": "蟯", +"蛳": "螄", +"蛴": "蠐", +"蜕": "蛻", +"蜗": "蝸", +"蝇": "蠅", +"蝈": "蟈", +"蝉": "蟬", +"蝼": "螻", +"蝾": "蠑", +"螀": "螿", +"螨": "蟎", +"䗖": "螮", +"蟏": "蠨", +"衅": "釁", +"衔": "銜", +"补": "補", +"衬": "襯", +"衮": "袞", +"袄": "襖", +"袅": "裊", +"袆": "褘", +"袜": "襪", +"袭": "襲", +"袯": "襏", +"装": "裝", +"裆": "襠", +"裈": "褌", +"裢": "褳", +"裣": "襝", +"裤": "褲", +"裥": "襇", +"褛": "褸", +"褴": "襤", +"䙓": "襬", +"见": "見", +"观": "觀", +"觃": "覎", +"规": "規", +"觅": "覓", +"视": "視", +"觇": "覘", +"览": "覽", +"觉": "覺", +"觊": "覬", +"觋": "覡", +"觌": "覿", +"觍": "覥", +"觎": "覦", +"觏": "覯", +"觐": "覲", +"觑": "覷", +"觞": "觴", +"触": "觸", +"觯": "觶", +"訚": "誾", +"䜣": "訢", +"誉": "譽", +"誊": "謄", +"䜧": "譅", +"讠": "訁", +"计": "計", +"订": "訂", +"讣": "訃", +"认": "認", +"讥": "譏", +"讦": "訐", +"讧": "訌", +"讨": "討", +"让": "讓", +"讪": "訕", +"讫": "訖", +"讬": "託", +"训": "訓", +"议": "議", +"讯": "訊", +"记": "記", +"讱": "訒", +"讲": "講", +"讳": "諱", +"讴": "謳", +"讵": "詎", +"讶": "訝", +"讷": "訥", +"许": "許", +"讹": "訛", +"论": "論", +"讻": "訩", +"讼": "訟", +"讽": "諷", +"设": "設", +"访": "訪", +"诀": "訣", +"证": "證", +"诂": "詁", +"诃": "訶", +"评": "評", +"诅": "詛", +"识": "識", +"诇": "詗", +"诈": "詐", +"诉": "訴", +"诊": "診", +"诋": "詆", +"诌": "謅", +"词": "詞", +"诎": "詘", +"诏": "詔", +"诐": "詖", +"译": "譯", +"诒": "詒", +"诓": "誆", +"诔": "誄", +"试": "試", +"诖": "詿", +"诗": "詩", +"诘": "詰", +"诙": "詼", +"诚": "誠", +"诛": "誅", +"诜": "詵", +"话": "話", +"诞": "誕", +"诟": "詬", +"诠": "詮", +"诡": "詭", +"询": "詢", +"诣": "詣", +"诤": "諍", +"该": "該", +"详": "詳", +"诧": "詫", +"诨": "諢", +"诩": "詡", +"诪": "譸", +"诫": "誡", +"诬": "誣", +"语": "語", +"诮": "誚", +"误": "誤", +"诰": "誥", +"诱": "誘", +"诲": "誨", +"诳": "誑", +"诵": "誦", +"诶": "誒", +"请": "請", +"诸": "諸", +"诹": "諏", +"诺": "諾", +"读": "讀", +"诼": "諑", +"诽": "誹", +"课": "課", +"诿": "諉", +"谀": "諛", +"谁": "誰", +"谂": "諗", +"调": "調", +"谄": "諂", +"谅": "諒", +"谆": "諄", +"谇": "誶", +"谈": "談", +"谊": "誼", +"谋": "謀", +"谌": "諶", +"谍": "諜", +"谎": "謊", +"谏": "諫", +"谐": "諧", +"谑": "謔", +"谒": "謁", +"谓": "謂", +"谔": "諤", +"谕": "諭", +"谖": "諼", +"谗": "讒", +"谘": "諮", +"谙": "諳", +"谚": "諺", +"谛": "諦", +"谜": "謎", +"谝": "諞", +"谞": "諝", +"谟": "謨", +"谠": "讜", +"谡": "謖", +"谢": "謝", +"谤": "謗", +"谥": "謚", +"谦": "謙", +"谧": "謐", +"谨": "謹", +"谩": "謾", +"谪": "謫", +"谬": "謬", +"谭": "譚", +"谮": "譖", +"谯": "譙", +"谰": "讕", +"谱": "譜", +"谲": "譎", +"谳": "讞", +"谴": "譴", +"谵": "譫", +"谶": "讖", +"豮": "豶", +"䝙": "貙", +"䞐": "賰", +"贝": "貝", +"贞": "貞", +"负": "負", +"贠": "貟", +"贡": "貢", +"财": "財", +"责": "責", +"贤": "賢", +"败": "敗", +"账": "賬", +"货": "貨", +"质": "質", +"贩": "販", +"贪": "貪", +"贫": "貧", +"贬": "貶", +"购": "購", +"贮": "貯", +"贯": "貫", +"贰": "貳", +"贱": "賤", +"贲": "賁", +"贳": "貰", +"贴": "貼", +"贵": "貴", +"贶": "貺", +"贷": "貸", +"贸": "貿", +"费": "費", +"贺": "賀", +"贻": "貽", +"贼": "賊", +"贽": "贄", +"贾": "賈", +"贿": "賄", +"赀": "貲", +"赁": "賃", +"赂": "賂", +"资": "資", +"赅": "賅", +"赆": "贐", +"赇": "賕", +"赈": "賑", +"赉": "賚", +"赊": "賒", +"赋": "賦", +"赌": "賭", +"赎": "贖", +"赏": "賞", +"赐": "賜", +"赑": "贔", +"赒": "賙", +"赓": "賡", +"赔": "賠", +"赕": "賧", +"赖": "賴", +"赗": "賵", +"赘": "贅", +"赙": "賻", +"赚": "賺", +"赛": "賽", +"赜": "賾", +"赞": "贊", +"赟": "贇", +"赠": "贈", +"赡": "贍", +"赢": "贏", +"赣": "贛", +"赪": "赬", +"赵": "趙", +"赶": "趕", +"趋": "趨", +"趱": "趲", +"趸": "躉", +"跃": "躍", +"跄": "蹌", +"跞": "躒", +"践": "踐", +"跶": "躂", +"跷": "蹺", +"跸": "蹕", +"跹": "躚", +"跻": "躋", +"踊": "踴", +"踌": "躊", +"踪": "蹤", +"踬": "躓", +"踯": "躑", +"蹑": "躡", +"蹒": "蹣", +"蹰": "躕", +"蹿": "躥", +"躏": "躪", +"躜": "躦", +"躯": "軀", +"车": "車", +"轧": "軋", +"轨": "軌", +"轩": "軒", +"轪": 
"軑", +"轫": "軔", +"转": "轉", +"轭": "軛", +"轮": "輪", +"软": "軟", +"轰": "轟", +"轱": "軲", +"轲": "軻", +"轳": "轤", +"轴": "軸", +"轵": "軹", +"轶": "軼", +"轷": "軤", +"轸": "軫", +"轹": "轢", +"轺": "軺", +"轻": "輕", +"轼": "軾", +"载": "載", +"轾": "輊", +"轿": "轎", +"辀": "輈", +"辁": "輇", +"辂": "輅", +"较": "較", +"辄": "輒", +"辅": "輔", +"辆": "輛", +"辇": "輦", +"辈": "輩", +"辉": "輝", +"辊": "輥", +"辋": "輞", +"辌": "輬", +"辍": "輟", +"辎": "輜", +"辏": "輳", +"辐": "輻", +"辑": "輯", +"辒": "轀", +"输": "輸", +"辔": "轡", +"辕": "轅", +"辖": "轄", +"辗": "輾", +"辘": "轆", +"辙": "轍", +"辚": "轔", +"辞": "辭", +"辩": "辯", +"辫": "辮", +"边": "邊", +"辽": "遼", +"达": "達", +"迁": "遷", +"过": "過", +"迈": "邁", +"运": "運", +"还": "還", +"这": "這", +"进": "進", +"远": "遠", +"违": "違", +"连": "連", +"迟": "遲", +"迩": "邇", +"迳": "逕", +"迹": "跡", +"选": "選", +"逊": "遜", +"递": "遞", +"逦": "邐", +"逻": "邏", +"遗": "遺", +"遥": "遙", +"邓": "鄧", +"邝": "鄺", +"邬": "鄔", +"邮": "郵", +"邹": "鄒", +"邺": "鄴", +"邻": "鄰", +"郏": "郟", +"郐": "鄶", +"郑": "鄭", +"郓": "鄆", +"郦": "酈", +"郧": "鄖", +"郸": "鄲", +"酂": "酇", +"酦": "醱", +"酱": "醬", +"酽": "釅", +"酾": "釃", +"酿": "釀", +"释": "釋", +"鉴": "鑒", +"銮": "鑾", +"錾": "鏨", +"𨱏": "鎝", +"钅": "釒", +"钆": "釓", +"钇": "釔", +"针": "針", +"钉": "釘", +"钊": "釗", +"钋": "釙", +"钌": "釕", +"钍": "釷", +"钎": "釺", +"钏": "釧", +"钐": "釤", +"钑": "鈒", +"钒": "釩", +"钓": "釣", +"钔": "鍆", +"钕": "釹", +"钖": "鍚", +"钗": "釵", +"钘": "鈃", +"钙": "鈣", +"钚": "鈈", +"钛": "鈦", +"钜": "鉅", +"钝": "鈍", +"钞": "鈔", +"钠": "鈉", +"钡": "鋇", +"钢": "鋼", +"钣": "鈑", +"钤": "鈐", +"钥": "鑰", +"钦": "欽", +"钧": "鈞", +"钨": "鎢", +"钪": "鈧", +"钫": "鈁", +"钬": "鈥", +"钭": "鈄", +"钮": "鈕", +"钯": "鈀", +"钰": "鈺", +"钱": "錢", +"钲": "鉦", +"钳": "鉗", +"钴": "鈷", +"钶": "鈳", +"钷": "鉕", +"钸": "鈽", +"钹": "鈸", +"钺": "鉞", +"钻": "鑽", +"钼": "鉬", +"钽": "鉭", +"钾": "鉀", +"钿": "鈿", +"铀": "鈾", +"铁": "鐵", +"铂": "鉑", +"铃": "鈴", +"铄": "鑠", +"铅": "鉛", +"铆": "鉚", +"铇": "鉋", +"铈": "鈰", +"铉": "鉉", +"铊": "鉈", +"铋": "鉍", +"铌": "鈮", +"铍": "鈹", +"铎": "鐸", +"铏": "鉶", +"铐": "銬", +"铑": "銠", +"铒": "鉺", +"铓": "鋩", +"铔": "錏", +"铕": "銪", +"铖": "鋮", +"铗": "鋏", +"铘": "鋣", +"铙": "鐃", +"铚": "銍", +"铛": "鐺", +"铜": "銅", +"铝": "鋁", +"铞": "銱", +"铟": "銦", +"铠": "鎧", +"铡": "鍘", +"铢": "銖", +"铣": "銑", +"铤": "鋌", +"铥": "銩", +"铦": "銛", +"铧": "鏵", +"铨": "銓", +"铩": "鎩", +"铪": "鉿", +"铫": "銚", +"铬": "鉻", +"铭": "銘", +"铮": "錚", +"铯": "銫", +"铰": "鉸", +"铱": "銥", +"铲": "鏟", +"铳": "銃", +"铴": "鐋", +"铵": "銨", +"银": "銀", +"铷": "銣", +"铸": "鑄", +"铹": "鐒", +"铺": "鋪", +"铻": "鋙", +"铼": "錸", +"铽": "鋱", +"链": "鏈", +"铿": "鏗", +"销": "銷", +"锁": "鎖", +"锂": "鋰", +"锃": "鋥", +"锄": "鋤", +"锅": "鍋", +"锆": "鋯", +"锇": "鋨", +"锉": "銼", +"锊": "鋝", +"锋": "鋒", +"锌": "鋅", +"锍": "鋶", +"锎": "鐦", +"锏": "鐧", +"锑": "銻", +"锒": "鋃", +"锓": "鋟", +"锔": "鋦", +"锕": "錒", +"锖": "錆", +"锗": "鍺", +"锘": "鍩", +"错": "錯", +"锚": "錨", +"锛": "錛", +"锜": "錡", +"锝": "鍀", +"锞": "錁", +"锟": "錕", +"锠": "錩", +"锡": "錫", +"锢": "錮", +"锣": "鑼", +"锥": "錐", +"锦": "錦", +"锧": "鑕", +"锩": "錈", +"锪": "鍃", +"锫": "錇", +"锬": "錟", +"锭": "錠", +"键": "鍵", +"锯": "鋸", +"锰": "錳", +"锱": "錙", +"锲": "鍥", +"锳": "鍈", +"锴": "鍇", +"锵": "鏘", +"锶": "鍶", +"锷": "鍔", +"锸": "鍤", +"锹": "鍬", +"锺": "鍾", +"锻": "鍛", +"锼": "鎪", +"锽": "鍠", +"锾": "鍰", +"锿": "鎄", +"镀": "鍍", +"镁": "鎂", +"镂": "鏤", +"镃": "鎡", +"镄": "鐨", +"镅": "鎇", +"镆": "鏌", +"镇": "鎮", +"镈": "鎛", +"镉": "鎘", +"镊": "鑷", +"镋": "鎲", +"镍": "鎳", +"镎": "鎿", +"镏": "鎦", +"镐": "鎬", +"镑": "鎊", +"镒": "鎰", +"镓": "鎵", +"镔": "鑌", +"镕": "鎔", +"镖": "鏢", +"镗": "鏜", +"镘": "鏝", +"镙": "鏍", +"镚": "鏰", +"镛": "鏞", +"镜": "鏡", +"镝": "鏑", +"镞": "鏃", +"镟": "鏇", +"镠": "鏐", +"镡": "鐔", +"镣": "鐐", +"镤": "鏷", +"镥": "鑥", +"镦": "鐓", +"镧": "鑭", +"镨": "鐠", +"镩": "鑹", +"镪": "鏹", +"镫": "鐙", +"镬": 
"鑊", +"镭": "鐳", +"镮": "鐶", +"镯": "鐲", +"镰": "鐮", +"镱": "鐿", +"镲": "鑔", +"镳": "鑣", +"镴": "鑞", +"镵": "鑱", +"镶": "鑲", +"长": "長", +"门": "門", +"闩": "閂", +"闪": "閃", +"闫": "閆", +"闬": "閈", +"闭": "閉", +"问": "問", +"闯": "闖", +"闰": "閏", +"闱": "闈", +"闲": "閑", +"闳": "閎", +"间": "間", +"闵": "閔", +"闶": "閌", +"闷": "悶", +"闸": "閘", +"闹": "鬧", +"闺": "閨", +"闻": "聞", +"闼": "闥", +"闽": "閩", +"闾": "閭", +"闿": "闓", +"阀": "閥", +"阁": "閣", +"阂": "閡", +"阃": "閫", +"阄": "鬮", +"阆": "閬", +"阇": "闍", +"阈": "閾", +"阉": "閹", +"阊": "閶", +"阋": "鬩", +"阌": "閿", +"阍": "閽", +"阎": "閻", +"阏": "閼", +"阐": "闡", +"阑": "闌", +"阒": "闃", +"阓": "闠", +"阔": "闊", +"阕": "闋", +"阖": "闔", +"阗": "闐", +"阘": "闒", +"阙": "闕", +"阚": "闞", +"阛": "闤", +"队": "隊", +"阳": "陽", +"阴": "陰", +"阵": "陣", +"阶": "階", +"际": "際", +"陆": "陸", +"陇": "隴", +"陈": "陳", +"陉": "陘", +"陕": "陝", +"陧": "隉", +"陨": "隕", +"险": "險", +"随": "隨", +"隐": "隱", +"隶": "隸", +"隽": "雋", +"难": "難", +"雏": "雛", +"雠": "讎", +"雳": "靂", +"雾": "霧", +"霁": "霽", +"霡": "霢", +"霭": "靄", +"靓": "靚", +"静": "靜", +"靥": "靨", +"䩄": "靦", +"鞑": "韃", +"鞒": "鞽", +"鞯": "韉", +"韦": "韋", +"韧": "韌", +"韨": "韍", +"韩": "韓", +"韪": "韙", +"韫": "韞", +"韬": "韜", +"韵": "韻", +"页": "頁", +"顶": "頂", +"顷": "頃", +"顸": "頇", +"项": "項", +"顺": "順", +"顼": "頊", +"顽": "頑", +"顾": "顧", +"顿": "頓", +"颀": "頎", +"颁": "頒", +"颂": "頌", +"颃": "頏", +"预": "預", +"颅": "顱", +"领": "領", +"颇": "頗", +"颈": "頸", +"颉": "頡", +"颊": "頰", +"颋": "頲", +"颌": "頜", +"颍": "潁", +"颎": "熲", +"颏": "頦", +"颐": "頤", +"频": "頻", +"颒": "頮", +"颔": "頷", +"颕": "頴", +"颖": "穎", +"颗": "顆", +"题": "題", +"颙": "顒", +"颚": "顎", +"颛": "顓", +"额": "額", +"颞": "顳", +"颟": "顢", +"颠": "顛", +"颡": "顙", +"颢": "顥", +"颤": "顫", +"颥": "顬", +"颦": "顰", +"颧": "顴", +"风": "風", +"飏": "颺", +"飐": "颭", +"飑": "颮", +"飒": "颯", +"飓": "颶", +"飔": "颸", +"飕": "颼", +"飖": "颻", +"飗": "飀", +"飘": "飄", +"飙": "飆", +"飚": "飈", +"飞": "飛", +"飨": "饗", +"餍": "饜", +"饣": "飠", +"饤": "飣", +"饦": "飥", +"饧": "餳", +"饨": "飩", +"饩": "餼", +"饪": "飪", +"饫": "飫", +"饬": "飭", +"饭": "飯", +"饮": "飲", +"饯": "餞", +"饰": "飾", +"饱": "飽", +"饲": "飼", +"饳": "飿", +"饴": "飴", +"饵": "餌", +"饶": "饒", +"饷": "餉", +"饸": "餄", +"饹": "餎", +"饺": "餃", +"饻": "餏", +"饼": "餅", +"饽": "餑", +"饾": "餖", +"饿": "餓", +"馀": "餘", +"馁": "餒", +"馂": "餕", +"馃": "餜", +"馄": "餛", +"馅": "餡", +"馆": "館", +"馇": "餷", +"馈": "饋", +"馉": "餶", +"馊": "餿", +"馋": "饞", +"馌": "饁", +"馍": "饃", +"馎": "餺", +"馏": "餾", +"馐": "饈", +"馑": "饉", +"馒": "饅", +"馓": "饊", +"馔": "饌", +"馕": "饢", +"䯄": "騧", +"马": "馬", +"驭": "馭", +"驮": "馱", +"驯": "馴", +"驰": "馳", +"驱": "驅", +"驲": "馹", +"驳": "駁", +"驴": "驢", +"驵": "駔", +"驶": "駛", +"驷": "駟", +"驸": "駙", +"驹": "駒", +"驺": "騶", +"驻": "駐", +"驼": "駝", +"驽": "駑", +"驾": "駕", +"驿": "驛", +"骀": "駘", +"骁": "驍", +"骃": "駰", +"骄": "驕", +"骅": "驊", +"骆": "駱", +"骇": "駭", +"骈": "駢", +"骉": "驫", +"骊": "驪", +"骋": "騁", +"验": "驗", +"骍": "騂", +"骎": "駸", +"骏": "駿", +"骐": "騏", +"骑": "騎", +"骒": "騍", +"骓": "騅", +"骔": "騌", +"骕": "驌", +"骖": "驂", +"骗": "騙", +"骘": "騭", +"骙": "騤", +"骚": "騷", +"骛": "騖", +"骜": "驁", +"骝": "騮", +"骞": "騫", +"骟": "騸", +"骠": "驃", +"骡": "騾", +"骢": "驄", +"骣": "驏", +"骤": "驟", +"骥": "驥", +"骦": "驦", +"骧": "驤", +"髅": "髏", +"髋": "髖", +"髌": "髕", +"鬓": "鬢", +"魇": "魘", +"魉": "魎", +"鱼": "魚", +"鱽": "魛", +"鱾": "魢", +"鱿": "魷", +"鲀": "魨", +"鲁": "魯", +"鲂": "魴", +"鲃": "䰾", +"鲄": "魺", +"鲅": "鮁", +"鲆": "鮃", +"鲈": "鱸", +"鲉": "鮋", +"鲊": "鮓", +"鲋": "鮒", +"鲌": "鮊", +"鲍": "鮑", +"鲎": "鱟", +"鲏": "鮍", +"鲐": "鮐", +"鲑": "鮭", +"鲒": "鮚", +"鲓": "鮳", +"鲔": "鮪", +"鲕": "鮞", +"鲖": "鮦", +"鲗": "鰂", +"鲘": "鮜", +"鲙": "鱠", +"鲚": "鱭", +"鲛": "鮫", +"鲜": "鮮", +"鲝": "鮺", +"鲟": "鱘", +"鲠": "鯁", +"鲡": "鱺", +"鲢": "鰱", +"鲣": "鰹", +"鲤": "鯉", +"鲥": "鰣", +"鲦": 
"鰷", +"鲧": "鯀", +"鲨": "鯊", +"鲩": "鯇", +"鲪": "鮶", +"鲫": "鯽", +"鲬": "鯒", +"鲭": "鯖", +"鲮": "鯪", +"鲯": "鯕", +"鲰": "鯫", +"鲱": "鯡", +"鲲": "鯤", +"鲳": "鯧", +"鲴": "鯝", +"鲵": "鯢", +"鲶": "鯰", +"鲷": "鯛", +"鲸": "鯨", +"鲹": "鰺", +"鲺": "鯴", +"鲻": "鯔", +"鲼": "鱝", +"鲽": "鰈", +"鲾": "鰏", +"鲿": "鱨", +"鳀": "鯷", +"鳁": "鰮", +"鳂": "鰃", +"鳃": "鰓", +"鳅": "鰍", +"鳆": "鰒", +"鳇": "鰉", +"鳈": "鰁", +"鳉": "鱂", +"鳊": "鯿", +"鳋": "鰠", +"鳌": "鰲", +"鳍": "鰭", +"鳎": "鰨", +"鳏": "鰥", +"鳐": "鰩", +"鳑": "鰟", +"鳒": "鰜", +"鳓": "鰳", +"鳔": "鰾", +"鳕": "鱈", +"鳖": "鱉", +"鳗": "鰻", +"鳘": "鰵", +"鳙": "鱅", +"鳚": "䲁", +"鳛": "鰼", +"鳜": "鱖", +"鳝": "鱔", +"鳞": "鱗", +"鳟": "鱒", +"鳠": "鱯", +"鳡": "鱤", +"鳢": "鱧", +"鳣": "鱣", +"䴓": "鳾", +"䴕": "鴷", +"䴔": "鵁", +"䴖": "鶄", +"䴗": "鶪", +"䴘": "鷈", +"䴙": "鷿", +"㶉": "鸂", +"鸟": "鳥", +"鸠": "鳩", +"鸢": "鳶", +"鸣": "鳴", +"鸤": "鳲", +"鸥": "鷗", +"鸦": "鴉", +"鸧": "鶬", +"鸨": "鴇", +"鸩": "鴆", +"鸪": "鴣", +"鸫": "鶇", +"鸬": "鸕", +"鸭": "鴨", +"鸮": "鴞", +"鸯": "鴦", +"鸰": "鴒", +"鸱": "鴟", +"鸲": "鴝", +"鸳": "鴛", +"鸴": "鷽", +"鸵": "鴕", +"鸶": "鷥", +"鸷": "鷙", +"鸸": "鴯", +"鸹": "鴰", +"鸺": "鵂", +"鸻": "鴴", +"鸼": "鵃", +"鸽": "鴿", +"鸾": "鸞", +"鸿": "鴻", +"鹀": "鵐", +"鹁": "鵓", +"鹂": "鸝", +"鹃": "鵑", +"鹄": "鵠", +"鹅": "鵝", +"鹆": "鵒", +"鹇": "鷳", +"鹈": "鵜", +"鹉": "鵡", +"鹊": "鵲", +"鹋": "鶓", +"鹌": "鵪", +"鹍": "鵾", +"鹎": "鵯", +"鹏": "鵬", +"鹐": "鵮", +"鹑": "鶉", +"鹒": "鶊", +"鹓": "鵷", +"鹔": "鷫", +"鹕": "鶘", +"鹖": "鶡", +"鹗": "鶚", +"鹘": "鶻", +"鹙": "鶖", +"鹛": "鶥", +"鹜": "鶩", +"鹝": "鷊", +"鹞": "鷂", +"鹟": "鶲", +"鹠": "鶹", +"鹡": "鶺", +"鹢": "鷁", +"鹣": "鶼", +"鹤": "鶴", +"鹥": "鷖", +"鹦": "鸚", +"鹧": "鷓", +"鹨": "鷚", +"鹩": "鷯", +"鹪": "鷦", +"鹫": "鷲", +"鹬": "鷸", +"鹭": "鷺", +"鹯": "鸇", +"鹰": "鷹", +"鹱": "鸌", +"鹲": "鸏", +"鹳": "鸛", +"鹴": "鸘", +"鹾": "鹺", +"麦": "麥", +"麸": "麩", +"黄": "黃", +"黉": "黌", +"黡": "黶", +"黩": "黷", +"黪": "黲", +"黾": "黽", +"鼋": "黿", +"鼍": "鼉", +"鼗": "鞀", +"鼹": "鼴", +"齐": "齊", +"齑": "齏", +"齿": "齒", +"龀": "齔", +"龁": "齕", +"龂": "齗", +"龃": "齟", +"龄": "齡", +"龅": "齙", +"龆": "齠", +"龇": "齜", +"龈": "齦", +"龉": "齬", +"龊": "齪", +"龋": "齲", +"龌": "齷", +"龙": "龍", +"龚": "龔", +"龛": "龕", +"龟": "龜", +"一伙": "一伙", +"一并": "一併", +"一准": "一准", +"一划": "一划", +"一地里": "一地裡", +"一干": "一干", +"一树百获": "一樹百穫", +"一台": "一臺", +"一冲": "一衝", +"一只": "一隻", +"一发千钧": "一髮千鈞", +"一出": "一齣", +"七只": "七隻", +"三元里": "三元裡", +"三国志": "三國誌", +"三复": "三複", +"三只": "三隻", +"上吊": "上吊", +"上台": "上臺", +"下不了台": "下不了臺", +"下台": "下臺", +"下面": "下麵", +"不准": "不准", +"不吊": "不吊", +"不知就里": "不知就裡", +"不知所云": "不知所云", +"不锈钢": "不鏽鋼", +"丑剧": "丑劇", +"丑旦": "丑旦", +"丑角": "丑角", +"并存着": "並存著", +"中岳": "中嶽", +"中台医专": "中臺醫專", +"丰南": "丰南", +"丰台": "丰台", +"丰姿": "丰姿", +"丰采": "丰采", +"丰韵": "丰韻", +"主干": "主幹", +"么么唱唱": "么么唱唱", +"么儿": "么兒", +"么喝": "么喝", +"么妹": "么妹", +"么弟": "么弟", +"么爷": "么爺", +"九世之雠": "九世之讎", +"九只": "九隻", +"干丝": "乾絲", +"干着急": "乾著急", +"乱发": "亂髮", +"云云": "云云", +"云尔": "云爾", +"五岳": "五嶽", +"五斗柜": "五斗櫃", +"五斗橱": "五斗櫥", +"五谷": "五穀", +"五行生克": "五行生剋", +"五只": "五隻", +"五出": "五齣", +"交卷": "交卷", +"人云亦云": "人云亦云", +"人物志": "人物誌", +"什锦面": "什錦麵", +"什么": "什麼", +"仆倒": "仆倒", +"介系词": "介係詞", +"介系词": "介繫詞", +"仿制": "仿製", +"伙伕": "伙伕", +"伙伴": "伙伴", +"伙同": "伙同", +"伙夫": "伙夫", +"伙房": "伙房", +"伙计": "伙計", +"伙食": "伙食", +"布下": "佈下", +"布告": "佈告", +"布哨": "佈哨", +"布局": "佈局", +"布岗": "佈崗", +"布施": "佈施", +"布景": "佈景", +"布满": "佈滿", +"布线": "佈線", +"布置": "佈置", +"布署": "佈署", +"布道": "佈道", +"布达": "佈達", +"布防": "佈防", +"布阵": "佈陣", +"布雷": "佈雷", +"体育锻鍊": "体育鍛鍊", +"何干": "何干", +"作准": "作准", +"佣人": "佣人", +"佣工": "佣工", +"佣金": "佣金", +"并入": "併入", +"并列": "併列", +"并到": "併到", +"并合": "併合", +"并吞": "併吞", +"并在": "併在", +"并成": "併成", +"并排": "併排", +"并拢": "併攏", +"并案": "併案", +"并为": "併為", +"并发": "併發", +"并科": "併科", +"并购": "併購", +"并进": "併進", 
+"来复": "來複", +"供制": "供製", +"依依不舍": "依依不捨", +"侵并": "侵併", +"便辟": "便辟", +"系数": "係數", +"系为": "係為", +"保险柜": "保險柜", +"信号台": "信號臺", +"修复": "修複", +"修胡刀": "修鬍刀", +"俯冲": "俯衝", +"个里": "個裡", +"借着": "借著", +"假发": "假髮", +"停制": "停製", +"偷鸡不着": "偷雞不著", +"家伙": "傢伙", +"家俱": "傢俱", +"家具": "傢具", +"传布": "傳佈", +"债台高筑": "債臺高築", +"傻里傻气": "傻裡傻氣", +"倾家荡产": "傾家蕩產", +"倾复": "傾複", +"倾复": "傾覆", +"僱佣": "僱佣", +"仪表": "儀錶", +"亿只": "億隻", +"尽尽": "儘儘", +"尽先": "儘先", +"尽其所有": "儘其所有", +"尽力": "儘力", +"尽快": "儘快", +"尽早": "儘早", +"尽是": "儘是", +"尽管": "儘管", +"尽速": "儘速", +"尽量": "儘量", +"允准": "允准", +"兄台": "兄臺", +"充饥": "充饑", +"光采": "光采", +"克里": "克裡", +"克复": "克複", +"入伙": "入伙", +"内制": "內製", +"两只": "兩隻", +"八字胡": "八字鬍", +"八只": "八隻", +"公布": "公佈", +"公干": "公幹", +"公斗": "公斗", +"公历": "公曆", +"六只": "六隻", +"六出": "六齣", +"兼并": "兼併", +"冤雠": "冤讎", +"准予": "准予", +"准假": "准假", +"准将": "准將", +"准考证": "准考證", +"准许": "准許", +"几几": "几几", +"几案": "几案", +"几丝": "几絲", +"凹洞里": "凹洞裡", +"出征": "出征", +"出锤": "出鎚", +"刀削面": "刀削麵", +"刁斗": "刁斗", +"分布": "分佈", +"切面": "切麵", +"刊布": "刊佈", +"划上": "划上", +"划下": "划下", +"划不来": "划不來", +"划了": "划了", +"划具": "划具", +"划出": "划出", +"划到": "划到", +"划动": "划動", +"划去": "划去", +"划子": "划子", +"划得来": "划得來", +"划拳": "划拳", +"划桨": "划槳", +"划水": "划水", +"划算": "划算", +"划船": "划船", +"划艇": "划艇", +"划着": "划著", +"划着走": "划著走", +"划行": "划行", +"划走": "划走", +"划起": "划起", +"划进": "划進", +"划过": "划過", +"初征": "初征", +"别致": "別緻", +"别着": "別著", +"别只": "別隻", +"利比里亚": "利比裡亞", +"刮着": "刮著", +"刮胡刀": "刮鬍刀", +"剃发": "剃髮", +"剃须": "剃鬚", +"削发": "削髮", +"克制": "剋制", +"克星": "剋星", +"克服": "剋服", +"克死": "剋死", +"克薄": "剋薄", +"前仆后继": "前仆後繼", +"前台": "前臺", +"前车之复": "前車之覆", +"刚才": "剛纔", +"剪发": "剪髮", +"割舍": "割捨", +"创制": "創製", +"加里宁": "加裡寧", +"动荡": "動蕩", +"劳力士表": "勞力士錶", +"包准": "包准", +"包谷": "包穀", +"北斗": "北斗", +"北回": "北迴", +"匡复": "匡複", +"匪干": "匪幹", +"十卷": "十卷", +"十台": "十臺", +"十只": "十隻", +"十出": "十齣", +"千丝万缕": "千絲萬縷", +"千回百折": "千迴百折", +"千回百转": "千迴百轉", +"千钧一发": "千鈞一髮", +"千只": "千隻", +"升斗小民": "升斗小民", +"半只": "半隻", +"南岳": "南嶽", +"南征": "南征", +"南台": "南臺", +"南回": "南迴", +"卡里": "卡裡", +"印制": "印製", +"卷入": "卷入", +"卷取": "卷取", +"卷土重来": "卷土重來", +"卷子": "卷子", +"卷宗": "卷宗", +"卷尺": "卷尺", +"卷层云": "卷層雲", +"卷帙": "卷帙", +"卷扬机": "卷揚機", +"卷曲": "卷曲", +"卷染": "卷染", +"卷烟": "卷煙", +"卷筒": "卷筒", +"卷纬": "卷緯", +"卷绕": "卷繞", +"卷装": "卷裝", +"卷轴": "卷軸", +"卷云": "卷雲", +"卷领": "卷領", +"卷发": "卷髮", +"卷须": "卷鬚", +"参与": "參与", +"参与者": "參与者", +"参合": "參合", +"参考价值": "參考價值", +"参与": "參與", +"参与人员": "參與人員", +"参与制": "參與制", +"参与感": "參與感", +"参与者": "參與者", +"参观团": "參觀團", +"参观团体": "參觀團體", +"参阅": "參閱", +"反冲": "反衝", +"反复": "反複", +"反复": "反覆", +"取舍": "取捨", +"口里": "口裡", +"只准": "只准", +"只冲": "只衝", +"叮当": "叮噹", +"可怜虫": "可憐虫", +"可紧可松": "可緊可鬆", +"台制": "台製", +"司令台": "司令臺", +"吃着不尽": "吃著不盡", +"吃里扒外": "吃裡扒外", +"吃里爬外": "吃裡爬外", +"各吊": "各吊", +"合伙": "合伙", +"合并": "合併", +"合着": "合著", +"合着者": "合著者", +"吊上": "吊上", +"吊下": "吊下", +"吊了": "吊了", +"吊个": "吊個", +"吊儿郎当": "吊兒郎當", +"吊到": "吊到", +"吊去": "吊去", +"吊取": "吊取", +"吊吊": "吊吊", +"吊嗓": "吊嗓", +"吊好": "吊好", +"吊子": "吊子", +"吊带": "吊帶", +"吊带裤": "吊帶褲", +"吊床": "吊床", +"吊得": "吊得", +"吊挂": "吊掛", +"吊挂着": "吊掛著", +"吊杆": "吊杆", +"吊架": "吊架", +"吊桶": "吊桶", +"吊杆": "吊桿", +"吊桥": "吊橋", +"吊死": "吊死", +"吊灯": "吊燈", +"吊环": "吊環", +"吊盘": "吊盤", +"吊索": "吊索", +"吊着": "吊著", +"吊装": "吊裝", +"吊裤": "吊褲", +"吊裤带": "吊褲帶", +"吊袜": "吊襪", +"吊走": "吊走", +"吊起": "吊起", +"吊车": "吊車", +"吊钩": "吊鉤", +"吊销": "吊銷", +"吊钟": "吊鐘", +"同伙": "同伙", +"名表": "名錶", +"后冠": "后冠", +"后土": "后土", +"后妃": "后妃", +"后座": "后座", +"后稷": "后稷", +"后羿": "后羿", +"后里": "后里", +"向着": "向著", +"吞并": "吞併", +"吹发": "吹髮", +"吕后": "呂后", +"獃里獃气": "呆裡呆氣", +"周而复始": "周而複始", +"呼吁": "呼籲", +"和面": "和麵", +"哪里": "哪裡", +"哭脏": "哭髒", +"问卷": "問卷", +"喝采": "喝采", +"单干": "單干", +"单只": "單隻", +"嘴里": 
"嘴裡", +"恶心": "噁心", +"当啷": "噹啷", +"当当": "噹噹", +"噜苏": "嚕囌", +"向导": "嚮導", +"向往": "嚮往", +"向应": "嚮應", +"向日": "嚮日", +"向迩": "嚮邇", +"严丝合缝": "嚴絲合縫", +"严复": "嚴複", +"四舍五入": "四捨五入", +"四只": "四隻", +"四出": "四齣", +"回丝": "回絲", +"回着": "回著", +"回荡": "回蕩", +"回复": "回覆", +"回采": "回采", +"圈子里": "圈子裡", +"圈里": "圈裡", +"国历": "國曆", +"国雠": "國讎", +"园里": "園裡", +"图里": "圖裡", +"土里": "土裡", +"土制": "土製", +"地志": "地誌", +"坍台": "坍臺", +"坑里": "坑裡", +"坦荡": "坦蕩", +"垂发": "垂髮", +"垮台": "垮臺", +"埋布": "埋佈", +"城里": "城裡", +"基干": "基幹", +"报复": "報複", +"塌台": "塌臺", +"塔台": "塔臺", +"涂着": "塗著", +"墓志": "墓誌", +"墨斗": "墨斗", +"墨索里尼": "墨索裡尼", +"垦复": "墾複", +"垄断价格": "壟斷價格", +"垄断资产": "壟斷資產", +"垄断集团": "壟斷集團", +"壶里": "壺裡", +"寿面": "壽麵", +"夏天里": "夏天裡", +"夏历": "夏曆", +"外制": "外製", +"多冲": "多衝", +"多采多姿": "多采多姿", +"多么": "多麼", +"夜光表": "夜光錶", +"夜里": "夜裡", +"梦里": "夢裡", +"大伙": "大伙", +"大卷": "大卷", +"大干": "大干", +"大干": "大幹", +"大锤": "大鎚", +"大只": "大隻", +"天后": "天后", +"天干": "天干", +"天文台": "天文臺", +"天翻地复": "天翻地覆", +"太后": "太后", +"奏折": "奏摺", +"女丑": "女丑", +"女佣": "女佣", +"好家夥": "好傢夥", +"好戏连台": "好戲連臺", +"如法泡制": "如法泡製", +"妆台": "妝臺", +"姜太公": "姜太公", +"姜子牙": "姜子牙", +"姜丝": "姜絲", +"字汇": "字彙", +"字里行间": "字裡行間", +"存折": "存摺", +"孟姜女": "孟姜女", +"宇宙志": "宇宙誌", +"定准": "定准", +"定制": "定製", +"宣布": "宣佈", +"宫里": "宮裡", +"家伙": "家伙", +"家里": "家裡", +"密布": "密佈", +"寇雠": "寇讎", +"实干": "實幹", +"写字台": "寫字檯", +"写字台": "寫字臺", +"宽松": "寬鬆", +"封面里": "封面裡", +"射干": "射干", +"对表": "對錶", +"小丑": "小丑", +"小伙": "小伙", +"小只": "小隻", +"少吊": "少吊", +"尺布斗粟": "尺布斗粟", +"尼克松": "尼克鬆", +"尼采": "尼采", +"尿斗": "尿斗", +"局里": "局裡", +"居里": "居裡", +"屋子里": "屋子裡", +"屋里": "屋裡", +"展布": "展佈", +"屡仆屡起": "屢仆屢起", +"屯里": "屯裡", +"山岳": "山嶽", +"山里": "山裡", +"峰回": "峰迴", +"巡回": "巡迴", +"巧干": "巧幹", +"巴尔干": "巴爾幹", +"巴里": "巴裡", +"巷里": "巷裡", +"市里": "市裡", +"布谷": "布穀", +"希腊": "希腊", +"帘子": "帘子", +"帘布": "帘布", +"席卷": "席卷", +"带团参加": "帶團參加", +"带发修行": "帶髮修行", +"干休": "干休", +"干系": "干係", +"干卿何事": "干卿何事", +"干将": "干將", +"干戈": "干戈", +"干挠": "干撓", +"干扰": "干擾", +"干支": "干支", +"干政": "干政", +"干时": "干時", +"干涉": "干涉", +"干犯": "干犯", +"干与": "干與", +"干着急": "干著急", +"干贝": "干貝", +"干预": "干預", +"平台": "平臺", +"年历": "年曆", +"年里": "年裡", +"干上": "幹上", +"干下去": "幹下去", +"干了": "幹了", +"干事": "幹事", +"干些": "幹些", +"干个": "幹個", +"干劲": "幹勁", +"干员": "幹員", +"干吗": "幹嗎", +"干嘛": "幹嘛", +"干坏事": "幹壞事", +"干完": "幹完", +"干得": "幹得", +"干性油": "幹性油", +"干才": "幹才", +"干掉": "幹掉", +"干校": "幹校", +"干活": "幹活", +"干流": "幹流", +"干球温度": "幹球溫度", +"干线": "幹線", +"干练": "幹練", +"干警": "幹警", +"干起来": "幹起來", +"干路": "幹路", +"干道": "幹道", +"干部": "幹部", +"干么": "幹麼", +"几丝": "幾絲", +"几只": "幾隻", +"几出": "幾齣", +"底里": "底裡", +"康采恩": "康采恩", +"庙里": "廟裡", +"建台": "建臺", +"弄脏": "弄髒", +"弔卷": "弔卷", +"弘历": "弘曆", +"别扭": "彆扭", +"别拗": "彆拗", +"别气": "彆氣", +"别脚": "彆腳", +"别着": "彆著", +"弹子台": "彈子檯", +"弹药": "彈葯", +"汇报": "彙報", +"汇整": "彙整", +"汇编": "彙編", +"汇总": "彙總", +"汇纂": "彙纂", +"汇辑": "彙輯", +"汇集": "彙集", +"形单影只": "形單影隻", +"影后": "影后", +"往里": "往裡", +"往复": "往複", +"征伐": "征伐", +"征兵": "征兵", +"征尘": "征塵", +"征夫": "征夫", +"征战": "征戰", +"征收": "征收", +"征服": "征服", +"征求": "征求", +"征发": "征發", +"征衣": "征衣", +"征讨": "征討", +"征途": "征途", +"后台": "後臺", +"从里到外": "從裡到外", +"从里向外": "從裡向外", +"复雠": "復讎", +"复辟": "復辟", +"德干高原": "德干高原", +"心愿": "心愿", +"心荡神驰": "心蕩神馳", +"心里": "心裡", +"忙里": "忙裡", +"快干": "快幹", +"快冲": "快衝", +"怎么": "怎麼", +"怎么着": "怎麼著", +"怒发冲冠": "怒髮衝冠", +"急冲而下": "急衝而下", +"怪里怪气": "怪裡怪氣", +"恩准": "恩准", +"情有所钟": "情有所鍾", +"意面": "意麵", +"慌里慌张": "慌裡慌張", +"慰借": "慰藉", +"忧郁": "憂郁", +"凭吊": "憑吊", +"凭借": "憑藉", +"凭借着": "憑藉著", +"蒙懂": "懞懂", +"怀里": "懷裡", +"怀表": "懷錶", +"悬吊": "懸吊", +"恋恋不舍": "戀戀不捨", +"戏台": "戲臺", +"戴表": "戴錶", +"戽斗": "戽斗", +"房里": "房裡", +"手不释卷": "手不釋卷", +"手卷": "手卷", +"手折": "手摺", +"手里": "手裡", +"手表": "手錶", +"手松": "手鬆", +"才干": "才幹", +"才高八斗": "才高八斗", 
+"打谷": "打穀", +"扞御": "扞禦", +"批准": "批准", +"批复": "批複", +"批复": "批覆", +"承制": "承製", +"抗御": "抗禦", +"折冲": "折衝", +"披复": "披覆", +"披发": "披髮", +"抱朴": "抱朴", +"抵御": "抵禦", +"拆伙": "拆伙", +"拆台": "拆臺", +"拈须": "拈鬚", +"拉纤": "拉縴", +"拉面": "拉麵", +"拖吊": "拖吊", +"拗别": "拗彆", +"拮据": "拮据", +"振荡": "振蕩", +"捍御": "捍禦", +"舍不得": "捨不得", +"舍出": "捨出", +"舍去": "捨去", +"舍命": "捨命", +"舍己从人": "捨己從人", +"舍己救人": "捨己救人", +"舍己为人": "捨己為人", +"舍己为公": "捨己為公", +"舍己为国": "捨己為國", +"舍得": "捨得", +"舍我其谁": "捨我其誰", +"舍本逐末": "捨本逐末", +"舍弃": "捨棄", +"舍死忘生": "捨死忘生", +"舍生": "捨生", +"舍短取长": "捨短取長", +"舍身": "捨身", +"舍车保帅": "捨車保帥", +"舍近求远": "捨近求遠", +"捲发": "捲髮", +"捵面": "捵麵", +"扫荡": "掃蕩", +"掌柜": "掌柜", +"排骨面": "排骨麵", +"挂帘": "掛帘", +"挂面": "掛麵", +"接着说": "接著說", +"提心吊胆": "提心吊膽", +"插图卷": "插圖卷", +"换吊": "換吊", +"换只": "換隻", +"换发": "換髮", +"摇荡": "搖蕩", +"搭伙": "搭伙", +"折合": "摺合", +"折奏": "摺奏", +"折子": "摺子", +"折尺": "摺尺", +"折扇": "摺扇", +"折梯": "摺梯", +"折椅": "摺椅", +"折叠": "摺疊", +"折痕": "摺痕", +"折篷": "摺篷", +"折纸": "摺紙", +"折裙": "摺裙", +"撒布": "撒佈", +"撚须": "撚鬚", +"撞球台": "撞球檯", +"擂台": "擂臺", +"担仔面": "擔仔麵", +"担担面": "擔擔麵", +"担着": "擔著", +"担负着": "擔負著", +"据云": "據云", +"擢发难数": "擢髮難數", +"摆布": "擺佈", +"摄制": "攝製", +"支干": "支幹", +"收获": "收穫", +"改制": "改製", +"攻克": "攻剋", +"放荡": "放蕩", +"放松": "放鬆", +"叙说着": "敘說著", +"散伙": "散伙", +"散布": "散佈", +"散荡": "散蕩", +"散发": "散髮", +"整只": "整隻", +"整出": "整齣", +"文采": "文采", +"斗六": "斗六", +"斗南": "斗南", +"斗大": "斗大", +"斗子": "斗子", +"斗室": "斗室", +"斗方": "斗方", +"斗栱": "斗栱", +"斗笠": "斗笠", +"斗箕": "斗箕", +"斗篷": "斗篷", +"斗胆": "斗膽", +"斗转参横": "斗轉參橫", +"斗量": "斗量", +"斗门": "斗門", +"料斗": "料斗", +"斯里兰卡": "斯裡蘭卡", +"新历": "新曆", +"断头台": "斷頭臺", +"方才": "方纔", +"施舍": "施捨", +"旋绕着": "旋繞著", +"旋回": "旋迴", +"族里": "族裡", +"日历": "日曆", +"日志": "日誌", +"日进斗金": "日進斗金", +"明了": "明瞭", +"明窗净几": "明窗淨几", +"明里": "明裡", +"星斗": "星斗", +"星历": "星曆", +"星移斗换": "星移斗換", +"星移斗转": "星移斗轉", +"星罗棋布": "星羅棋佈", +"星辰表": "星辰錶", +"春假里": "春假裡", +"春天里": "春天裡", +"晃荡": "晃蕩", +"景致": "景緻", +"暗地里": "暗地裡", +"暗沟里": "暗溝裡", +"暗里": "暗裡", +"历数": "曆數", +"历书": "曆書", +"历法": "曆法", +"书卷": "書卷", +"会干": "會幹", +"会里": "會裡", +"月历": "月曆", +"月台": "月臺", +"有只": "有隻", +"木制": "木製", +"本台": "本臺", +"朴子": "朴子", +"朴实": "朴實", +"朴硝": "朴硝", +"朴素": "朴素", +"朴资茅斯": "朴資茅斯", +"村里": "村裡", +"束发": "束髮", +"东岳": "東嶽", +"东征": "東征", +"松赞干布": "松贊干布", +"板着脸": "板著臉", +"板荡": "板蕩", +"枕借": "枕藉", +"林宏岳": "林宏嶽", +"枝干": "枝幹", +"枯干": "枯幹", +"某只": "某隻", +"染发": "染髮", +"柜上": "柜上", +"柜台": "柜台", +"柜子": "柜子", +"查卷": "查卷", +"查号台": "查號臺", +"校雠学": "校讎學", +"核准": "核准", +"核复": "核覆", +"格里": "格裡", +"案卷": "案卷", +"条干": "條幹", +"棉卷": "棉卷", +"棉制": "棉製", +"植发": "植髮", +"楼台": "樓臺", +"标志着": "標志著", +"标致": "標緻", +"标志": "標誌", +"模制": "模製", +"树干": "樹幹", +"横征暴敛": "橫征暴斂", +"横冲": "橫衝", +"档卷": "檔卷", +"检复": "檢覆", +"台子": "檯子", +"台布": "檯布", +"台灯": "檯燈", +"台球": "檯球", +"台面": "檯面", +"柜台": "櫃檯", +"柜台": "櫃臺", +"栏干": "欄干", +"欺蒙": "欺矇", +"歌后": "歌后", +"欧几里得": "歐幾裡得", +"正当着": "正當著", +"武后": "武后", +"武松": "武鬆", +"归并": "歸併", +"死里求生": "死裡求生", +"死里逃生": "死裡逃生", +"残卷": "殘卷", +"杀虫药": "殺虫藥", +"壳里": "殼裡", +"母后": "母后", +"每只": "每隻", +"比干": "比干", +"毛卷": "毛卷", +"毛发": "毛髮", +"毫发": "毫髮", +"气冲牛斗": "氣沖牛斗", +"气象台": "氣象臺", +"氯霉素": "氯黴素", +"水斗": "水斗", +"水里": "水裡", +"水表": "水錶", +"永历": "永曆", +"污蔑": "汙衊", +"池里": "池裡", +"污蔑": "污衊", +"沈着": "沈著", +"没事干": "沒事幹", +"没精打采": "沒精打采", +"冲着": "沖著", +"沙里淘金": "沙裡淘金", +"河里": "河裡", +"油面": "油麵", +"泡面": "泡麵", +"泰斗": "泰斗", +"洗手不干": "洗手不幹", +"洗发精": "洗髮精", +"派团参加": "派團參加", +"流荡": "流蕩", +"浩荡": "浩蕩", +"浪琴表": "浪琴錶", +"浪荡": "浪蕩", +"浮荡": "浮蕩", +"海里": "海裡", +"涂着": "涂著", +"液晶表": "液晶錶", +"凉面": "涼麵", +"淡朱": "淡硃", +"淫荡": "淫蕩", +"测验卷": "測驗卷", +"港制": "港製", +"游荡": "游蕩", +"凑合着": "湊合著", +"湖里": "湖裡", +"汤团": "湯糰", +"汤面": "湯麵", +"卤制": "滷製", +"卤面": "滷麵", +"满布": "滿佈", +"漂荡": "漂蕩", 
+"漏斗": "漏斗", +"演奏台": "演奏臺", +"潭里": "潭裡", +"激荡": "激蕩", +"浓郁": "濃郁", +"浓发": "濃髮", +"湿地松": "濕地鬆", +"蒙蒙": "濛濛", +"蒙雾": "濛霧", +"瀛台": "瀛臺", +"弥漫": "瀰漫", +"弥漫着": "瀰漫著", +"火并": "火併", +"灰蒙": "灰濛", +"炒面": "炒麵", +"炮制": "炮製", +"炸药": "炸葯", +"炸酱面": "炸醬麵", +"为着": "為著", +"乌干达": "烏干達", +"乌苏里江": "烏蘇裡江", +"乌发": "烏髮", +"乌龙面": "烏龍麵", +"烘制": "烘製", +"烽火台": "烽火臺", +"无干": "無干", +"无精打采": "無精打采", +"炼制": "煉製", +"烟卷儿": "煙卷兒", +"烟斗": "煙斗", +"烟斗丝": "煙斗絲", +"烟台": "煙臺", +"照准": "照准", +"熨斗": "熨斗", +"灯台": "燈臺", +"燎发": "燎髮", +"烫发": "燙髮", +"烫面": "燙麵", +"烛台": "燭臺", +"炉台": "爐臺", +"爽荡": "爽蕩", +"片言只语": "片言隻語", +"牛肉面": "牛肉麵", +"牛只": "牛隻", +"特准": "特准", +"特征": "特征", +"特里": "特裡", +"特制": "特製", +"牵系": "牽繫", +"狼借": "狼藉", +"猛冲": "猛衝", +"奖杯": "獎盃", +"获准": "獲准", +"率团参加": "率團參加", +"王侯后": "王侯后", +"王后": "王后", +"班里": "班裡", +"理发": "理髮", +"瑶台": "瑤臺", +"甚么": "甚麼", +"甜面酱": "甜麵醬", +"生力面": "生力麵", +"生锈": "生鏽", +"生发": "生髮", +"田里": "田裡", +"由馀": "由余", +"男佣": "男佣", +"男用表": "男用錶", +"留发": "留髮", +"畚斗": "畚斗", +"当着": "當著", +"疏松": "疏鬆", +"疲困": "疲睏", +"病症": "病癥", +"症候": "癥候", +"症状": "癥狀", +"症结": "癥結", +"登台": "登臺", +"发布": "發佈", +"发着": "發著", +"发面": "發麵", +"发霉": "發黴", +"白卷": "白卷", +"白干儿": "白干兒", +"白发": "白髮", +"白面": "白麵", +"百里": "百裡", +"百只": "百隻", +"皇后": "皇后", +"皇历": "皇曆", +"皓发": "皓髮", +"皮里阳秋": "皮裏陽秋", +"皮里春秋": "皮裡春秋", +"皮制": "皮製", +"皱折": "皺摺", +"盒里": "盒裡", +"监制": "監製", +"盘里": "盤裡", +"盘回": "盤迴", +"直接参与": "直接參与", +"直冲": "直衝", +"相克": "相剋", +"相干": "相干", +"相冲": "相衝", +"看台": "看臺", +"眼帘": "眼帘", +"眼眶里": "眼眶裡", +"眼里": "眼裡", +"困乏": "睏乏", +"睡着了": "睡著了", +"了如": "瞭如", +"了望": "瞭望", +"了然": "瞭然", +"了若指掌": "瞭若指掌", +"了解": "瞭解", +"蒙住": "矇住", +"蒙昧无知": "矇昧無知", +"蒙混": "矇混", +"蒙蒙": "矇矇", +"蒙眬": "矇矓", +"蒙蔽": "矇蔽", +"蒙骗": "矇騙", +"短发": "短髮", +"石英表": "石英錶", +"研制": "研製", +"砰当": "砰噹", +"砲台": "砲臺", +"朱唇皓齿": "硃唇皓齒", +"朱批": "硃批", +"朱砂": "硃砂", +"朱笔": "硃筆", +"朱红色": "硃紅色", +"朱色": "硃色", +"硬干": "硬幹", +"砚台": "硯臺", +"碑志": "碑誌", +"磁制": "磁製", +"磨制": "磨製", +"示复": "示覆", +"社里": "社裡", +"神采": "神采", +"御侮": "禦侮", +"御寇": "禦寇", +"御寒": "禦寒", +"御敌": "禦敵", +"秃发": "禿髮", +"秀发": "秀髮", +"私下里": "私下裡", +"秋天里": "秋天裡", +"秋裤": "秋褲", +"秒表": "秒錶", +"稀松": "稀鬆", +"禀复": "稟覆", +"稻谷": "稻穀", +"稽征": "稽征", +"谷仓": "穀倉", +"谷场": "穀場", +"谷子": "穀子", +"谷壳": "穀殼", +"谷物": "穀物", +"谷皮": "穀皮", +"谷神": "穀神", +"谷粒": "穀粒", +"谷舱": "穀艙", +"谷苗": "穀苗", +"谷草": "穀草", +"谷贱伤农": "穀賤傷農", +"谷道": "穀道", +"谷雨": "穀雨", +"谷类": "穀類", +"积极参与": "積极參与", +"积极参加": "積极參加", +"空荡": "空蕩", +"窗帘": "窗帘", +"窗明几净": "窗明几淨", +"窗台": "窗檯", +"窗台": "窗臺", +"窝里": "窩裡", +"窝阔台": "窩闊臺", +"穷追不舍": "窮追不捨", +"笆斗": "笆斗", +"笑里藏刀": "笑裡藏刀", +"第一卷": "第一卷", +"筋斗": "筋斗", +"答卷": "答卷", +"答复": "答複", +"答复": "答覆", +"筵几": "筵几", +"箕斗": "箕斗", +"签着": "簽著", +"吁求": "籲求", +"吁请": "籲請", +"粗制": "粗製", +"粗卤": "粗鹵", +"精干": "精幹", +"精明强干": "精明強幹", +"精致": "精緻", +"精制": "精製", +"精辟": "精辟", +"精采": "精采", +"糊里糊涂": "糊裡糊塗", +"团子": "糰子", +"系着": "系著", +"纪历": "紀曆", +"红发": "紅髮", +"红霉素": "紅黴素", +"纡回": "紆迴", +"纳采": "納采", +"素食面": "素食麵", +"素面": "素麵", +"紫微斗数": "紫微斗數", +"细致": "細緻", +"组里": "組裡", +"结发": "結髮", +"绝对参照": "絕對參照", +"丝来线去": "絲來線去", +"丝布": "絲布", +"丝板": "絲板", +"丝瓜布": "絲瓜布", +"丝绒布": "絲絨布", +"丝线": "絲線", +"丝织厂": "絲織廠", +"丝虫": "絲蟲", +"綑吊": "綑吊", +"经卷": "經卷", +"绿霉素": "綠黴素", +"维系": "維繫", +"绾发": "綰髮", +"网里": "網裡", +"紧绷": "緊繃", +"紧绷着": "緊繃著", +"紧追不舍": "緊追不捨", +"编制": "編製", +"编发": "編髮", +"缓冲": "緩衝", +"致密": "緻密", +"萦回": "縈迴", +"县里": "縣裡", +"县志": "縣誌", +"缝里": "縫裡", +"缝制": "縫製", +"纤夫": "縴夫", +"繁复": "繁複", +"绷住": "繃住", +"绷子": "繃子", +"绷带": "繃帶", +"绷紧": "繃緊", +"绷脸": "繃臉", +"绷着": "繃著", +"绷着脸": "繃著臉", +"绷着脸儿": "繃著臉兒", +"绷开": "繃開", +"绘制": "繪製", +"系上": "繫上", +"系到": "繫到", +"系囚": "繫囚", +"系心": "繫心", +"系念": "繫念", +"系怀": "繫懷", +"系数": "繫數", +"系于": "繫於", +"系系": 
"繫系", +"系紧": "繫緊", +"系绳": "繫繩", +"系着": "繫著", +"系辞": "繫辭", +"缴卷": "繳卷", +"累囚": "纍囚", +"累累": "纍纍", +"坛子": "罈子", +"坛坛罐罐": "罈罈罐罐", +"骂着": "罵著", +"美制": "美製", +"美发": "美髮", +"翻来复去": "翻來覆去", +"翻天复地": "翻天覆地", +"翻复": "翻覆", +"翻云复雨": "翻雲覆雨", +"老么": "老么", +"老板": "老闆", +"考卷": "考卷", +"耕获": "耕穫", +"聊斋志异": "聊齋誌異", +"联系": "聯係", +"联系": "聯繫", +"肉丝面": "肉絲麵", +"肉羹面": "肉羹麵", +"肉松": "肉鬆", +"肢体": "肢体", +"背向着": "背向著", +"背地里": "背地裡", +"胡里胡涂": "胡裡胡塗", +"能干": "能幹", +"脉冲": "脈衝", +"脱发": "脫髮", +"腊味": "腊味", +"腊笔": "腊筆", +"腊肉": "腊肉", +"脑子里": "腦子裡", +"腰里": "腰裡", +"胶卷": "膠卷", +"自制": "自製", +"自觉自愿": "自覺自愿", +"台上": "臺上", +"台下": "臺下", +"台中": "臺中", +"台北": "臺北", +"台南": "臺南", +"台地": "臺地", +"台塑": "臺塑", +"台大": "臺大", +"台币": "臺幣", +"台座": "臺座", +"台东": "臺東", +"台柱": "臺柱", +"台榭": "臺榭", +"台汽": "臺汽", +"台海": "臺海", +"台澎金马": "臺澎金馬", +"台湾": "臺灣", +"台灯": "臺燈", +"台球": "臺球", +"台省": "臺省", +"台端": "臺端", +"台糖": "臺糖", +"台肥": "臺肥", +"台航": "臺航", +"台视": "臺視", +"台词": "臺詞", +"台车": "臺車", +"台铁": "臺鐵", +"台阶": "臺階", +"台电": "臺電", +"台面": "臺面", +"舂谷": "舂穀", +"兴致": "興緻", +"兴高采烈": "興高采烈", +"旧历": "舊曆", +"舒卷": "舒卷", +"舞台": "舞臺", +"航海历": "航海曆", +"船只": "船隻", +"舰只": "艦隻", +"芬郁": "芬郁", +"花卷": "花卷", +"花盆里": "花盆裡", +"花采": "花采", +"苑里": "苑裡", +"若干": "若干", +"苦干": "苦幹", +"苦里": "苦裏", +"苦卤": "苦鹵", +"范仲淹": "范仲淹", +"范蠡": "范蠡", +"范阳": "范陽", +"茅台": "茅臺", +"茶几": "茶几", +"草丛里": "草叢裡", +"庄里": "莊裡", +"茎干": "莖幹", +"莽荡": "莽蕩", +"菌丝体": "菌絲体", +"菌丝体": "菌絲體", +"华里": "華裡", +"华发": "華髮", +"万卷": "萬卷", +"万历": "萬曆", +"万只": "萬隻", +"落发": "落髮", +"着儿": "著兒", +"着书立说": "著書立說", +"着色软体": "著色軟體", +"着重指出": "著重指出", +"着录": "著錄", +"着录规则": "著錄規則", +"蓄发": "蓄髮", +"蓄须": "蓄鬚", +"蓬发": "蓬髮", +"蓬松": "蓬鬆", +"莲台": "蓮臺", +"荡来荡去": "蕩來蕩去", +"荡女": "蕩女", +"荡妇": "蕩婦", +"荡寇": "蕩寇", +"荡平": "蕩平", +"荡涤": "蕩滌", +"荡漾": "蕩漾", +"荡然": "蕩然", +"荡舟": "蕩舟", +"荡船": "蕩船", +"荡荡": "蕩蕩", +"薑丝": "薑絲", +"薙发": "薙髮", +"借以": "藉以", +"借口": "藉口", +"借故": "藉故", +"借机": "藉機", +"借此": "藉此", +"借由": "藉由", +"借端": "藉端", +"借着": "藉著", +"借借": "藉藉", +"借词": "藉詞", +"借资": "藉資", +"借酒浇愁": "藉酒澆愁", +"藤制": "藤製", +"蕴含着": "蘊含著", +"蕴涵着": "蘊涵著", +"蕴借": "蘊藉", +"萝卜": "蘿蔔", +"虎须": "虎鬚", +"号志": "號誌", +"蜂后": "蜂后", +"蛮干": "蠻幹", +"行事历": "行事曆", +"胡同": "衚衕", +"冲上": "衝上", +"冲下": "衝下", +"冲来": "衝來", +"冲倒": "衝倒", +"冲出": "衝出", +"冲到": "衝到", +"冲刺": "衝刺", +"冲克": "衝剋", +"冲力": "衝力", +"冲劲": "衝勁", +"冲动": "衝動", +"冲去": "衝去", +"冲口": "衝口", +"冲垮": "衝垮", +"冲堂": "衝堂", +"冲压": "衝壓", +"冲天": "衝天", +"冲掉": "衝掉", +"冲撞": "衝撞", +"冲击": "衝擊", +"冲散": "衝散", +"冲决": "衝決", +"冲浪": "衝浪", +"冲激": "衝激", +"冲破": "衝破", +"冲程": "衝程", +"冲突": "衝突", +"冲线": "衝線", +"冲着": "衝著", +"冲冲": "衝衝", +"冲要": "衝要", +"冲起": "衝起", +"冲进": "衝進", +"冲过": "衝過", +"冲锋": "衝鋒", +"表里": "表裡", +"袖里": "袖裡", +"被里": "被裡", +"被复": "被複", +"被复": "被覆", +"被复着": "被覆著", +"被发": "被髮", +"裁并": "裁併", +"裁制": "裁製", +"里面": "裏面", +"里人": "裡人", +"里加": "裡加", +"里外": "裡外", +"里子": "裡子", +"里屋": "裡屋", +"里层": "裡層", +"里布": "裡布", +"里带": "裡帶", +"里弦": "裡弦", +"里应外合": "裡應外合", +"里拉": "裡拉", +"里斯": "裡斯", +"里海": "裡海", +"里脊": "裡脊", +"里衣": "裡衣", +"里里": "裡裡", +"里通外国": "裡通外國", +"里通外敌": "裡通外敵", +"里边": "裡邊", +"里间": "裡間", +"里面": "裡面", +"里头": "裡頭", +"制件": "製件", +"制作": "製作", +"制做": "製做", +"制备": "製備", +"制冰": "製冰", +"制冷": "製冷", +"制剂": "製劑", +"制品": "製品", +"制图": "製圖", +"制成": "製成", +"制法": "製法", +"制为": "製為", +"制片": "製片", +"制版": "製版", +"制程": "製程", +"制糖": "製糖", +"制纸": "製紙", +"制药": "製藥", +"制表": "製表", +"制裁": "製裁", +"制造": "製造", +"制革": "製革", +"制鞋": "製鞋", +"制盐": "製鹽", +"复仞年如": "複仞年如", +"复以百万": "複以百萬", +"复位": "複位", +"复信": "複信", +"复分数": "複分數", +"复列": "複列", +"复利": "複利", +"复印": "複印", +"复原": "複原", +"复句": "複句", +"复合": "複合", +"复名": "複名", +"复员": "複員", +"复壁": "複壁", +"复壮": "複壯", +"复姓": "複姓", +"复字键": "複字鍵", +"复审": "複審", +"复写": "複寫", 
+"复式": "複式", +"复复": "複復", +"复数": "複數", +"复本": "複本", +"复查": "複查", +"复核": "複核", +"复检": "複檢", +"复次": "複次", +"复比": "複比", +"复决": "複決", +"复活": "複活", +"复测": "複測", +"复亩珍": "複畝珍", +"复发": "複發", +"复目": "複目", +"复眼": "複眼", +"复种": "複種", +"复线": "複線", +"复习": "複習", +"复兴社": "複興社", +"复旧": "複舊", +"复色": "複色", +"复叶": "複葉", +"复盖": "複蓋", +"复苏": "複蘇", +"复制": "複製", +"复诊": "複診", +"复词": "複詞", +"复试": "複試", +"复课": "複課", +"复议": "複議", +"复变函数": "複變函數", +"复赛": "複賽", +"复述": "複述", +"复选": "複選", +"复钱": "複錢", +"复杂": "複雜", +"复电": "複電", +"复音": "複音", +"复韵": "複韻", +"衬里": "襯裡", +"西岳": "西嶽", +"西征": "西征", +"西历": "西曆", +"要冲": "要衝", +"要么": "要麼", +"复上": "覆上", +"复亡": "覆亡", +"复住": "覆住", +"复信": "覆信", +"复命": "覆命", +"复在": "覆在", +"复审": "覆審", +"复巢之下": "覆巢之下", +"复成": "覆成", +"复败": "覆敗", +"复文": "覆文", +"复校": "覆校", +"复核": "覆核", +"复水难收": "覆水難收", +"复没": "覆沒", +"复灭": "覆滅", +"复盆": "覆盆", +"复舟": "覆舟", +"复着": "覆著", +"复盖": "覆蓋", +"复盖着": "覆蓋著", +"复试": "覆試", +"复议": "覆議", +"复车": "覆車", +"复载": "覆載", +"复辙": "覆轍", +"复电": "覆電", +"见复": "見覆", +"亲征": "親征", +"观众台": "觀眾臺", +"观台": "觀臺", +"观象台": "觀象臺", +"角落里": "角落裡", +"觔斗": "觔斗", +"触须": "觸鬚", +"订制": "訂製", +"诉说着": "訴說著", +"词汇": "詞彙", +"试卷": "試卷", +"诗卷": "詩卷", +"话里有话": "話裡有話", +"志哀": "誌哀", +"志喜": "誌喜", +"志庆": "誌慶", +"语云": "語云", +"语汇": "語彙", +"诬蔑": "誣衊", +"诵经台": "誦經臺", +"说着": "說著", +"课征": "課征", +"调制": "調製", +"调频台": "調頻臺", +"请参阅": "請參閱", +"讲台": "講臺", +"谢绝参观": "謝絕參觀", +"护发": "護髮", +"雠隙": "讎隙", +"豆腐干": "豆腐干", +"竖着": "豎著", +"丰富多采": "豐富多采", +"丰滨": "豐濱", +"丰滨乡": "豐濱鄉", +"丰采": "豐采", +"象征着": "象徵著", +"贵干": "貴幹", +"贾后": "賈后", +"赈饥": "賑饑", +"贤后": "賢后", +"质朴": "質朴", +"赌台": "賭檯", +"购并": "購併", +"赤松": "赤鬆", +"起吊": "起吊", +"起复": "起複", +"赶制": "趕製", +"跌荡": "跌蕩", +"跟斗": "跟斗", +"跳荡": "跳蕩", +"跳表": "跳錶", +"踬仆": "躓仆", +"躯干": "軀幹", +"车库里": "車庫裡", +"车站里": "車站裡", +"车里": "車裡", +"轻松": "輕鬆", +"轮回": "輪迴", +"转台": "轉檯", +"辛丑": "辛丑", +"辟邪": "辟邪", +"办伙": "辦伙", +"办公台": "辦公檯", +"辞汇": "辭彙", +"农历": "農曆", +"迂回": "迂迴", +"近日里": "近日裡", +"迥然回异": "迥然迴異", +"回光返照": "迴光返照", +"回向": "迴向", +"回圈": "迴圈", +"回廊": "迴廊", +"回形夹": "迴形夾", +"回文": "迴文", +"回旋": "迴旋", +"回流": "迴流", +"回环": "迴環", +"回荡": "迴盪", +"回纹针": "迴紋針", +"回绕": "迴繞", +"回肠": "迴腸", +"回荡": "迴蕩", +"回诵": "迴誦", +"回路": "迴路", +"回转": "迴轉", +"回递性": "迴遞性", +"回避": "迴避", +"回响": "迴響", +"回风": "迴風", +"回首": "迴首", +"迷蒙": "迷濛", +"退伙": "退伙", +"这么着": "這么著", +"这里": "這裏", +"这里": "這裡", +"这只": "這隻", +"这么": "這麼", +"这么着": "這麼著", +"通心面": "通心麵", +"速食面": "速食麵", +"连系": "連繫", +"连台好戏": "連臺好戲", +"游荡": "遊蕩", +"遍布": "遍佈", +"递回": "遞迴", +"远征": "遠征", +"适才": "適纔", +"遮复": "遮覆", +"还冲": "還衝", +"邋里邋遢": "邋裡邋遢", +"那里": "那裡", +"那只": "那隻", +"那么": "那麼", +"那么着": "那麼著", +"邪辟": "邪辟", +"郁烈": "郁烈", +"郁穆": "郁穆", +"郁郁": "郁郁", +"郁闭": "郁閉", +"郁馥": "郁馥", +"乡愿": "鄉愿", +"乡里": "鄉裡", +"邻里": "鄰裡", +"配合着": "配合著", +"配制": "配製", +"酒杯": "酒盃", +"酒坛": "酒罈", +"酥松": "酥鬆", +"醋坛": "醋罈", +"酝借": "醞藉", +"酝酿着": "醞釀著", +"医药": "醫葯", +"醲郁": "醲郁", +"酿制": "釀製", +"采地": "采地", +"采女": "采女", +"采声": "采聲", +"采色": "采色", +"采邑": "采邑", +"里程表": "里程錶", +"重折": "重摺", +"重复": "重複", +"重复": "重覆", +"重锤": "重鎚", +"野台戏": "野臺戲", +"金斗": "金斗", +"金表": "金錶", +"金发": "金髮", +"金霉素": "金黴素", +"钉锤": "釘鎚", +"银朱": "銀硃", +"银发": "銀髮", +"铜制": "銅製", +"铝制": "鋁製", +"钢制": "鋼製", +"录着": "錄著", +"录制": "錄製", +"表带": "錶帶", +"表店": "錶店", +"表厂": "錶廠", +"表壳": "錶殼", +"表链": "錶鏈", +"表面": "錶面", +"锅台": "鍋臺", +"锻鍊出": "鍛鍊出", +"锻鍊身体": "鍛鍊身体", +"锲而不舍": "鍥而不捨", +"锤儿": "鎚兒", +"锤子": "鎚子", +"锤头": "鎚頭", +"链霉素": "鏈黴素", +"镜台": "鏡臺", +"锈病": "鏽病", +"锈菌": "鏽菌", +"锈蚀": "鏽蝕", +"钟表": "鐘錶", +"铁锤": "鐵鎚", +"铁锈": "鐵鏽", +"长征": "長征", +"长发": "長髮", +"长须鲸": "長鬚鯨", +"门帘": "門帘", +"门斗": "門斗", +"门里": "門裡", +"开伙": "開伙", +"开卷": "開卷", +"开诚布公": "開誠佈公", +"开采": "開采", +"閒情逸致": "閒情逸緻", +"閒荡": "閒蕩", 
+"间不容发": "間不容髮", +"闵采尔": "閔采爾", +"阅卷": "閱卷", +"阑干": "闌干", +"关系": "關係", +"关系着": "關係著", +"防御": "防禦", +"防锈": "防鏽", +"防台": "防颱", +"阿斗": "阿斗", +"阿里": "阿裡", +"除旧布新": "除舊佈新", +"阴干": "陰干", +"阴历": "陰曆", +"阴郁": "陰郁", +"陆征祥": "陸征祥", +"阳春面": "陽春麵", +"阳历": "陽曆", +"阳台": "陽臺", +"只字": "隻字", +"只影": "隻影", +"只手遮天": "隻手遮天", +"只眼": "隻眼", +"只言片语": "隻言片語", +"只身": "隻身", +"雅致": "雅緻", +"雇佣": "雇佣", +"双折": "雙摺", +"杂志": "雜誌", +"鸡丝": "雞絲", +"鸡丝面": "雞絲麵", +"鸡腿面": "雞腿麵", +"鸡只": "雞隻", +"难舍": "難捨", +"雪里": "雪裡", +"云须": "雲鬚", +"电子表": "電子錶", +"电台": "電臺", +"电冲": "電衝", +"电复": "電覆", +"电视台": "電視臺", +"电表": "電錶", +"震荡": "震蕩", +"雾里": "霧裡", +"露台": "露臺", +"灵台": "靈臺", +"青瓦台": "青瓦臺", +"青霉": "青黴", +"面朝着": "面朝著", +"面临着": "面臨著", +"鞋里": "鞋裡", +"鞣制": "鞣製", +"秋千": "鞦韆", +"鞭辟入里": "鞭辟入裡", +"韩国制": "韓國製", +"韩制": "韓製", +"预制": "預製", +"颁布": "頒佈", +"头里": "頭裡", +"头发": "頭髮", +"颊须": "頰鬚", +"颠仆": "顛仆", +"颠复": "顛複", +"颠复": "顛覆", +"显着标志": "顯著標志", +"风土志": "風土誌", +"风斗": "風斗", +"风物志": "風物誌", +"风里": "風裡", +"风采": "風采", +"台风": "颱風", +"刮了": "颳了", +"刮倒": "颳倒", +"刮去": "颳去", +"刮得": "颳得", +"刮着": "颳著", +"刮走": "颳走", +"刮起": "颳起", +"刮风": "颳風", +"飘荡": "飄蕩", +"饭团": "飯糰", +"饼干": "餅干", +"馄饨面": "餛飩麵", +"饥不择食": "饑不擇食", +"饥寒": "饑寒", +"饥民": "饑民", +"饥渴": "饑渴", +"饥溺": "饑溺", +"饥荒": "饑荒", +"饥饱": "饑飽", +"饥饿": "饑餓", +"饥馑": "饑饉", +"首当其冲": "首當其衝", +"香郁": "香郁", +"馥郁": "馥郁", +"马里": "馬裡", +"马表": "馬錶", +"骀荡": "駘蕩", +"腾冲": "騰衝", +"骨子里": "骨子裡", +"骨干": "骨幹", +"骨灰坛": "骨灰罈", +"肮脏": "骯髒", +"脏乱": "髒亂", +"脏兮兮": "髒兮兮", +"脏字": "髒字", +"脏得": "髒得", +"脏东西": "髒東西", +"脏水": "髒水", +"脏的": "髒的", +"脏话": "髒話", +"脏钱": "髒錢", +"高干": "高幹", +"高台": "高臺", +"髭须": "髭鬚", +"发型": "髮型", +"发夹": "髮夾", +"发妻": "髮妻", +"发姐": "髮姐", +"发带": "髮帶", +"发廊": "髮廊", +"发式": "髮式", +"发指": "髮指", +"发捲": "髮捲", +"发根": "髮根", +"发毛": "髮毛", +"发油": "髮油", +"发状": "髮狀", +"发短心长": "髮短心長", +"发端": "髮端", +"发结": "髮結", +"发丝": "髮絲", +"发网": "髮網", +"发肤": "髮膚", +"发胶": "髮膠", +"发菜": "髮菜", +"发蜡": "髮蠟", +"发辫": "髮辮", +"发针": "髮針", +"发长": "髮長", +"发际": "髮際", +"发霜": "髮霜", +"发髻": "髮髻", +"发鬓": "髮鬢", +"鬅松": "鬅鬆", +"松了": "鬆了", +"松些": "鬆些", +"松劲": "鬆勁", +"松动": "鬆動", +"松口": "鬆口", +"松土": "鬆土", +"松弛": "鬆弛", +"松快": "鬆快", +"松懈": "鬆懈", +"松手": "鬆手", +"松散": "鬆散", +"松林": "鬆林", +"松柔": "鬆柔", +"松毛虫": "鬆毛蟲", +"松浮": "鬆浮", +"松涛": "鬆濤", +"松科": "鬆科", +"松节油": "鬆節油", +"松绑": "鬆綁", +"松紧": "鬆緊", +"松缓": "鬆緩", +"松脆": "鬆脆", +"松脱": "鬆脫", +"松起": "鬆起", +"松软": "鬆軟", +"松通": "鬆通", +"松开": "鬆開", +"松饼": "鬆餅", +"松松": "鬆鬆", +"鬈发": "鬈髮", +"胡子": "鬍子", +"胡梢": "鬍梢", +"胡渣": "鬍渣", +"胡髭": "鬍髭", +"胡须": "鬍鬚", +"须根": "鬚根", +"须毛": "鬚毛", +"须生": "鬚生", +"须眉": "鬚眉", +"须发": "鬚髮", +"须须": "鬚鬚", +"鬓发": "鬢髮", +"斗着": "鬥著", +"闹着玩儿": "鬧著玩儿", +"闹着玩儿": "鬧著玩兒", +"郁郁": "鬱郁", +"鱼松": "魚鬆", +"鲸须": "鯨鬚", +"鲇鱼": "鯰魚", +"鹤发": "鶴髮", +"卤化": "鹵化", +"卤味": "鹵味", +"卤族": "鹵族", +"卤水": "鹵水", +"卤汁": "鹵汁", +"卤簿": "鹵簿", +"卤素": "鹵素", +"卤莽": "鹵莽", +"卤钝": "鹵鈍", +"咸味": "鹹味", +"咸土": "鹹土", +"咸度": "鹹度", +"咸得": "鹹得", +"咸水": "鹹水", +"咸海": "鹹海", +"咸淡": "鹹淡", +"咸湖": "鹹湖", +"咸汤": "鹹湯", +"咸的": "鹹的", +"咸肉": "鹹肉", +"咸菜": "鹹菜", +"咸蛋": "鹹蛋", +"咸猪肉": "鹹豬肉", +"咸类": "鹹類", +"咸鱼": "鹹魚", +"咸鸭蛋": "鹹鴨蛋", +"咸卤": "鹹鹵", +"咸咸": "鹹鹹", +"盐卤": "鹽鹵", +"面价": "麵價", +"面包": "麵包", +"面团": "麵團", +"面店": "麵店", +"面厂": "麵廠", +"面杖": "麵杖", +"面条": "麵條", +"面灰": "麵灰", +"面皮": "麵皮", +"面筋": "麵筋", +"面粉": "麵粉", +"面糊": "麵糊", +"面线": "麵線", +"面茶": "麵茶", +"面食": "麵食", +"面饺": "麵餃", +"面饼": "麵餅", +"麻酱面": "麻醬麵", +"黄历": "黃曆", +"黄发垂髫": "黃髮垂髫", +"黑发": "黑髮", +"黑松": "黑鬆", +"霉毒": "黴毒", +"霉菌": "黴菌", +"鼓里": "鼓裡", +"冬冬": "鼕鼕", +"龙卷": "龍卷", +"龙须": "龍鬚", +} + +zh2Hans = { +'顯著': '显著', +'土著': '土著', +'印表機': '打印机', +'說明檔案': '帮助文件', +"瀋": "沈", +"畫": "划", +"鍾": "钟", +"靦": "腼", +"餘": "余", +"鯰": "鲇", +"鹼": "碱", +"㠏": "㟆", +"𡞵": "㛟", +"万": 
"万", +"与": "与", +"丑": "丑", +"丟": "丢", +"並": "并", +"丰": "丰", +"么": "么", +"乾": "干", +"乾坤": "乾坤", +"乾隆": "乾隆", +"亂": "乱", +"云": "云", +"亙": "亘", +"亞": "亚", +"仆": "仆", +"价": "价", +"伙": "伙", +"佇": "伫", +"佈": "布", +"体": "体", +"余": "余", +"余": "馀", +"佣": "佣", +"併": "并", +"來": "来", +"侖": "仑", +"侶": "侣", +"俁": "俣", +"係": "系", +"俔": "伣", +"俠": "侠", +"倀": "伥", +"倆": "俩", +"倈": "俫", +"倉": "仓", +"個": "个", +"們": "们", +"倫": "伦", +"偉": "伟", +"側": "侧", +"偵": "侦", +"偽": "伪", +"傑": "杰", +"傖": "伧", +"傘": "伞", +"備": "备", +"傢": "家", +"傭": "佣", +"傯": "偬", +"傳": "传", +"傴": "伛", +"債": "债", +"傷": "伤", +"傾": "倾", +"僂": "偻", +"僅": "仅", +"僉": "佥", +"僑": "侨", +"僕": "仆", +"僞": "伪", +"僥": "侥", +"僨": "偾", +"價": "价", +"儀": "仪", +"儂": "侬", +"億": "亿", +"儈": "侩", +"儉": "俭", +"儐": "傧", +"儔": "俦", +"儕": "侪", +"儘": "尽", +"償": "偿", +"優": "优", +"儲": "储", +"儷": "俪", +"儸": "㑩", +"儺": "傩", +"儻": "傥", +"儼": "俨", +"儿": "儿", +"兇": "凶", +"兌": "兑", +"兒": "儿", +"兗": "兖", +"党": "党", +"內": "内", +"兩": "两", +"冊": "册", +"冪": "幂", +"准": "准", +"凈": "净", +"凍": "冻", +"凜": "凛", +"几": "几", +"凱": "凯", +"划": "划", +"別": "别", +"刪": "删", +"剄": "刭", +"則": "则", +"剋": "克", +"剎": "刹", +"剗": "刬", +"剛": "刚", +"剝": "剥", +"剮": "剐", +"剴": "剀", +"創": "创", +"劃": "划", +"劇": "剧", +"劉": "刘", +"劊": "刽", +"劌": "刿", +"劍": "剑", +"劏": "㓥", +"劑": "剂", +"劚": "㔉", +"勁": "劲", +"動": "动", +"務": "务", +"勛": "勋", +"勝": "胜", +"勞": "劳", +"勢": "势", +"勩": "勚", +"勱": "劢", +"勵": "励", +"勸": "劝", +"勻": "匀", +"匭": "匦", +"匯": "汇", +"匱": "匮", +"區": "区", +"協": "协", +"卷": "卷", +"卻": "却", +"厂": "厂", +"厙": "厍", +"厠": "厕", +"厭": "厌", +"厲": "厉", +"厴": "厣", +"參": "参", +"叄": "叁", +"叢": "丛", +"台": "台", +"叶": "叶", +"吊": "吊", +"后": "后", +"吒": "咤", +"吳": "吴", +"吶": "呐", +"呂": "吕", +"獃": "呆", +"咼": "呙", +"員": "员", +"唄": "呗", +"唚": "吣", +"問": "问", +"啓": "启", +"啞": "哑", +"啟": "启", +"啢": "唡", +"喎": "㖞", +"喚": "唤", +"喪": "丧", +"喬": "乔", +"單": "单", +"喲": "哟", +"嗆": "呛", +"嗇": "啬", +"嗊": "唝", +"嗎": "吗", +"嗚": "呜", +"嗩": "唢", +"嗶": "哔", +"嘆": "叹", +"嘍": "喽", +"嘔": "呕", +"嘖": "啧", +"嘗": "尝", +"嘜": "唛", +"嘩": "哗", +"嘮": "唠", +"嘯": "啸", +"嘰": "叽", +"嘵": "哓", +"嘸": "呒", +"嘽": "啴", +"噁": "恶", +"噓": "嘘", +"噚": "㖊", +"噝": "咝", +"噠": "哒", +"噥": "哝", +"噦": "哕", +"噯": "嗳", +"噲": "哙", +"噴": "喷", +"噸": "吨", +"噹": "当", +"嚀": "咛", +"嚇": "吓", +"嚌": "哜", +"嚕": "噜", +"嚙": "啮", +"嚥": "咽", +"嚦": "呖", +"嚨": "咙", +"嚮": "向", +"嚲": "亸", +"嚳": "喾", +"嚴": "严", +"嚶": "嘤", +"囀": "啭", +"囁": "嗫", +"囂": "嚣", +"囅": "冁", +"囈": "呓", +"囌": "苏", +"囑": "嘱", +"囪": "囱", +"圇": "囵", +"國": "国", +"圍": "围", +"園": "园", +"圓": "圆", +"圖": "图", +"團": "团", +"坏": "坏", +"垵": "埯", +"埡": "垭", +"埰": "采", +"執": "执", +"堅": "坚", +"堊": "垩", +"堖": "垴", +"堝": "埚", +"堯": "尧", +"報": "报", +"場": "场", +"塊": "块", +"塋": "茔", +"塏": "垲", +"塒": "埘", +"塗": "涂", +"塚": "冢", +"塢": "坞", +"塤": "埙", +"塵": "尘", +"塹": "堑", +"墊": "垫", +"墜": "坠", +"墮": "堕", +"墳": "坟", +"墻": "墙", +"墾": "垦", +"壇": "坛", +"壈": "𡒄", +"壋": "垱", +"壓": "压", +"壘": "垒", +"壙": "圹", +"壚": "垆", +"壞": "坏", +"壟": "垄", +"壠": "垅", +"壢": "坜", +"壩": "坝", +"壯": "壮", +"壺": "壶", +"壼": "壸", +"壽": "寿", +"夠": "够", +"夢": "梦", +"夾": "夹", +"奐": "奂", +"奧": "奥", +"奩": "奁", +"奪": "夺", +"奬": "奖", +"奮": "奋", +"奼": "姹", +"妝": "妆", +"姍": "姗", +"姜": "姜", +"姦": "奸", +"娛": "娱", +"婁": "娄", +"婦": "妇", +"婭": "娅", +"媧": "娲", +"媯": "妫", +"媼": "媪", +"媽": "妈", +"嫗": "妪", +"嫵": "妩", +"嫻": "娴", +"嫿": "婳", +"嬀": "妫", +"嬈": "娆", +"嬋": "婵", +"嬌": "娇", +"嬙": "嫱", +"嬡": "嫒", +"嬤": "嬷", +"嬪": "嫔", +"嬰": "婴", +"嬸": "婶", +"孌": "娈", +"孫": "孙", +"學": "学", +"孿": "孪", +"宁": "宁", +"宮": "宫", +"寢": "寝", +"實": "实", +"寧": "宁", +"審": "审", +"寫": "写", +"寬": "宽", +"寵": "宠", 
+"寶": "宝", +"將": "将", +"專": "专", +"尋": "寻", +"對": "对", +"導": "导", +"尷": "尴", +"屆": "届", +"屍": "尸", +"屓": "屃", +"屜": "屉", +"屢": "屡", +"層": "层", +"屨": "屦", +"屬": "属", +"岡": "冈", +"峴": "岘", +"島": "岛", +"峽": "峡", +"崍": "崃", +"崗": "岗", +"崢": "峥", +"崬": "岽", +"嵐": "岚", +"嶁": "嵝", +"嶄": "崭", +"嶇": "岖", +"嶔": "嵚", +"嶗": "崂", +"嶠": "峤", +"嶢": "峣", +"嶧": "峄", +"嶮": "崄", +"嶴": "岙", +"嶸": "嵘", +"嶺": "岭", +"嶼": "屿", +"嶽": "岳", +"巋": "岿", +"巒": "峦", +"巔": "巅", +"巰": "巯", +"帘": "帘", +"帥": "帅", +"師": "师", +"帳": "帐", +"帶": "带", +"幀": "帧", +"幃": "帏", +"幗": "帼", +"幘": "帻", +"幟": "帜", +"幣": "币", +"幫": "帮", +"幬": "帱", +"幹": "干", +"幺": "么", +"幾": "几", +"广": "广", +"庫": "库", +"廁": "厕", +"廂": "厢", +"廄": "厩", +"廈": "厦", +"廚": "厨", +"廝": "厮", +"廟": "庙", +"廠": "厂", +"廡": "庑", +"廢": "废", +"廣": "广", +"廩": "廪", +"廬": "庐", +"廳": "厅", +"弒": "弑", +"弳": "弪", +"張": "张", +"強": "强", +"彆": "别", +"彈": "弹", +"彌": "弥", +"彎": "弯", +"彙": "汇", +"彞": "彝", +"彥": "彦", +"征": "征", +"後": "后", +"徑": "径", +"從": "从", +"徠": "徕", +"復": "复", +"徵": "征", +"徹": "彻", +"志": "志", +"恆": "恒", +"恥": "耻", +"悅": "悦", +"悞": "悮", +"悵": "怅", +"悶": "闷", +"惡": "恶", +"惱": "恼", +"惲": "恽", +"惻": "恻", +"愛": "爱", +"愜": "惬", +"愨": "悫", +"愴": "怆", +"愷": "恺", +"愾": "忾", +"愿": "愿", +"慄": "栗", +"態": "态", +"慍": "愠", +"慘": "惨", +"慚": "惭", +"慟": "恸", +"慣": "惯", +"慤": "悫", +"慪": "怄", +"慫": "怂", +"慮": "虑", +"慳": "悭", +"慶": "庆", +"憂": "忧", +"憊": "惫", +"憐": "怜", +"憑": "凭", +"憒": "愦", +"憚": "惮", +"憤": "愤", +"憫": "悯", +"憮": "怃", +"憲": "宪", +"憶": "忆", +"懇": "恳", +"應": "应", +"懌": "怿", +"懍": "懔", +"懞": "蒙", +"懟": "怼", +"懣": "懑", +"懨": "恹", +"懲": "惩", +"懶": "懒", +"懷": "怀", +"懸": "悬", +"懺": "忏", +"懼": "惧", +"懾": "慑", +"戀": "恋", +"戇": "戆", +"戔": "戋", +"戧": "戗", +"戩": "戬", +"戰": "战", +"戱": "戯", +"戲": "戏", +"戶": "户", +"担": "担", +"拋": "抛", +"挩": "捝", +"挾": "挟", +"捨": "舍", +"捫": "扪", +"据": "据", +"掃": "扫", +"掄": "抡", +"掗": "挜", +"掙": "挣", +"掛": "挂", +"採": "采", +"揀": "拣", +"揚": "扬", +"換": "换", +"揮": "挥", +"損": "损", +"搖": "摇", +"搗": "捣", +"搵": "揾", +"搶": "抢", +"摑": "掴", +"摜": "掼", +"摟": "搂", +"摯": "挚", +"摳": "抠", +"摶": "抟", +"摺": "折", +"摻": "掺", +"撈": "捞", +"撏": "挦", +"撐": "撑", +"撓": "挠", +"撝": "㧑", +"撟": "挢", +"撣": "掸", +"撥": "拨", +"撫": "抚", +"撲": "扑", +"撳": "揿", +"撻": "挞", +"撾": "挝", +"撿": "捡", +"擁": "拥", +"擄": "掳", +"擇": "择", +"擊": "击", +"擋": "挡", +"擓": "㧟", +"擔": "担", +"據": "据", +"擠": "挤", +"擬": "拟", +"擯": "摈", +"擰": "拧", +"擱": "搁", +"擲": "掷", +"擴": "扩", +"擷": "撷", +"擺": "摆", +"擻": "擞", +"擼": "撸", +"擾": "扰", +"攄": "摅", +"攆": "撵", +"攏": "拢", +"攔": "拦", +"攖": "撄", +"攙": "搀", +"攛": "撺", +"攜": "携", +"攝": "摄", +"攢": "攒", +"攣": "挛", +"攤": "摊", +"攪": "搅", +"攬": "揽", +"敗": "败", +"敘": "叙", +"敵": "敌", +"數": "数", +"斂": "敛", +"斃": "毙", +"斕": "斓", +"斗": "斗", +"斬": "斩", +"斷": "断", +"於": "于", +"時": "时", +"晉": "晋", +"晝": "昼", +"暈": "晕", +"暉": "晖", +"暘": "旸", +"暢": "畅", +"暫": "暂", +"曄": "晔", +"曆": "历", +"曇": "昙", +"曉": "晓", +"曏": "向", +"曖": "暧", +"曠": "旷", +"曨": "昽", +"曬": "晒", +"書": "书", +"會": "会", +"朧": "胧", +"朮": "术", +"术": "术", +"朴": "朴", +"東": "东", +"杴": "锨", +"极": "极", +"柜": "柜", +"柵": "栅", +"桿": "杆", +"梔": "栀", +"梘": "枧", +"條": "条", +"梟": "枭", +"梲": "棁", +"棄": "弃", +"棖": "枨", +"棗": "枣", +"棟": "栋", +"棧": "栈", +"棲": "栖", +"棶": "梾", +"椏": "桠", +"楊": "杨", +"楓": "枫", +"楨": "桢", +"業": "业", +"極": "极", +"榪": "杩", +"榮": "荣", +"榲": "榅", +"榿": "桤", +"構": "构", +"槍": "枪", +"槤": "梿", +"槧": "椠", +"槨": "椁", +"槳": "桨", +"樁": "桩", +"樂": "乐", +"樅": "枞", +"樓": "楼", +"標": "标", +"樞": "枢", +"樣": "样", +"樸": "朴", +"樹": "树", +"樺": "桦", +"橈": "桡", +"橋": "桥", +"機": "机", +"橢": "椭", +"橫": "横", +"檁": "檩", +"檉": "柽", +"檔": "档", 
+"檜": "桧", +"檟": "槚", +"檢": "检", +"檣": "樯", +"檮": "梼", +"檯": "台", +"檳": "槟", +"檸": "柠", +"檻": "槛", +"櫃": "柜", +"櫓": "橹", +"櫚": "榈", +"櫛": "栉", +"櫝": "椟", +"櫞": "橼", +"櫟": "栎", +"櫥": "橱", +"櫧": "槠", +"櫨": "栌", +"櫪": "枥", +"櫫": "橥", +"櫬": "榇", +"櫱": "蘖", +"櫳": "栊", +"櫸": "榉", +"櫻": "樱", +"欄": "栏", +"權": "权", +"欏": "椤", +"欒": "栾", +"欖": "榄", +"欞": "棂", +"欽": "钦", +"歐": "欧", +"歟": "欤", +"歡": "欢", +"歲": "岁", +"歷": "历", +"歸": "归", +"歿": "殁", +"殘": "残", +"殞": "殒", +"殤": "殇", +"殨": "㱮", +"殫": "殚", +"殮": "殓", +"殯": "殡", +"殰": "㱩", +"殲": "歼", +"殺": "杀", +"殻": "壳", +"殼": "壳", +"毀": "毁", +"毆": "殴", +"毿": "毵", +"氂": "牦", +"氈": "毡", +"氌": "氇", +"氣": "气", +"氫": "氢", +"氬": "氩", +"氳": "氲", +"汙": "污", +"決": "决", +"沒": "没", +"沖": "冲", +"況": "况", +"洶": "汹", +"浹": "浃", +"涂": "涂", +"涇": "泾", +"涼": "凉", +"淀": "淀", +"淒": "凄", +"淚": "泪", +"淥": "渌", +"淨": "净", +"淩": "凌", +"淪": "沦", +"淵": "渊", +"淶": "涞", +"淺": "浅", +"渙": "涣", +"減": "减", +"渦": "涡", +"測": "测", +"渾": "浑", +"湊": "凑", +"湞": "浈", +"湯": "汤", +"溈": "沩", +"準": "准", +"溝": "沟", +"溫": "温", +"滄": "沧", +"滅": "灭", +"滌": "涤", +"滎": "荥", +"滬": "沪", +"滯": "滞", +"滲": "渗", +"滷": "卤", +"滸": "浒", +"滻": "浐", +"滾": "滚", +"滿": "满", +"漁": "渔", +"漚": "沤", +"漢": "汉", +"漣": "涟", +"漬": "渍", +"漲": "涨", +"漵": "溆", +"漸": "渐", +"漿": "浆", +"潁": "颍", +"潑": "泼", +"潔": "洁", +"潙": "沩", +"潛": "潜", +"潤": "润", +"潯": "浔", +"潰": "溃", +"潷": "滗", +"潿": "涠", +"澀": "涩", +"澆": "浇", +"澇": "涝", +"澐": "沄", +"澗": "涧", +"澠": "渑", +"澤": "泽", +"澦": "滪", +"澩": "泶", +"澮": "浍", +"澱": "淀", +"濁": "浊", +"濃": "浓", +"濕": "湿", +"濘": "泞", +"濛": "蒙", +"濟": "济", +"濤": "涛", +"濫": "滥", +"濰": "潍", +"濱": "滨", +"濺": "溅", +"濼": "泺", +"濾": "滤", +"瀅": "滢", +"瀆": "渎", +"瀇": "㲿", +"瀉": "泻", +"瀋": "沈", +"瀏": "浏", +"瀕": "濒", +"瀘": "泸", +"瀝": "沥", +"瀟": "潇", +"瀠": "潆", +"瀦": "潴", +"瀧": "泷", +"瀨": "濑", +"瀰": "弥", +"瀲": "潋", +"瀾": "澜", +"灃": "沣", +"灄": "滠", +"灑": "洒", +"灕": "漓", +"灘": "滩", +"灝": "灏", +"灠": "漤", +"灣": "湾", +"灤": "滦", +"灧": "滟", +"災": "灾", +"為": "为", +"烏": "乌", +"烴": "烃", +"無": "无", +"煉": "炼", +"煒": "炜", +"煙": "烟", +"煢": "茕", +"煥": "焕", +"煩": "烦", +"煬": "炀", +"煱": "㶽", +"熅": "煴", +"熒": "荧", +"熗": "炝", +"熱": "热", +"熲": "颎", +"熾": "炽", +"燁": "烨", +"燈": "灯", +"燉": "炖", +"燒": "烧", +"燙": "烫", +"燜": "焖", +"營": "营", +"燦": "灿", +"燭": "烛", +"燴": "烩", +"燶": "㶶", +"燼": "烬", +"燾": "焘", +"爍": "烁", +"爐": "炉", +"爛": "烂", +"爭": "争", +"爲": "为", +"爺": "爷", +"爾": "尔", +"牆": "墙", +"牘": "牍", +"牽": "牵", +"犖": "荦", +"犢": "犊", +"犧": "牺", +"狀": "状", +"狹": "狭", +"狽": "狈", +"猙": "狰", +"猶": "犹", +"猻": "狲", +"獁": "犸", +"獄": "狱", +"獅": "狮", +"獎": "奖", +"獨": "独", +"獪": "狯", +"獫": "猃", +"獮": "狝", +"獰": "狞", +"獱": "㺍", +"獲": "获", +"獵": "猎", +"獷": "犷", +"獸": "兽", +"獺": "獭", +"獻": "献", +"獼": "猕", +"玀": "猡", +"現": "现", +"琺": "珐", +"琿": "珲", +"瑋": "玮", +"瑒": "玚", +"瑣": "琐", +"瑤": "瑶", +"瑩": "莹", +"瑪": "玛", +"瑲": "玱", +"璉": "琏", +"璣": "玑", +"璦": "瑷", +"璫": "珰", +"環": "环", +"璽": "玺", +"瓊": "琼", +"瓏": "珑", +"瓔": "璎", +"瓚": "瓒", +"甌": "瓯", +"產": "产", +"産": "产", +"畝": "亩", +"畢": "毕", +"異": "异", +"畵": "画", +"當": "当", +"疇": "畴", +"疊": "叠", +"痙": "痉", +"痾": "疴", +"瘂": "痖", +"瘋": "疯", +"瘍": "疡", +"瘓": "痪", +"瘞": "瘗", +"瘡": "疮", +"瘧": "疟", +"瘮": "瘆", +"瘲": "疭", +"瘺": "瘘", +"瘻": "瘘", +"療": "疗", +"癆": "痨", +"癇": "痫", +"癉": "瘅", +"癘": "疠", +"癟": "瘪", +"癢": "痒", +"癤": "疖", +"癥": "症", +"癧": "疬", +"癩": "癞", +"癬": "癣", +"癭": "瘿", +"癮": "瘾", +"癰": "痈", +"癱": "瘫", +"癲": "癫", +"發": "发", +"皚": "皑", +"皰": "疱", +"皸": "皲", +"皺": "皱", +"盃": "杯", +"盜": "盗", +"盞": "盏", +"盡": "尽", +"監": "监", +"盤": "盘", +"盧": "卢", +"盪": "荡", +"眥": "眦", +"眾": "众", +"睏": "困", +"睜": "睁", +"睞": "睐", 
+"瞘": "眍", +"瞜": "䁖", +"瞞": "瞒", +"瞭": "了", +"瞶": "瞆", +"瞼": "睑", +"矇": "蒙", +"矓": "眬", +"矚": "瞩", +"矯": "矫", +"硃": "朱", +"硜": "硁", +"硤": "硖", +"硨": "砗", +"确": "确", +"硯": "砚", +"碩": "硕", +"碭": "砀", +"碸": "砜", +"確": "确", +"碼": "码", +"磑": "硙", +"磚": "砖", +"磣": "碜", +"磧": "碛", +"磯": "矶", +"磽": "硗", +"礆": "硷", +"礎": "础", +"礙": "碍", +"礦": "矿", +"礪": "砺", +"礫": "砾", +"礬": "矾", +"礱": "砻", +"祿": "禄", +"禍": "祸", +"禎": "祯", +"禕": "祎", +"禡": "祃", +"禦": "御", +"禪": "禅", +"禮": "礼", +"禰": "祢", +"禱": "祷", +"禿": "秃", +"秈": "籼", +"种": "种", +"稅": "税", +"稈": "秆", +"稏": "䅉", +"稟": "禀", +"種": "种", +"稱": "称", +"穀": "谷", +"穌": "稣", +"積": "积", +"穎": "颖", +"穠": "秾", +"穡": "穑", +"穢": "秽", +"穩": "稳", +"穫": "获", +"穭": "稆", +"窩": "窝", +"窪": "洼", +"窮": "穷", +"窯": "窑", +"窵": "窎", +"窶": "窭", +"窺": "窥", +"竄": "窜", +"竅": "窍", +"竇": "窦", +"竈": "灶", +"竊": "窃", +"竪": "竖", +"競": "竞", +"筆": "笔", +"筍": "笋", +"筑": "筑", +"筧": "笕", +"筴": "䇲", +"箋": "笺", +"箏": "筝", +"節": "节", +"範": "范", +"築": "筑", +"篋": "箧", +"篔": "筼", +"篤": "笃", +"篩": "筛", +"篳": "筚", +"簀": "箦", +"簍": "篓", +"簞": "箪", +"簡": "简", +"簣": "篑", +"簫": "箫", +"簹": "筜", +"簽": "签", +"簾": "帘", +"籃": "篮", +"籌": "筹", +"籖": "签", +"籙": "箓", +"籜": "箨", +"籟": "籁", +"籠": "笼", +"籩": "笾", +"籪": "簖", +"籬": "篱", +"籮": "箩", +"籲": "吁", +"粵": "粤", +"糝": "糁", +"糞": "粪", +"糧": "粮", +"糰": "团", +"糲": "粝", +"糴": "籴", +"糶": "粜", +"糹": "纟", +"糾": "纠", +"紀": "纪", +"紂": "纣", +"約": "约", +"紅": "红", +"紆": "纡", +"紇": "纥", +"紈": "纨", +"紉": "纫", +"紋": "纹", +"納": "纳", +"紐": "纽", +"紓": "纾", +"純": "纯", +"紕": "纰", +"紖": "纼", +"紗": "纱", +"紘": "纮", +"紙": "纸", +"級": "级", +"紛": "纷", +"紜": "纭", +"紝": "纴", +"紡": "纺", +"紬": "䌷", +"細": "细", +"紱": "绂", +"紲": "绁", +"紳": "绅", +"紵": "纻", +"紹": "绍", +"紺": "绀", +"紼": "绋", +"紿": "绐", +"絀": "绌", +"終": "终", +"組": "组", +"絅": "䌹", +"絆": "绊", +"絎": "绗", +"結": "结", +"絕": "绝", +"絛": "绦", +"絝": "绔", +"絞": "绞", +"絡": "络", +"絢": "绚", +"給": "给", +"絨": "绒", +"絰": "绖", +"統": "统", +"絲": "丝", +"絳": "绛", +"絶": "绝", +"絹": "绢", +"綁": "绑", +"綃": "绡", +"綆": "绠", +"綈": "绨", +"綉": "绣", +"綌": "绤", +"綏": "绥", +"綐": "䌼", +"經": "经", +"綜": "综", +"綞": "缍", +"綠": "绿", +"綢": "绸", +"綣": "绻", +"綫": "线", +"綬": "绶", +"維": "维", +"綯": "绹", +"綰": "绾", +"綱": "纲", +"網": "网", +"綳": "绷", +"綴": "缀", +"綵": "䌽", +"綸": "纶", +"綹": "绺", +"綺": "绮", +"綻": "绽", +"綽": "绰", +"綾": "绫", +"綿": "绵", +"緄": "绲", +"緇": "缁", +"緊": "紧", +"緋": "绯", +"緑": "绿", +"緒": "绪", +"緓": "绬", +"緔": "绱", +"緗": "缃", +"緘": "缄", +"緙": "缂", +"線": "线", +"緝": "缉", +"緞": "缎", +"締": "缔", +"緡": "缗", +"緣": "缘", +"緦": "缌", +"編": "编", +"緩": "缓", +"緬": "缅", +"緯": "纬", +"緱": "缑", +"緲": "缈", +"練": "练", +"緶": "缏", +"緹": "缇", +"緻": "致", +"縈": "萦", +"縉": "缙", +"縊": "缢", +"縋": "缒", +"縐": "绉", +"縑": "缣", +"縕": "缊", +"縗": "缞", +"縛": "缚", +"縝": "缜", +"縞": "缟", +"縟": "缛", +"縣": "县", +"縧": "绦", +"縫": "缝", +"縭": "缡", +"縮": "缩", +"縱": "纵", +"縲": "缧", +"縳": "䌸", +"縴": "纤", +"縵": "缦", +"縶": "絷", +"縷": "缕", +"縹": "缥", +"總": "总", +"績": "绩", +"繃": "绷", +"繅": "缫", +"繆": "缪", +"繒": "缯", +"織": "织", +"繕": "缮", +"繚": "缭", +"繞": "绕", +"繡": "绣", +"繢": "缋", +"繩": "绳", +"繪": "绘", +"繫": "系", +"繭": "茧", +"繮": "缰", +"繯": "缳", +"繰": "缲", +"繳": "缴", +"繸": "䍁", +"繹": "绎", +"繼": "继", +"繽": "缤", +"繾": "缱", +"繿": "䍀", +"纈": "缬", +"纊": "纩", +"續": "续", +"纍": "累", +"纏": "缠", +"纓": "缨", +"纔": "才", +"纖": "纤", +"纘": "缵", +"纜": "缆", +"缽": "钵", +"罈": "坛", +"罌": "罂", +"罰": "罚", +"罵": "骂", +"罷": "罢", +"羅": "罗", +"羆": "罴", +"羈": "羁", +"羋": "芈", +"羥": "羟", +"義": "义", +"習": "习", +"翹": "翘", +"耬": "耧", +"耮": "耢", +"聖": "圣", +"聞": "闻", +"聯": "联", +"聰": "聪", +"聲": "声", +"聳": "耸", +"聵": "聩", +"聶": "聂", +"職": "职", 
+"聹": "聍", +"聽": "听", +"聾": "聋", +"肅": "肃", +"胜": "胜", +"脅": "胁", +"脈": "脉", +"脛": "胫", +"脫": "脱", +"脹": "胀", +"腊": "腊", +"腎": "肾", +"腖": "胨", +"腡": "脶", +"腦": "脑", +"腫": "肿", +"腳": "脚", +"腸": "肠", +"膃": "腽", +"膚": "肤", +"膠": "胶", +"膩": "腻", +"膽": "胆", +"膾": "脍", +"膿": "脓", +"臉": "脸", +"臍": "脐", +"臏": "膑", +"臘": "腊", +"臚": "胪", +"臟": "脏", +"臠": "脔", +"臢": "臜", +"臥": "卧", +"臨": "临", +"臺": "台", +"與": "与", +"興": "兴", +"舉": "举", +"舊": "旧", +"艙": "舱", +"艤": "舣", +"艦": "舰", +"艫": "舻", +"艱": "艰", +"艷": "艳", +"芻": "刍", +"苧": "苎", +"苹": "苹", +"范": "范", +"茲": "兹", +"荊": "荆", +"莊": "庄", +"莖": "茎", +"莢": "荚", +"莧": "苋", +"華": "华", +"萇": "苌", +"萊": "莱", +"萬": "万", +"萵": "莴", +"葉": "叶", +"葒": "荭", +"著": "着", +"著名": "著名", +"葤": "荮", +"葦": "苇", +"葯": "药", +"葷": "荤", +"蒓": "莼", +"蒔": "莳", +"蒞": "莅", +"蒼": "苍", +"蓀": "荪", +"蓋": "盖", +"蓮": "莲", +"蓯": "苁", +"蓴": "莼", +"蓽": "荜", +"蔔": "卜", +"蔞": "蒌", +"蔣": "蒋", +"蔥": "葱", +"蔦": "茑", +"蔭": "荫", +"蕁": "荨", +"蕆": "蒇", +"蕎": "荞", +"蕒": "荬", +"蕓": "芸", +"蕕": "莸", +"蕘": "荛", +"蕢": "蒉", +"蕩": "荡", +"蕪": "芜", +"蕭": "萧", +"蕷": "蓣", +"薀": "蕰", +"薈": "荟", +"薊": "蓟", +"薌": "芗", +"薔": "蔷", +"薘": "荙", +"薟": "莶", +"薦": "荐", +"薩": "萨", +"薳": "䓕", +"薴": "苧", +"薺": "荠", +"藉": "借", +"藍": "蓝", +"藎": "荩", +"藝": "艺", +"藥": "药", +"藪": "薮", +"藴": "蕴", +"藶": "苈", +"藹": "蔼", +"藺": "蔺", +"蘄": "蕲", +"蘆": "芦", +"蘇": "苏", +"蘊": "蕴", +"蘋": "苹", +"蘚": "藓", +"蘞": "蔹", +"蘢": "茏", +"蘭": "兰", +"蘺": "蓠", +"蘿": "萝", +"虆": "蔂", +"處": "处", +"虛": "虚", +"虜": "虏", +"號": "号", +"虧": "亏", +"虫": "虫", +"虯": "虬", +"蛺": "蛱", +"蛻": "蜕", +"蜆": "蚬", +"蜡": "蜡", +"蝕": "蚀", +"蝟": "猬", +"蝦": "虾", +"蝸": "蜗", +"螄": "蛳", +"螞": "蚂", +"螢": "萤", +"螮": "䗖", +"螻": "蝼", +"螿": "螀", +"蟄": "蛰", +"蟈": "蝈", +"蟎": "螨", +"蟣": "虮", +"蟬": "蝉", +"蟯": "蛲", +"蟲": "虫", +"蟶": "蛏", +"蟻": "蚁", +"蠅": "蝇", +"蠆": "虿", +"蠐": "蛴", +"蠑": "蝾", +"蠟": "蜡", +"蠣": "蛎", +"蠨": "蟏", +"蠱": "蛊", +"蠶": "蚕", +"蠻": "蛮", +"衆": "众", +"衊": "蔑", +"術": "术", +"衕": "同", +"衚": "胡", +"衛": "卫", +"衝": "冲", +"衹": "只", +"袞": "衮", +"裊": "袅", +"裏": "里", +"補": "补", +"裝": "装", +"裡": "里", +"製": "制", +"複": "复", +"褌": "裈", +"褘": "袆", +"褲": "裤", +"褳": "裢", +"褸": "褛", +"褻": "亵", +"襇": "裥", +"襏": "袯", +"襖": "袄", +"襝": "裣", +"襠": "裆", +"襤": "褴", +"襪": "袜", +"襬": "䙓", +"襯": "衬", +"襲": "袭", +"覆": "复", +"覆蓋": "覆盖", +"翻來覆去": "翻来覆去", +"見": "见", +"覎": "觃", +"規": "规", +"覓": "觅", +"視": "视", +"覘": "觇", +"覡": "觋", +"覥": "觍", +"覦": "觎", +"親": "亲", +"覬": "觊", +"覯": "觏", +"覲": "觐", +"覷": "觑", +"覺": "觉", +"覽": "览", +"覿": "觌", +"觀": "观", +"觴": "觞", +"觶": "觯", +"觸": "触", +"訁": "讠", +"訂": "订", +"訃": "讣", +"計": "计", +"訊": "讯", +"訌": "讧", +"討": "讨", +"訐": "讦", +"訒": "讱", +"訓": "训", +"訕": "讪", +"訖": "讫", +"託": "讬", +"記": "记", +"訛": "讹", +"訝": "讶", +"訟": "讼", +"訢": "䜣", +"訣": "诀", +"訥": "讷", +"訩": "讻", +"訪": "访", +"設": "设", +"許": "许", +"訴": "诉", +"訶": "诃", +"診": "诊", +"註": "注", +"詁": "诂", +"詆": "诋", +"詎": "讵", +"詐": "诈", +"詒": "诒", +"詔": "诏", +"評": "评", +"詖": "诐", +"詗": "诇", +"詘": "诎", +"詛": "诅", +"詞": "词", +"詠": "咏", +"詡": "诩", +"詢": "询", +"詣": "诣", +"試": "试", +"詩": "诗", +"詫": "诧", +"詬": "诟", +"詭": "诡", +"詮": "诠", +"詰": "诘", +"話": "话", +"該": "该", +"詳": "详", +"詵": "诜", +"詼": "诙", +"詿": "诖", +"誄": "诔", +"誅": "诛", +"誆": "诓", +"誇": "夸", +"誌": "志", +"認": "认", +"誑": "诳", +"誒": "诶", +"誕": "诞", +"誘": "诱", +"誚": "诮", +"語": "语", +"誠": "诚", +"誡": "诫", +"誣": "诬", +"誤": "误", +"誥": "诰", +"誦": "诵", +"誨": "诲", +"說": "说", +"説": "说", +"誰": "谁", +"課": "课", +"誶": "谇", +"誹": "诽", +"誼": "谊", +"誾": "訚", +"調": "调", +"諂": "谄", +"諄": "谆", +"談": "谈", +"諉": "诿", +"請": "请", +"諍": "诤", +"諏": "诹", +"諑": "诼", +"諒": "谅", +"論": "论", 
+"諗": "谂", +"諛": "谀", +"諜": "谍", +"諝": "谞", +"諞": "谝", +"諢": "诨", +"諤": "谔", +"諦": "谛", +"諧": "谐", +"諫": "谏", +"諭": "谕", +"諮": "谘", +"諱": "讳", +"諳": "谙", +"諶": "谌", +"諷": "讽", +"諸": "诸", +"諺": "谚", +"諼": "谖", +"諾": "诺", +"謀": "谋", +"謁": "谒", +"謂": "谓", +"謄": "誊", +"謅": "诌", +"謊": "谎", +"謎": "谜", +"謐": "谧", +"謔": "谑", +"謖": "谡", +"謗": "谤", +"謙": "谦", +"謚": "谥", +"講": "讲", +"謝": "谢", +"謠": "谣", +"謡": "谣", +"謨": "谟", +"謫": "谪", +"謬": "谬", +"謭": "谫", +"謳": "讴", +"謹": "谨", +"謾": "谩", +"譅": "䜧", +"證": "证", +"譎": "谲", +"譏": "讥", +"譖": "谮", +"識": "识", +"譙": "谯", +"譚": "谭", +"譜": "谱", +"譫": "谵", +"譯": "译", +"議": "议", +"譴": "谴", +"護": "护", +"譸": "诪", +"譽": "誉", +"譾": "谫", +"讀": "读", +"變": "变", +"讎": "仇", +"讎": "雠", +"讒": "谗", +"讓": "让", +"讕": "谰", +"讖": "谶", +"讜": "谠", +"讞": "谳", +"豈": "岂", +"豎": "竖", +"豐": "丰", +"豬": "猪", +"豶": "豮", +"貓": "猫", +"貙": "䝙", +"貝": "贝", +"貞": "贞", +"貟": "贠", +"負": "负", +"財": "财", +"貢": "贡", +"貧": "贫", +"貨": "货", +"販": "贩", +"貪": "贪", +"貫": "贯", +"責": "责", +"貯": "贮", +"貰": "贳", +"貲": "赀", +"貳": "贰", +"貴": "贵", +"貶": "贬", +"買": "买", +"貸": "贷", +"貺": "贶", +"費": "费", +"貼": "贴", +"貽": "贻", +"貿": "贸", +"賀": "贺", +"賁": "贲", +"賂": "赂", +"賃": "赁", +"賄": "贿", +"賅": "赅", +"資": "资", +"賈": "贾", +"賊": "贼", +"賑": "赈", +"賒": "赊", +"賓": "宾", +"賕": "赇", +"賙": "赒", +"賚": "赉", +"賜": "赐", +"賞": "赏", +"賠": "赔", +"賡": "赓", +"賢": "贤", +"賣": "卖", +"賤": "贱", +"賦": "赋", +"賧": "赕", +"質": "质", +"賫": "赍", +"賬": "账", +"賭": "赌", +"賰": "䞐", +"賴": "赖", +"賵": "赗", +"賺": "赚", +"賻": "赙", +"購": "购", +"賽": "赛", +"賾": "赜", +"贄": "贽", +"贅": "赘", +"贇": "赟", +"贈": "赠", +"贊": "赞", +"贋": "赝", +"贍": "赡", +"贏": "赢", +"贐": "赆", +"贓": "赃", +"贔": "赑", +"贖": "赎", +"贗": "赝", +"贛": "赣", +"贜": "赃", +"赬": "赪", +"趕": "赶", +"趙": "赵", +"趨": "趋", +"趲": "趱", +"跡": "迹", +"踐": "践", +"踴": "踊", +"蹌": "跄", +"蹕": "跸", +"蹣": "蹒", +"蹤": "踪", +"蹺": "跷", +"躂": "跶", +"躉": "趸", +"躊": "踌", +"躋": "跻", +"躍": "跃", +"躑": "踯", +"躒": "跞", +"躓": "踬", +"躕": "蹰", +"躚": "跹", +"躡": "蹑", +"躥": "蹿", +"躦": "躜", +"躪": "躏", +"軀": "躯", +"車": "车", +"軋": "轧", +"軌": "轨", +"軍": "军", +"軑": "轪", +"軒": "轩", +"軔": "轫", +"軛": "轭", +"軟": "软", +"軤": "轷", +"軫": "轸", +"軲": "轱", +"軸": "轴", +"軹": "轵", +"軺": "轺", +"軻": "轲", +"軼": "轶", +"軾": "轼", +"較": "较", +"輅": "辂", +"輇": "辁", +"輈": "辀", +"載": "载", +"輊": "轾", +"輒": "辄", +"輓": "挽", +"輔": "辅", +"輕": "轻", +"輛": "辆", +"輜": "辎", +"輝": "辉", +"輞": "辋", +"輟": "辍", +"輥": "辊", +"輦": "辇", +"輩": "辈", +"輪": "轮", +"輬": "辌", +"輯": "辑", +"輳": "辏", +"輸": "输", +"輻": "辐", +"輾": "辗", +"輿": "舆", +"轀": "辒", +"轂": "毂", +"轄": "辖", +"轅": "辕", +"轆": "辘", +"轉": "转", +"轍": "辙", +"轎": "轿", +"轔": "辚", +"轟": "轰", +"轡": "辔", +"轢": "轹", +"轤": "轳", +"辟": "辟", +"辦": "办", +"辭": "辞", +"辮": "辫", +"辯": "辩", +"農": "农", +"迴": "回", +"适": "适", +"逕": "迳", +"這": "这", +"連": "连", +"週": "周", +"進": "进", +"遊": "游", +"運": "运", +"過": "过", +"達": "达", +"違": "违", +"遙": "遥", +"遜": "逊", +"遞": "递", +"遠": "远", +"適": "适", +"遲": "迟", +"遷": "迁", +"選": "选", +"遺": "遗", +"遼": "辽", +"邁": "迈", +"還": "还", +"邇": "迩", +"邊": "边", +"邏": "逻", +"邐": "逦", +"郁": "郁", +"郟": "郏", +"郵": "邮", +"鄆": "郓", +"鄉": "乡", +"鄒": "邹", +"鄔": "邬", +"鄖": "郧", +"鄧": "邓", +"鄭": "郑", +"鄰": "邻", +"鄲": "郸", +"鄴": "邺", +"鄶": "郐", +"鄺": "邝", +"酇": "酂", +"酈": "郦", +"醖": "酝", +"醜": "丑", +"醞": "酝", +"醫": "医", +"醬": "酱", +"醱": "酦", +"釀": "酿", +"釁": "衅", +"釃": "酾", +"釅": "酽", +"采": "采", +"釋": "释", +"釐": "厘", +"釒": "钅", +"釓": "钆", +"釔": "钇", +"釕": "钌", +"釗": "钊", +"釘": "钉", +"釙": "钋", +"針": "针", +"釣": "钓", +"釤": "钐", +"釧": "钏", +"釩": "钒", +"釵": "钗", +"釷": "钍", +"釹": "钕", +"釺": "钎", +"鈀": "钯", +"鈁": "钫", +"鈃": "钘", +"鈄": "钭", 
+"鈈": "钚", +"鈉": "钠", +"鈍": "钝", +"鈎": "钩", +"鈐": "钤", +"鈑": "钣", +"鈒": "钑", +"鈔": "钞", +"鈕": "钮", +"鈞": "钧", +"鈣": "钙", +"鈥": "钬", +"鈦": "钛", +"鈧": "钪", +"鈮": "铌", +"鈰": "铈", +"鈳": "钶", +"鈴": "铃", +"鈷": "钴", +"鈸": "钹", +"鈹": "铍", +"鈺": "钰", +"鈽": "钸", +"鈾": "铀", +"鈿": "钿", +"鉀": "钾", +"鉅": "钜", +"鉈": "铊", +"鉉": "铉", +"鉋": "铇", +"鉍": "铋", +"鉑": "铂", +"鉕": "钷", +"鉗": "钳", +"鉚": "铆", +"鉛": "铅", +"鉞": "钺", +"鉢": "钵", +"鉤": "钩", +"鉦": "钲", +"鉬": "钼", +"鉭": "钽", +"鉶": "铏", +"鉸": "铰", +"鉺": "铒", +"鉻": "铬", +"鉿": "铪", +"銀": "银", +"銃": "铳", +"銅": "铜", +"銍": "铚", +"銑": "铣", +"銓": "铨", +"銖": "铢", +"銘": "铭", +"銚": "铫", +"銛": "铦", +"銜": "衔", +"銠": "铑", +"銣": "铷", +"銥": "铱", +"銦": "铟", +"銨": "铵", +"銩": "铥", +"銪": "铕", +"銫": "铯", +"銬": "铐", +"銱": "铞", +"銳": "锐", +"銷": "销", +"銹": "锈", +"銻": "锑", +"銼": "锉", +"鋁": "铝", +"鋃": "锒", +"鋅": "锌", +"鋇": "钡", +"鋌": "铤", +"鋏": "铗", +"鋒": "锋", +"鋙": "铻", +"鋝": "锊", +"鋟": "锓", +"鋣": "铘", +"鋤": "锄", +"鋥": "锃", +"鋦": "锔", +"鋨": "锇", +"鋩": "铓", +"鋪": "铺", +"鋭": "锐", +"鋮": "铖", +"鋯": "锆", +"鋰": "锂", +"鋱": "铽", +"鋶": "锍", +"鋸": "锯", +"鋼": "钢", +"錁": "锞", +"錄": "录", +"錆": "锖", +"錇": "锫", +"錈": "锩", +"錏": "铔", +"錐": "锥", +"錒": "锕", +"錕": "锟", +"錘": "锤", +"錙": "锱", +"錚": "铮", +"錛": "锛", +"錟": "锬", +"錠": "锭", +"錡": "锜", +"錢": "钱", +"錦": "锦", +"錨": "锚", +"錩": "锠", +"錫": "锡", +"錮": "锢", +"錯": "错", +"録": "录", +"錳": "锰", +"錶": "表", +"錸": "铼", +"鍀": "锝", +"鍁": "锨", +"鍃": "锪", +"鍆": "钔", +"鍇": "锴", +"鍈": "锳", +"鍋": "锅", +"鍍": "镀", +"鍔": "锷", +"鍘": "铡", +"鍚": "钖", +"鍛": "锻", +"鍠": "锽", +"鍤": "锸", +"鍥": "锲", +"鍩": "锘", +"鍬": "锹", +"鍰": "锾", +"鍵": "键", +"鍶": "锶", +"鍺": "锗", +"鍾": "钟", +"鎂": "镁", +"鎄": "锿", +"鎇": "镅", +"鎊": "镑", +"鎔": "镕", +"鎖": "锁", +"鎘": "镉", +"鎚": "锤", +"鎛": "镈", +"鎝": "𨱏", +"鎡": "镃", +"鎢": "钨", +"鎣": "蓥", +"鎦": "镏", +"鎧": "铠", +"鎩": "铩", +"鎪": "锼", +"鎬": "镐", +"鎮": "镇", +"鎰": "镒", +"鎲": "镋", +"鎳": "镍", +"鎵": "镓", +"鎸": "镌", +"鎿": "镎", +"鏃": "镞", +"鏇": "镟", +"鏈": "链", +"鏌": "镆", +"鏍": "镙", +"鏐": "镠", +"鏑": "镝", +"鏗": "铿", +"鏘": "锵", +"鏜": "镗", +"鏝": "镘", +"鏞": "镛", +"鏟": "铲", +"鏡": "镜", +"鏢": "镖", +"鏤": "镂", +"鏨": "錾", +"鏰": "镚", +"鏵": "铧", +"鏷": "镤", +"鏹": "镪", +"鏽": "锈", +"鐃": "铙", +"鐋": "铴", +"鐐": "镣", +"鐒": "铹", +"鐓": "镦", +"鐔": "镡", +"鐘": "钟", +"鐙": "镫", +"鐝": "镢", +"鐠": "镨", +"鐦": "锎", +"鐧": "锏", +"鐨": "镄", +"鐫": "镌", +"鐮": "镰", +"鐲": "镯", +"鐳": "镭", +"鐵": "铁", +"鐶": "镮", +"鐸": "铎", +"鐺": "铛", +"鐿": "镱", +"鑄": "铸", +"鑊": "镬", +"鑌": "镔", +"鑒": "鉴", +"鑔": "镲", +"鑕": "锧", +"鑞": "镴", +"鑠": "铄", +"鑣": "镳", +"鑥": "镥", +"鑭": "镧", +"鑰": "钥", +"鑱": "镵", +"鑲": "镶", +"鑷": "镊", +"鑹": "镩", +"鑼": "锣", +"鑽": "钻", +"鑾": "銮", +"鑿": "凿", +"钁": "镢", +"镟": "旋", +"長": "长", +"門": "门", +"閂": "闩", +"閃": "闪", +"閆": "闫", +"閈": "闬", +"閉": "闭", +"開": "开", +"閌": "闶", +"閎": "闳", +"閏": "闰", +"閑": "闲", +"間": "间", +"閔": "闵", +"閘": "闸", +"閡": "阂", +"閣": "阁", +"閤": "合", +"閥": "阀", +"閨": "闺", +"閩": "闽", +"閫": "阃", +"閬": "阆", +"閭": "闾", +"閱": "阅", +"閲": "阅", +"閶": "阊", +"閹": "阉", +"閻": "阎", +"閼": "阏", +"閽": "阍", +"閾": "阈", +"閿": "阌", +"闃": "阒", +"闆": "板", +"闈": "闱", +"闊": "阔", +"闋": "阕", +"闌": "阑", +"闍": "阇", +"闐": "阗", +"闒": "阘", +"闓": "闿", +"闔": "阖", +"闕": "阙", +"闖": "闯", +"關": "关", +"闞": "阚", +"闠": "阓", +"闡": "阐", +"闤": "阛", +"闥": "闼", +"阪": "坂", +"陘": "陉", +"陝": "陕", +"陣": "阵", +"陰": "阴", +"陳": "陈", +"陸": "陆", +"陽": "阳", +"隉": "陧", +"隊": "队", +"階": "阶", +"隕": "陨", +"際": "际", +"隨": "随", +"險": "险", +"隱": "隐", +"隴": "陇", +"隸": "隶", +"隻": "只", +"雋": "隽", +"雖": "虽", +"雙": "双", +"雛": "雏", +"雜": "杂", +"雞": "鸡", +"離": "离", +"難": "难", +"雲": "云", +"電": "电", +"霢": "霡", +"霧": "雾", +"霽": "霁", +"靂": "雳", 
+"靄": "霭", +"靈": "灵", +"靚": "靓", +"靜": "静", +"靨": "靥", +"鞀": "鼗", +"鞏": "巩", +"鞝": "绱", +"鞦": "秋", +"鞽": "鞒", +"韁": "缰", +"韃": "鞑", +"韆": "千", +"韉": "鞯", +"韋": "韦", +"韌": "韧", +"韍": "韨", +"韓": "韩", +"韙": "韪", +"韜": "韬", +"韞": "韫", +"韻": "韵", +"響": "响", +"頁": "页", +"頂": "顶", +"頃": "顷", +"項": "项", +"順": "顺", +"頇": "顸", +"須": "须", +"頊": "顼", +"頌": "颂", +"頎": "颀", +"頏": "颃", +"預": "预", +"頑": "顽", +"頒": "颁", +"頓": "顿", +"頗": "颇", +"領": "领", +"頜": "颌", +"頡": "颉", +"頤": "颐", +"頦": "颏", +"頭": "头", +"頮": "颒", +"頰": "颊", +"頲": "颋", +"頴": "颕", +"頷": "颔", +"頸": "颈", +"頹": "颓", +"頻": "频", +"頽": "颓", +"顆": "颗", +"題": "题", +"額": "额", +"顎": "颚", +"顏": "颜", +"顒": "颙", +"顓": "颛", +"顔": "颜", +"願": "愿", +"顙": "颡", +"顛": "颠", +"類": "类", +"顢": "颟", +"顥": "颢", +"顧": "顾", +"顫": "颤", +"顬": "颥", +"顯": "显", +"顰": "颦", +"顱": "颅", +"顳": "颞", +"顴": "颧", +"風": "风", +"颭": "飐", +"颮": "飑", +"颯": "飒", +"颱": "台", +"颳": "刮", +"颶": "飓", +"颸": "飔", +"颺": "飏", +"颻": "飖", +"颼": "飕", +"飀": "飗", +"飄": "飘", +"飆": "飙", +"飈": "飚", +"飛": "飞", +"飠": "饣", +"飢": "饥", +"飣": "饤", +"飥": "饦", +"飩": "饨", +"飪": "饪", +"飫": "饫", +"飭": "饬", +"飯": "饭", +"飲": "饮", +"飴": "饴", +"飼": "饲", +"飽": "饱", +"飾": "饰", +"飿": "饳", +"餃": "饺", +"餄": "饸", +"餅": "饼", +"餉": "饷", +"養": "养", +"餌": "饵", +"餎": "饹", +"餏": "饻", +"餑": "饽", +"餒": "馁", +"餓": "饿", +"餕": "馂", +"餖": "饾", +"餚": "肴", +"餛": "馄", +"餜": "馃", +"餞": "饯", +"餡": "馅", +"館": "馆", +"餱": "糇", +"餳": "饧", +"餶": "馉", +"餷": "馇", +"餺": "馎", +"餼": "饩", +"餾": "馏", +"餿": "馊", +"饁": "馌", +"饃": "馍", +"饅": "馒", +"饈": "馐", +"饉": "馑", +"饊": "馓", +"饋": "馈", +"饌": "馔", +"饑": "饥", +"饒": "饶", +"饗": "飨", +"饜": "餍", +"饞": "馋", +"饢": "馕", +"馬": "马", +"馭": "驭", +"馮": "冯", +"馱": "驮", +"馳": "驰", +"馴": "驯", +"馹": "驲", +"駁": "驳", +"駐": "驻", +"駑": "驽", +"駒": "驹", +"駔": "驵", +"駕": "驾", +"駘": "骀", +"駙": "驸", +"駛": "驶", +"駝": "驼", +"駟": "驷", +"駡": "骂", +"駢": "骈", +"駭": "骇", +"駰": "骃", +"駱": "骆", +"駸": "骎", +"駿": "骏", +"騁": "骋", +"騂": "骍", +"騅": "骓", +"騌": "骔", +"騍": "骒", +"騎": "骑", +"騏": "骐", +"騖": "骛", +"騙": "骗", +"騤": "骙", +"騧": "䯄", +"騫": "骞", +"騭": "骘", +"騮": "骝", +"騰": "腾", +"騶": "驺", +"騷": "骚", +"騸": "骟", +"騾": "骡", +"驀": "蓦", +"驁": "骜", +"驂": "骖", +"驃": "骠", +"驄": "骢", +"驅": "驱", +"驊": "骅", +"驌": "骕", +"驍": "骁", +"驏": "骣", +"驕": "骄", +"驗": "验", +"驚": "惊", +"驛": "驿", +"驟": "骤", +"驢": "驴", +"驤": "骧", +"驥": "骥", +"驦": "骦", +"驪": "骊", +"驫": "骉", +"骯": "肮", +"髏": "髅", +"髒": "脏", +"體": "体", +"髕": "髌", +"髖": "髋", +"髮": "发", +"鬆": "松", +"鬍": "胡", +"鬚": "须", +"鬢": "鬓", +"鬥": "斗", +"鬧": "闹", +"鬩": "阋", +"鬮": "阄", +"鬱": "郁", +"魎": "魉", +"魘": "魇", +"魚": "鱼", +"魛": "鱽", +"魢": "鱾", +"魨": "鲀", +"魯": "鲁", +"魴": "鲂", +"魷": "鱿", +"魺": "鲄", +"鮁": "鲅", +"鮃": "鲆", +"鮊": "鲌", +"鮋": "鲉", +"鮍": "鲏", +"鮎": "鲇", +"鮐": "鲐", +"鮑": "鲍", +"鮒": "鲋", +"鮓": "鲊", +"鮚": "鲒", +"鮜": "鲘", +"鮝": "鲞", +"鮞": "鲕", +"鮦": "鲖", +"鮪": "鲔", +"鮫": "鲛", +"鮭": "鲑", +"鮮": "鲜", +"鮳": "鲓", +"鮶": "鲪", +"鮺": "鲝", +"鯀": "鲧", +"鯁": "鲠", +"鯇": "鲩", +"鯉": "鲤", +"鯊": "鲨", +"鯒": "鲬", +"鯔": "鲻", +"鯕": "鲯", +"鯖": "鲭", +"鯗": "鲞", +"鯛": "鲷", +"鯝": "鲴", +"鯡": "鲱", +"鯢": "鲵", +"鯤": "鲲", +"鯧": "鲳", +"鯨": "鲸", +"鯪": "鲮", +"鯫": "鲰", +"鯴": "鲺", +"鯷": "鳀", +"鯽": "鲫", +"鯿": "鳊", +"鰁": "鳈", +"鰂": "鲗", +"鰃": "鳂", +"鰈": "鲽", +"鰉": "鳇", +"鰍": "鳅", +"鰏": "鲾", +"鰐": "鳄", +"鰒": "鳆", +"鰓": "鳃", +"鰜": "鳒", +"鰟": "鳑", +"鰠": "鳋", +"鰣": "鲥", +"鰥": "鳏", +"鰨": "鳎", +"鰩": "鳐", +"鰭": "鳍", +"鰮": "鳁", +"鰱": "鲢", +"鰲": "鳌", +"鰳": "鳓", +"鰵": "鳘", +"鰷": "鲦", +"鰹": "鲣", +"鰺": "鲹", +"鰻": "鳗", +"鰼": "鳛", +"鰾": "鳔", +"鱂": "鳉", +"鱅": "鳙", +"鱈": "鳕", +"鱉": "鳖", +"鱒": "鳟", +"鱔": "鳝", +"鱖": "鳜", +"鱗": "鳞", +"鱘": "鲟", +"鱝": "鲼", 
+"鱟": "鲎", +"鱠": "鲙", +"鱣": "鳣", +"鱤": "鳡", +"鱧": "鳢", +"鱨": "鲿", +"鱭": "鲚", +"鱯": "鳠", +"鱷": "鳄", +"鱸": "鲈", +"鱺": "鲡", +"䰾": "鲃", +"䲁": "鳚", +"鳥": "鸟", +"鳧": "凫", +"鳩": "鸠", +"鳬": "凫", +"鳲": "鸤", +"鳳": "凤", +"鳴": "鸣", +"鳶": "鸢", +"鳾": "䴓", +"鴆": "鸩", +"鴇": "鸨", +"鴉": "鸦", +"鴒": "鸰", +"鴕": "鸵", +"鴛": "鸳", +"鴝": "鸲", +"鴞": "鸮", +"鴟": "鸱", +"鴣": "鸪", +"鴦": "鸯", +"鴨": "鸭", +"鴯": "鸸", +"鴰": "鸹", +"鴴": "鸻", +"鴷": "䴕", +"鴻": "鸿", +"鴿": "鸽", +"鵁": "䴔", +"鵂": "鸺", +"鵃": "鸼", +"鵐": "鹀", +"鵑": "鹃", +"鵒": "鹆", +"鵓": "鹁", +"鵜": "鹈", +"鵝": "鹅", +"鵠": "鹄", +"鵡": "鹉", +"鵪": "鹌", +"鵬": "鹏", +"鵮": "鹐", +"鵯": "鹎", +"鵲": "鹊", +"鵷": "鹓", +"鵾": "鹍", +"鶄": "䴖", +"鶇": "鸫", +"鶉": "鹑", +"鶊": "鹒", +"鶓": "鹋", +"鶖": "鹙", +"鶘": "鹕", +"鶚": "鹗", +"鶡": "鹖", +"鶥": "鹛", +"鶩": "鹜", +"鶪": "䴗", +"鶬": "鸧", +"鶯": "莺", +"鶲": "鹟", +"鶴": "鹤", +"鶹": "鹠", +"鶺": "鹡", +"鶻": "鹘", +"鶼": "鹣", +"鶿": "鹚", +"鷀": "鹚", +"鷁": "鹢", +"鷂": "鹞", +"鷄": "鸡", +"鷈": "䴘", +"鷊": "鹝", +"鷓": "鹧", +"鷖": "鹥", +"鷗": "鸥", +"鷙": "鸷", +"鷚": "鹨", +"鷥": "鸶", +"鷦": "鹪", +"鷫": "鹔", +"鷯": "鹩", +"鷲": "鹫", +"鷳": "鹇", +"鷸": "鹬", +"鷹": "鹰", +"鷺": "鹭", +"鷽": "鸴", +"鷿": "䴙", +"鸂": "㶉", +"鸇": "鹯", +"鸌": "鹱", +"鸏": "鹲", +"鸕": "鸬", +"鸘": "鹴", +"鸚": "鹦", +"鸛": "鹳", +"鸝": "鹂", +"鸞": "鸾", +"鹵": "卤", +"鹹": "咸", +"鹺": "鹾", +"鹽": "盐", +"麗": "丽", +"麥": "麦", +"麩": "麸", +"麯": "曲", +"麵": "面", +"麼": "么", +"麽": "么", +"黃": "黄", +"黌": "黉", +"點": "点", +"黨": "党", +"黲": "黪", +"黴": "霉", +"黶": "黡", +"黷": "黩", +"黽": "黾", +"黿": "鼋", +"鼉": "鼍", +"鼕": "冬", +"鼴": "鼹", +"齊": "齐", +"齋": "斋", +"齎": "赍", +"齏": "齑", +"齒": "齿", +"齔": "龀", +"齕": "龁", +"齗": "龂", +"齙": "龅", +"齜": "龇", +"齟": "龃", +"齠": "龆", +"齡": "龄", +"齣": "出", +"齦": "龈", +"齪": "龊", +"齬": "龉", +"齲": "龋", +"齶": "腭", +"齷": "龌", +"龍": "龙", +"龎": "厐", +"龐": "庞", +"龔": "龚", +"龕": "龛", +"龜": "龟", + +"幾畫": "几画", +"賣畫": "卖画", +"滷鹼": "卤碱", +"原畫": "原画", +"口鹼": "口碱", +"古畫": "古画", +"名畫": "名画", +"奇畫": "奇画", +"如畫": "如画", +"弱鹼": "弱碱", +"彩畫": "彩画", +"所畫": "所画", +"扉畫": "扉画", +"教畫": "教画", +"水鹼": "水碱", +"洋鹼": "洋碱", +"炭畫": "炭画", +"畫一": "画一", +"畫上": "画上", +"畫下": "画下", +"畫中": "画中", +"畫供": "画供", +"畫兒": "画儿", +"畫具": "画具", +"畫出": "画出", +"畫史": "画史", +"畫品": "画品", +"畫商": "画商", +"畫圈": "画圈", +"畫境": "画境", +"畫工": "画工", +"畫帖": "画帖", +"畫幅": "画幅", +"畫意": "画意", +"畫成": "画成", +"畫景": "画景", +"畫本": "画本", +"畫架": "画架", +"畫框": "画框", +"畫法": "画法", +"畫王": "画王", +"畫界": "画界", +"畫符": "画符", +"畫紙": "画纸", +"畫線": "画线", +"畫航": "画航", +"畫舫": "画舫", +"畫虎": "画虎", +"畫論": "画论", +"畫譜": "画谱", +"畫象": "画象", +"畫質": "画质", +"畫貼": "画贴", +"畫軸": "画轴", +"畫頁": "画页", +"鹽鹼": "盐碱", +"鹼": "碱", +"鹼基": "碱基", +"鹼度": "碱度", +"鹼水": "碱水", +"鹼熔": "碱熔", +"磁畫": "磁画", +"策畫": "策画", +"組畫": "组画", +"絹畫": "绢画", +"耐鹼": "耐碱", +"肉鹼": "肉碱", +"膠畫": "胶画", +"茶鹼": "茶碱", +"西畫": "西画", +"貼畫": "贴画", +"返鹼": "返碱", +"鍾鍛": "锺锻", +"鍛鍾": "锻锺", +"雕畫": "雕画", +"鯰": "鲶", +"三聯畫": "三联画", +"中國畫": "中国画", +"書畫": "书画", +"書畫社": "书画社", +"五筆畫": "五笔画", +"作畫": "作画", +"入畫": "入画", +"寫生畫": "写生画", +"刻畫": "刻画", +"動畫": "动画", +"勾畫": "勾画", +"單色畫": "单色画", +"卡通畫": "卡通画", +"國畫": "国画", +"圖畫": "图画", +"壁畫": "壁画", +"字畫": "字画", +"宣傳畫": "宣传画", +"工筆畫": "工笔画", +"年畫": "年画", +"幽默畫": "幽默画", +"指畫": "指画", +"描畫": "描画", +"插畫": "插画", +"擘畫": "擘画", +"春畫": "春画", +"木刻畫": "木刻画", +"機械畫": "机械画", +"比畫": "比画", +"毛筆畫": "毛笔画", +"水粉畫": "水粉画", +"油畫": "油画", +"海景畫": "海景画", +"漫畫": "漫画", +"點畫": "点画", +"版畫": "版画", +"畫": "画", +"畫像": "画像", +"畫冊": "画册", +"畫刊": "画刊", +"畫匠": "画匠", +"畫捲": "画卷", +"畫圖": "画图", +"畫壇": "画坛", +"畫室": "画室", +"畫家": "画家", +"畫屏": "画屏", +"畫展": "画展", +"畫布": "画布", +"畫師": "画师", +"畫廊": "画廊", +"畫報": "画报", +"畫押": "画押", +"畫板": "画板", +"畫片": "画片", +"畫畫": "画画", +"畫皮": "画皮", +"畫眉鳥": "画眉鸟", +"畫稿": "画稿", 
+"畫筆": "画笔", +"畫院": "画院", +"畫集": "画集", +"畫面": "画面", +"筆畫": "笔画", +"細密畫": "细密画", +"繪畫": "绘画", +"自畫像": "自画像", +"蠟筆畫": "蜡笔画", +"裸體畫": "裸体画", +"西洋畫": "西洋画", +"透視畫": "透视画", +"銅版畫": "铜版画", +"鍾": "锺", +"靜物畫": "静物画", +"餘": "馀", +} + +zh2TW = { +"缺省": "預設", +"串行": "串列", +"以太网": "乙太網", +"位图": "點陣圖", +"例程": "常式", +"信道": "通道", +"光标": "游標", +"光盘": "光碟", +"光驱": "光碟機", +"全角": "全形", +"加载": "載入", +"半角": "半形", +"变量": "變數", +"噪声": "雜訊", +"脱机": "離線", +"声卡": "音效卡", +"老字号": "老字號", +"字号": "字型大小", +"字库": "字型檔", +"字段": "欄位", +"字符": "字元", +"存盘": "存檔", +"寻址": "定址", +"尾注": "章節附註", +"异步": "非同步", +"总线": "匯流排", +"括号": "括弧", +"接口": "介面", +"控件": "控制項", +"权限": "許可權", +"盘片": "碟片", +"硅片": "矽片", +"硅谷": "矽谷", +"硬盘": "硬碟", +"磁盘": "磁碟", +"磁道": "磁軌", +"程控": "程式控制", +"端口": "埠", +"算子": "運算元", +"算法": "演算法", +"芯片": "晶片", +"芯片": "晶元", +"词组": "片語", +"译码": "解碼", +"软驱": "軟碟機", +"快闪存储器": "快閃記憶體", +"闪存": "快閃記憶體", +"鼠标": "滑鼠", +"进制": "進位", +"交互式": "互動式", +"仿真": "模擬", +"优先级": "優先順序", +"传感": "感測", +"便携式": "攜帶型", +"信息论": "資訊理論", +"写保护": "防寫", +"分布式": "分散式", +"分辨率": "解析度", +"服务器": "伺服器", +"等于": "等於", +"局域网": "區域網", +"计算机": "電腦", +"扫瞄仪": "掃瞄器", +"宽带": "寬頻", +"数据库": "資料庫", +"奶酪": "乳酪", +"巨商": "鉅賈", +"手电": "手電筒", +"万历": "萬曆", +"永历": "永曆", +"词汇": "辭彙", +"习用": "慣用", +"元音": "母音", +"任意球": "自由球", +"头球": "頭槌", +"入球": "進球", +"粒入球": "顆進球", +"打门": "射門", +"火锅盖帽": "蓋火鍋", +"打印机": "印表機", +"打印機": "印表機", +"字节": "位元組", +"字節": "位元組", +"打印": "列印", +"打印": "列印", +"硬件": "硬體", +"硬件": "硬體", +"二极管": "二極體", +"二極管": "二極體", +"三极管": "三極體", +"三極管": "三極體", +"软件": "軟體", +"軟件": "軟體", +"网络": "網路", +"網絡": "網路", +"人工智能": "人工智慧", +"航天飞机": "太空梭", +"穿梭機": "太空梭", +"因特网": "網際網路", +"互聯網": "網際網路", +"机器人": "機器人", +"機械人": "機器人", +"移动电话": "行動電話", +"流動電話": "行動電話", +"调制解调器": "數據機", +"調制解調器": "數據機", +"短信": "簡訊", +"短訊": "簡訊", +"乌兹别克斯坦": "烏茲別克", +"乍得": "查德", +"乍得": "查德", +"也门": "葉門", +"也門": "葉門", +"伯利兹": "貝里斯", +"伯利茲": "貝里斯", +"佛得角": "維德角", +"佛得角": "維德角", +"克罗地亚": "克羅埃西亞", +"克羅地亞": "克羅埃西亞", +"冈比亚": "甘比亞", +"岡比亞": "甘比亞", +"几内亚比绍": "幾內亞比索", +"幾內亞比紹": "幾內亞比索", +"列支敦士登": "列支敦斯登", +"列支敦士登": "列支敦斯登", +"利比里亚": "賴比瑞亞", +"利比里亞": "賴比瑞亞", +"加纳": "迦納", +"加納": "迦納", +"加蓬": "加彭", +"加蓬": "加彭", +"博茨瓦纳": "波札那", +"博茨瓦納": "波札那", +"卡塔尔": "卡達", +"卡塔爾": "卡達", +"卢旺达": "盧安達", +"盧旺達": "盧安達", +"危地马拉": "瓜地馬拉", +"危地馬拉": "瓜地馬拉", +"厄瓜多尔": "厄瓜多", +"厄瓜多爾": "厄瓜多", +"厄立特里亚": "厄利垂亞", +"厄立特里亞": "厄利垂亞", +"吉布提": "吉布地", +"吉布堤": "吉布地", +"哈萨克斯坦": "哈薩克", +"哥斯达黎加": "哥斯大黎加", +"哥斯達黎加": "哥斯大黎加", +"图瓦卢": "吐瓦魯", +"圖瓦盧": "吐瓦魯", +"土库曼斯坦": "土庫曼", +"圣卢西亚": "聖露西亞", +"聖盧西亞": "聖露西亞", +"圣基茨和尼维斯": "聖克里斯多福及尼維斯", +"聖吉斯納域斯": "聖克里斯多福及尼維斯", +"圣文森特和格林纳丁斯": "聖文森及格瑞那丁", +"聖文森特和格林納丁斯": "聖文森及格瑞那丁", +"圣马力诺": "聖馬利諾", +"聖馬力諾": "聖馬利諾", +"圭亚那": "蓋亞那", +"圭亞那": "蓋亞那", +"坦桑尼亚": "坦尚尼亞", +"坦桑尼亞": "坦尚尼亞", +"埃塞俄比亚": "衣索比亞", +"埃塞俄比亞": "衣索比亞", +"基里巴斯": "吉里巴斯", +"基里巴斯": "吉里巴斯", +"塔吉克斯坦": "塔吉克", +"塞拉利昂": "獅子山", +"塞拉利昂": "獅子山", +"塞浦路斯": "塞普勒斯", +"塞浦路斯": "塞普勒斯", +"塞舌尔": "塞席爾", +"塞舌爾": "塞席爾", +"多米尼加": "多明尼加", +"多明尼加共和國": "多明尼加", +"多米尼加联邦": "多米尼克", +"多明尼加聯邦": "多米尼克", +"安提瓜和巴布达": "安地卡及巴布達", +"安提瓜和巴布達": "安地卡及巴布達", +"尼日利亚": "奈及利亞", +"尼日利亞": "奈及利亞", +"尼日尔": "尼日", +"尼日爾": "尼日", +"巴巴多斯": "巴貝多", +"巴巴多斯": "巴貝多", +"巴布亚新几内亚": "巴布亞紐幾內亞", +"巴布亞新畿內亞": "巴布亞紐幾內亞", +"布基纳法索": "布吉納法索", +"布基納法索": "布吉納法索", +"布隆迪": "蒲隆地", +"布隆迪": "蒲隆地", +"希腊": "希臘", +"帕劳": "帛琉", +"意大利": "義大利", +"意大利": "義大利", +"所罗门群岛": "索羅門群島", +"所羅門群島": "索羅門群島", +"文莱": "汶萊", +"斯威士兰": "史瓦濟蘭", +"斯威士蘭": "史瓦濟蘭", +"斯洛文尼亚": "斯洛維尼亞", +"斯洛文尼亞": "斯洛維尼亞", +"新西兰": "紐西蘭", +"新西蘭": "紐西蘭", +"格林纳达": "格瑞那達", +"格林納達": "格瑞那達", +"格鲁吉亚": "喬治亞", +"格魯吉亞": "喬治亞", +"佐治亚": "喬治亞", +"佐治亞": "喬治亞", +"毛里塔尼亚": "茅利塔尼亞", +"毛里塔尼亞": "茅利塔尼亞", +"毛里求斯": "模里西斯", +"毛里裘斯": 
"模里西斯", +"沙特阿拉伯": "沙烏地阿拉伯", +"沙地阿拉伯": "沙烏地阿拉伯", +"波斯尼亚和黑塞哥维那": "波士尼亞赫塞哥維納", +"波斯尼亞黑塞哥維那": "波士尼亞赫塞哥維納", +"津巴布韦": "辛巴威", +"津巴布韋": "辛巴威", +"洪都拉斯": "宏都拉斯", +"洪都拉斯": "宏都拉斯", +"特立尼达和托巴哥": "千里達托貝哥", +"特立尼達和多巴哥": "千里達托貝哥", +"瑙鲁": "諾魯", +"瑙魯": "諾魯", +"瓦努阿图": "萬那杜", +"瓦努阿圖": "萬那杜", +"溫納圖萬": "那杜", +"科摩罗": "葛摩", +"科摩羅": "葛摩", +"科特迪瓦": "象牙海岸", +"突尼斯": "突尼西亞", +"索马里": "索馬利亞", +"索馬里": "索馬利亞", +"老挝": "寮國", +"老撾": "寮國", +"肯尼亚": "肯亞", +"肯雅": "肯亞", +"苏里南": "蘇利南", +"莫桑比克": "莫三比克", +"莱索托": "賴索托", +"萊索托": "賴索托", +"贝宁": "貝南", +"貝寧": "貝南", +"赞比亚": "尚比亞", +"贊比亞": "尚比亞", +"阿塞拜疆": "亞塞拜然", +"阿塞拜疆": "亞塞拜然", +"阿拉伯联合酋长国": "阿拉伯聯合大公國", +"阿拉伯聯合酋長國": "阿拉伯聯合大公國", +"马尔代夫": "馬爾地夫", +"馬爾代夫": "馬爾地夫", +"马耳他": "馬爾他", +"马里共和国": "馬利共和國", +"馬里共和國": "馬利共和國", +"方便面": "速食麵", +"快速面": "速食麵", +"即食麵": "速食麵", +"薯仔": "土豆", +"蹦极跳": "笨豬跳", +"绑紧跳": "笨豬跳", +"冷菜": "冷盤", +"凉菜": "冷盤", +"出租车": "計程車", +"台球": "撞球", +"桌球": "撞球", +"雪糕": "冰淇淋", +"卫生": "衛生", +"衞生": "衛生", +"平治": "賓士", +"奔驰": "賓士", +"積架": "捷豹", +"福士": "福斯", +"雪铁龙": "雪鐵龍", +"马自达": "馬自達", +"萬事得": "馬自達", +"拿破仑": "拿破崙", +"拿破侖": "拿破崙", +"布什": "布希", +"布殊": "布希", +"克林顿": "柯林頓", +"克林頓": "柯林頓", +"侯赛因": "海珊", +"侯賽因": "海珊", +"凡高": "梵谷", +"狄安娜": "黛安娜", +"戴安娜": "黛安娜", +"赫拉": "希拉", +} + +zh2HK = { +"打印机": "打印機", +"印表機": "打印機", +"字节": "位元組", +"字節": "位元組", +"打印": "打印", +"列印": "打印", +"硬件": "硬件", +"硬體": "硬件", +"二极管": "二極管", +"二極體": "二極管", +"三极管": "三極管", +"三極體": "三極管", +"数码": "數碼", +"數位": "數碼", +"软件": "軟件", +"軟體": "軟件", +"网络": "網絡", +"網路": "網絡", +"人工智能": "人工智能", +"人工智慧": "人工智能", +"航天飞机": "穿梭機", +"太空梭": "穿梭機", +"因特网": "互聯網", +"網際網路": "互聯網", +"机器人": "機械人", +"機器人": "機械人", +"移动电话": "流動電話", +"行動電話": "流動電話", +"调制解调器": "調制解調器", +"數據機": "調制解調器", +"短信": "短訊", +"簡訊": "短訊", +"乍得": "乍得", +"查德": "乍得", +"也门": "也門", +"葉門": "也門", +"伯利兹": "伯利茲", +"貝里斯": "伯利茲", +"佛得角": "佛得角", +"維德角": "佛得角", +"克罗地亚": "克羅地亞", +"克羅埃西亞": "克羅地亞", +"冈比亚": "岡比亞", +"甘比亞": "岡比亞", +"几内亚比绍": "幾內亞比紹", +"幾內亞比索": "幾內亞比紹", +"列支敦士登": "列支敦士登", +"列支敦斯登": "列支敦士登", +"利比里亚": "利比里亞", +"賴比瑞亞": "利比里亞", +"加纳": "加納", +"迦納": "加納", +"加蓬": "加蓬", +"加彭": "加蓬", +"博茨瓦纳": "博茨瓦納", +"波札那": "博茨瓦納", +"卡塔尔": "卡塔爾", +"卡達": "卡塔爾", +"卢旺达": "盧旺達", +"盧安達": "盧旺達", +"危地马拉": "危地馬拉", +"瓜地馬拉": "危地馬拉", +"厄瓜多尔": "厄瓜多爾", +"厄瓜多": "厄瓜多爾", +"厄立特里亚": "厄立特里亞", +"厄利垂亞": "厄立特里亞", +"吉布提": "吉布堤", +"吉布地": "吉布堤", +"哥斯达黎加": "哥斯達黎加", +"哥斯大黎加": "哥斯達黎加", +"图瓦卢": "圖瓦盧", +"吐瓦魯": "圖瓦盧", +"圣卢西亚": "聖盧西亞", +"聖露西亞": "聖盧西亞", +"圣基茨和尼维斯": "聖吉斯納域斯", +"聖克里斯多福及尼維斯": "聖吉斯納域斯", +"圣文森特和格林纳丁斯": "聖文森特和格林納丁斯", +"聖文森及格瑞那丁": "聖文森特和格林納丁斯", +"圣马力诺": "聖馬力諾", +"聖馬利諾": "聖馬力諾", +"圭亚那": "圭亞那", +"蓋亞那": "圭亞那", +"坦桑尼亚": "坦桑尼亞", +"坦尚尼亞": "坦桑尼亞", +"埃塞俄比亚": "埃塞俄比亞", +"衣索匹亞": "埃塞俄比亞", +"衣索比亞": "埃塞俄比亞", +"基里巴斯": "基里巴斯", +"吉里巴斯": "基里巴斯", +"狮子山": "獅子山", +"塞普勒斯": "塞浦路斯", +"塞舌尔": "塞舌爾", +"塞席爾": "塞舌爾", +"多米尼加": "多明尼加共和國", +"多明尼加": "多明尼加共和國", +"多米尼加联邦": "多明尼加聯邦", +"多米尼克": "多明尼加聯邦", +"安提瓜和巴布达": "安提瓜和巴布達", +"安地卡及巴布達": "安提瓜和巴布達", +"尼日利亚": "尼日利亞", +"奈及利亞": "尼日利亞", +"尼日尔": "尼日爾", +"尼日": "尼日爾", +"巴巴多斯": "巴巴多斯", +"巴貝多": "巴巴多斯", +"巴布亚新几内亚": "巴布亞新畿內亞", +"巴布亞紐幾內亞": "巴布亞新畿內亞", +"布基纳法索": "布基納法索", +"布吉納法索": "布基納法索", +"布隆迪": "布隆迪", +"蒲隆地": "布隆迪", +"義大利": "意大利", +"所罗门群岛": "所羅門群島", +"索羅門群島": "所羅門群島", +"斯威士兰": "斯威士蘭", +"史瓦濟蘭": "斯威士蘭", +"斯洛文尼亚": "斯洛文尼亞", +"斯洛維尼亞": "斯洛文尼亞", +"新西兰": "新西蘭", +"紐西蘭": "新西蘭", +"格林纳达": "格林納達", +"格瑞那達": "格林納達", +"格鲁吉亚": "喬治亞", +"格魯吉亞": "喬治亞", +"梵蒂冈": "梵蒂岡", +"毛里塔尼亚": "毛里塔尼亞", +"茅利塔尼亞": "毛里塔尼亞", +"毛里求斯": "毛里裘斯", +"模里西斯": "毛里裘斯", +"沙烏地阿拉伯": "沙特阿拉伯", +"波斯尼亚和黑塞哥维那": "波斯尼亞黑塞哥維那", +"波士尼亞赫塞哥維納": "波斯尼亞黑塞哥維那", +"津巴布韦": "津巴布韋", +"辛巴威": "津巴布韋", +"洪都拉斯": "洪都拉斯", +"宏都拉斯": "洪都拉斯", +"特立尼达和托巴哥": "特立尼達和多巴哥", +"千里達托貝哥": "特立尼達和多巴哥", +"瑙鲁": "瑙魯", +"諾魯": "瑙魯", 
+"瓦努阿图": "瓦努阿圖", +"萬那杜": "瓦努阿圖", +"科摩罗": "科摩羅", +"葛摩": "科摩羅", +"索马里": "索馬里", +"索馬利亞": "索馬里", +"老挝": "老撾", +"寮國": "老撾", +"肯尼亚": "肯雅", +"肯亞": "肯雅", +"莫桑比克": "莫桑比克", +"莫三比克": "莫桑比克", +"莱索托": "萊索托", +"賴索托": "萊索托", +"贝宁": "貝寧", +"貝南": "貝寧", +"赞比亚": "贊比亞", +"尚比亞": "贊比亞", +"阿塞拜疆": "阿塞拜疆", +"亞塞拜然": "阿塞拜疆", +"阿拉伯联合酋长国": "阿拉伯聯合酋長國", +"阿拉伯聯合大公國": "阿拉伯聯合酋長國", +"马尔代夫": "馬爾代夫", +"馬爾地夫": "馬爾代夫", +"馬利共和國": "馬里共和國", +"方便面": "即食麵", +"快速面": "即食麵", +"速食麵": "即食麵", +"泡麵": "即食麵", +"土豆": "馬鈴薯", +"华乐": "中樂", +"民乐": "中樂", +"計程車": "的士", +"出租车": "的士", +"公車": "巴士", +"自行车": "單車", +"犬只": "狗隻", +"台球": "桌球", +"撞球": "桌球", +"冰淇淋": "雪糕", +"賓士": "平治", +"捷豹": "積架", +"福斯": "福士", +"雪铁龙": "先進", +"雪鐵龍": "先進", +"沃尓沃": "富豪", +"马自达": "萬事得", +"馬自達": "萬事得", +"寶獅": "標致", +"拿破崙": "拿破侖", +"布什": "布殊", +"布希": "布殊", +"克林顿": "克林頓", +"柯林頓": "克林頓", +"萨达姆": "薩達姆", +"海珊": "侯賽因", +"侯赛因": "侯賽因", +"大卫·贝克汉姆": "大衛碧咸", +"迈克尔·欧文": "米高奧雲", +"珍妮弗·卡普里亚蒂": "卡佩雅蒂", +"马拉特·萨芬": "沙芬", +"迈克尔·舒马赫": "舒麥加", +"希特勒": "希特拉", +"狄安娜": "戴安娜", +"黛安娜": "戴安娜", +} + +zh2CN = { +"記憶體": "内存", +"預設": "默认", +"串列": "串行", +"乙太網": "以太网", +"點陣圖": "位图", +"常式": "例程", +"游標": "光标", +"光碟": "光盘", +"光碟機": "光驱", +"全形": "全角", +"共用": "共享", +"載入": "加载", +"半形": "半角", +"變數": "变量", +"雜訊": "噪声", +"因數": "因子", +"功能變數名稱": "域名", +"音效卡": "声卡", +"字型大小": "字号", +"字型檔": "字库", +"欄位": "字段", +"字元": "字符", +"存檔": "存盘", +"定址": "寻址", +"章節附註": "尾注", +"非同步": "异步", +"匯流排": "总线", +"括弧": "括号", +"介面": "接口", +"控制項": "控件", +"許可權": "权限", +"碟片": "盘片", +"矽片": "硅片", +"矽谷": "硅谷", +"硬碟": "硬盘", +"磁碟": "磁盘", +"磁軌": "磁道", +"程式控制": "程控", +"運算元": "算子", +"演算法": "算法", +"晶片": "芯片", +"晶元": "芯片", +"片語": "词组", +"軟碟機": "软驱", +"快閃記憶體": "快闪存储器", +"滑鼠": "鼠标", +"進位": "进制", +"互動式": "交互式", +"優先順序": "优先级", +"感測": "传感", +"攜帶型": "便携式", +"資訊理論": "信息论", +"迴圈": "循环", +"防寫": "写保护", +"分散式": "分布式", +"解析度": "分辨率", +"伺服器": "服务器", +"等於": "等于", +"區域網": "局域网", +"巨集": "宏", +"掃瞄器": "扫瞄仪", +"寬頻": "宽带", +"資料庫": "数据库", +"乳酪": "奶酪", +"鉅賈": "巨商", +"手電筒": "手电", +"萬曆": "万历", +"永曆": "永历", +"辭彙": "词汇", +"母音": "元音", +"自由球": "任意球", +"頭槌": "头球", +"進球": "入球", +"顆進球": "粒入球", +"射門": "打门", +"蓋火鍋": "火锅盖帽", +"印表機": "打印机", +"打印機": "打印机", +"位元組": "字节", +"字節": "字节", +"列印": "打印", +"打印": "打印", +"硬體": "硬件", +"二極體": "二极管", +"二極管": "二极管", +"三極體": "三极管", +"三極管": "三极管", +"數位": "数码", +"數碼": "数码", +"軟體": "软件", +"軟件": "软件", +"網路": "网络", +"網絡": "网络", +"人工智慧": "人工智能", +"太空梭": "航天飞机", +"穿梭機": "航天飞机", +"網際網路": "因特网", +"互聯網": "因特网", +"機械人": "机器人", +"機器人": "机器人", +"行動電話": "移动电话", +"流動電話": "移动电话", +"調制解調器": "调制解调器", +"數據機": "调制解调器", +"短訊": "短信", +"簡訊": "短信", +"烏茲別克": "乌兹别克斯坦", +"查德": "乍得", +"乍得": "乍得", +"也門": "", +"葉門": "也门", +"伯利茲": "伯利兹", +"貝里斯": "伯利兹", +"維德角": "佛得角", +"佛得角": "佛得角", +"克羅地亞": "克罗地亚", +"克羅埃西亞": "克罗地亚", +"岡比亞": "冈比亚", +"甘比亞": "冈比亚", +"幾內亞比紹": "几内亚比绍", +"幾內亞比索": "几内亚比绍", +"列支敦斯登": "列支敦士登", +"列支敦士登": "列支敦士登", +"利比里亞": "利比里亚", +"賴比瑞亞": "利比里亚", +"加納": "加纳", +"迦納": "加纳", +"加彭": "加蓬", +"加蓬": "加蓬", +"博茨瓦納": "博茨瓦纳", +"波札那": "博茨瓦纳", +"卡塔爾": "卡塔尔", +"卡達": "卡塔尔", +"盧旺達": "卢旺达", +"盧安達": "卢旺达", +"危地馬拉": "危地马拉", +"瓜地馬拉": "危地马拉", +"厄瓜多爾": "厄瓜多尔", +"厄瓜多": "厄瓜多尔", +"厄立特里亞": "厄立特里亚", +"厄利垂亞": "厄立特里亚", +"吉布堤": "吉布提", +"吉布地": "吉布提", +"哈薩克": "哈萨克斯坦", +"哥斯達黎加": "哥斯达黎加", +"哥斯大黎加": "哥斯达黎加", +"圖瓦盧": "图瓦卢", +"吐瓦魯": "图瓦卢", +"土庫曼": "土库曼斯坦", +"聖盧西亞": "圣卢西亚", +"聖露西亞": "圣卢西亚", +"聖吉斯納域斯": "圣基茨和尼维斯", +"聖克里斯多福及尼維斯": "圣基茨和尼维斯", +"聖文森特和格林納丁斯": "圣文森特和格林纳丁斯", +"聖文森及格瑞那丁": "圣文森特和格林纳丁斯", +"聖馬力諾": "圣马力诺", +"聖馬利諾": "圣马力诺", +"圭亞那": "圭亚那", +"蓋亞那": "圭亚那", +"坦桑尼亞": "坦桑尼亚", +"坦尚尼亞": "坦桑尼亚", +"埃塞俄比亞": "埃塞俄比亚", +"衣索匹亞": "埃塞俄比亚", +"衣索比亞": "埃塞俄比亚", +"吉里巴斯": "基里巴斯", +"基里巴斯": "基里巴斯", +"塔吉克": "塔吉克斯坦", +"塞拉利昂": "塞拉利昂", +"塞普勒斯": "塞浦路斯", 
+"塞浦路斯": "塞浦路斯", +"塞舌爾": "塞舌尔", +"塞席爾": "塞舌尔", +"多明尼加共和國": "多米尼加", +"多明尼加": "多米尼加", +"多明尼加聯邦": "多米尼加联邦", +"多米尼克": "多米尼加联邦", +"安提瓜和巴布達": "安提瓜和巴布达", +"安地卡及巴布達": "安提瓜和巴布达", +"尼日利亞": "尼日利亚", +"奈及利亞": "尼日利亚", +"尼日爾": "尼日尔", +"尼日": "尼日尔", +"巴貝多": "巴巴多斯", +"巴巴多斯": "巴巴多斯", +"巴布亞新畿內亞": "巴布亚新几内亚", +"巴布亞紐幾內亞": "巴布亚新几内亚", +"布基納法索": "布基纳法索", +"布吉納法索": "布基纳法索", +"蒲隆地": "布隆迪", +"布隆迪": "布隆迪", +"希臘": "希腊", +"帛琉": "帕劳", +"義大利": "意大利", +"意大利": "意大利", +"所羅門群島": "所罗门群岛", +"索羅門群島": "所罗门群岛", +"汶萊": "文莱", +"斯威士蘭": "斯威士兰", +"史瓦濟蘭": "斯威士兰", +"斯洛文尼亞": "斯洛文尼亚", +"斯洛維尼亞": "斯洛文尼亚", +"新西蘭": "新西兰", +"紐西蘭": "新西兰", +"格林納達": "格林纳达", +"格瑞那達": "格林纳达", +"格魯吉亞": "乔治亚", +"喬治亞": "乔治亚", +"梵蒂岡": "梵蒂冈", +"毛里塔尼亞": "毛里塔尼亚", +"茅利塔尼亞": "毛里塔尼亚", +"毛里裘斯": "毛里求斯", +"模里西斯": "毛里求斯", +"沙地阿拉伯": "沙特阿拉伯", +"沙烏地阿拉伯": "沙特阿拉伯", +"波斯尼亞黑塞哥維那": "波斯尼亚和黑塞哥维那", +"波士尼亞赫塞哥維納": "波斯尼亚和黑塞哥维那", +"津巴布韋": "津巴布韦", +"辛巴威": "津巴布韦", +"宏都拉斯": "洪都拉斯", +"洪都拉斯": "洪都拉斯", +"特立尼達和多巴哥": "特立尼达和托巴哥", +"千里達托貝哥": "特立尼达和托巴哥", +"瑙魯": "瑙鲁", +"諾魯": "瑙鲁", +"瓦努阿圖": "瓦努阿图", +"萬那杜": "瓦努阿图", +"溫納圖": "瓦努阿图", +"科摩羅": "科摩罗", +"葛摩": "科摩罗", +"象牙海岸": "科特迪瓦", +"突尼西亞": "突尼斯", +"索馬里": "索马里", +"索馬利亞": "索马里", +"老撾": "老挝", +"寮國": "老挝", +"肯雅": "肯尼亚", +"肯亞": "肯尼亚", +"蘇利南": "苏里南", +"莫三比克": "莫桑比克", +"莫桑比克": "莫桑比克", +"萊索托": "莱索托", +"賴索托": "莱索托", +"貝寧": "贝宁", +"貝南": "贝宁", +"贊比亞": "赞比亚", +"尚比亞": "赞比亚", +"亞塞拜然": "阿塞拜疆", +"阿塞拜疆": "阿塞拜疆", +"阿拉伯聯合酋長國": "阿拉伯联合酋长国", +"阿拉伯聯合大公國": "阿拉伯联合酋长国", +"南韓": "韩国", +"馬爾代夫": "马尔代夫", +"馬爾地夫": "马尔代夫", +"馬爾他": "马耳他", +"馬利共和國": "马里共和国", +"即食麵": "方便面", +"快速面": "方便面", +"速食麵": "方便面", +"泡麵": "方便面", +"笨豬跳": "蹦极跳", +"绑紧跳": "蹦极跳", +"冷盤": "凉菜", +"冷菜": "凉菜", +"散钱": "零钱", +"谐星": "笑星", +"夜学": "夜校", +"华乐": "民乐", +"中樂": "民乐", +"屋价": "房价", +"的士": "出租车", +"計程車": "出租车", +"公車": "公共汽车", +"單車": "自行车", +"節慶": "节日", +"芝士": "乾酪", +"狗隻": "犬只", +"士多啤梨": "草莓", +"忌廉": "奶油", +"桌球": "台球", +"撞球": "台球", +"雪糕": "冰淇淋", +"衞生": "卫生", +"衛生": "卫生", +"賓士": "奔驰", +"平治": "奔驰", +"積架": "捷豹", +"福斯": "大众", +"福士": "大众", +"雪鐵龍": "雪铁龙", +"萬事得": "马自达", +"馬自達": "马自达", +"寶獅": "标志", +"拿破崙": "拿破仑", +"布殊": "布什", +"布希": "布什", +"柯林頓": "克林顿", +"克林頓": "克林顿", +"薩達姆": "萨达姆", +"海珊": "萨达姆", +"梵谷": "凡高", +"大衛碧咸": "大卫·贝克汉姆", +"米高奧雲": "迈克尔·欧文", +"卡佩雅蒂": "珍妮弗·卡普里亚蒂", +"沙芬": "马拉特·萨芬", +"舒麥加": "迈克尔·舒马赫", +"希特拉": "希特勒", +"黛安娜": "戴安娜", +"希拉": "赫拉", +} + +zh2SG = { +"方便面": "快速面", +"速食麵": "快速面", +"即食麵": "快速面", +"蹦极跳": "绑紧跳", +"笨豬跳": "绑紧跳", +"凉菜": "冷菜", +"冷盤": "冷菜", +"零钱": "散钱", +"散紙": "散钱", +"笑星": "谐星", +"夜校": "夜学", +"民乐": "华乐", +"住房": "住屋", +"房价": "屋价", +"泡麵": "快速面", +}