diff --git a/Gemfile b/Gemfile index c9582ad..52c4b6c 100644 --- a/Gemfile +++ b/Gemfile @@ -1,3 +1,3 @@ source 'https://rubygems.org' gem 'github-pages', group: :jekyll_plugins -gem 'jekyll-scholar' \ No newline at end of file +gem 'jekyll-scholar', group: :jekyll_plugins \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index b341ecd..f9e2f9f 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,18 +1,19 @@ GEM remote: https://rubygems.org/ specs: - activesupport (4.2.10) - i18n (~> 0.7) + activesupport (6.0.3.3) + concurrent-ruby (~> 1.0, >= 1.0.2) + i18n (>= 0.7, < 2) minitest (~> 5.1) - thread_safe (~> 0.3, >= 0.3.4) tzinfo (~> 1.1) - addressable (2.5.2) - public_suffix (>= 2.0.2, < 4.0) + zeitwerk (~> 2.2, >= 2.2.2) + addressable (2.7.0) + public_suffix (>= 2.0.2, < 5.0) bibtex-ruby (4.4.7) latex-decode (~> 0.0) - citeproc (1.0.9) + citeproc (1.0.10) namae (~> 1.0) - citeproc-ruby (1.1.10) + citeproc-ruby (1.1.12) citeproc (~> 1.0, >= 1.0.9) csl (~> 1.5) coffee-script (2.4.1) @@ -20,145 +21,146 @@ GEM execjs coffee-script-source (1.11.1) colorator (1.1.0) - commonmarker (0.17.11) + commonmarker (0.17.13) ruby-enum (~> 0.5) - concurrent-ruby (1.0.5) - csl (1.5.0) + concurrent-ruby (1.1.7) + csl (1.5.1) namae (~> 1.0) - csl-styles (1.0.1.9) + csl-styles (1.0.1.10) csl (~> 1.0) - dnsruby (1.61.2) - addressable (~> 2.5) - em-websocket (0.5.1) + dnsruby (1.61.4) + simpleidn (~> 0.1) + em-websocket (0.5.2) eventmachine (>= 0.12.9) http_parser.rb (~> 0.6.0) - ethon (0.11.0) + ethon (0.12.0) ffi (>= 1.3.0) eventmachine (1.2.7) execjs (2.7.0) - faraday (0.15.2) + faraday (1.0.1) multipart-post (>= 1.2, < 3) - ffi (1.9.25) + ffi (1.13.1) forwardable-extended (2.6.0) - gemoji (3.0.0) - github-pages (191) - activesupport (= 4.2.10) - github-pages-health-check (= 1.8.1) - jekyll (= 3.7.3) - jekyll-avatar (= 0.6.0) + gemoji (3.0.1) + github-pages (208) + github-pages-health-check (= 1.16.1) + jekyll (= 3.9.0) + jekyll-avatar (= 0.7.0) jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.1.5) + jekyll-commonmark-ghpages (= 0.1.6) jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.10.0) + jekyll-feed (= 0.15.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.9.4) - jekyll-mentions (= 1.4.1) - jekyll-optional-front-matter (= 0.3.0) + jekyll-github-metadata (= 2.13.0) + jekyll-mentions (= 1.6.0) + jekyll-optional-front-matter (= 0.3.2) jekyll-paginate (= 1.1.0) - jekyll-readme-index (= 0.2.0) - jekyll-redirect-from (= 0.14.0) - jekyll-relative-links (= 0.5.3) - jekyll-remote-theme (= 0.3.1) + jekyll-readme-index (= 0.3.0) + jekyll-redirect-from (= 0.16.0) + jekyll-relative-links (= 0.6.1) + jekyll-remote-theme (= 0.4.2) jekyll-sass-converter (= 1.5.2) - jekyll-seo-tag (= 2.5.0) - jekyll-sitemap (= 1.2.0) - jekyll-swiss (= 0.4.0) + jekyll-seo-tag (= 2.6.1) + jekyll-sitemap (= 1.4.0) + jekyll-swiss (= 1.0.0) jekyll-theme-architect (= 0.1.1) jekyll-theme-cayman (= 0.1.1) jekyll-theme-dinky (= 0.1.1) - jekyll-theme-hacker (= 0.1.1) + jekyll-theme-hacker (= 0.1.2) jekyll-theme-leap-day (= 0.1.1) jekyll-theme-merlot (= 0.1.1) jekyll-theme-midnight (= 0.1.1) jekyll-theme-minimal (= 0.1.1) jekyll-theme-modernist (= 0.1.1) - jekyll-theme-primer (= 0.5.3) + jekyll-theme-primer (= 0.5.4) jekyll-theme-slate (= 0.1.1) jekyll-theme-tactile (= 0.1.1) jekyll-theme-time-machine (= 0.1.1) - jekyll-titles-from-headings (= 0.5.1) - jemoji (= 0.10.1) - kramdown (= 1.17.0) - liquid (= 4.0.0) - listen (= 3.1.5) + jekyll-titles-from-headings (= 0.5.3) + jemoji (= 0.12.0) + kramdown (= 2.3.0) + 
kramdown-parser-gfm (= 1.1.0) + liquid (= 4.0.3) mercenary (~> 0.3) - minima (= 2.5.0) - nokogiri (>= 1.8.2, < 2.0) - rouge (= 2.2.1) + minima (= 2.5.1) + nokogiri (>= 1.10.4, < 2.0) + rouge (= 3.23.0) terminal-table (~> 1.4) - github-pages-health-check (1.8.1) + github-pages-health-check (1.16.1) addressable (~> 2.3) dnsruby (~> 1.60) octokit (~> 4.0) - public_suffix (~> 2.0) + public_suffix (~> 3.0) typhoeus (~> 1.3) - html-pipeline (2.8.4) + html-pipeline (2.14.0) activesupport (>= 2) nokogiri (>= 1.4) http_parser.rb (0.6.0) i18n (0.9.5) concurrent-ruby (~> 1.0) - jekyll (3.7.3) + jekyll (3.9.0) addressable (~> 2.4) colorator (~> 1.0) em-websocket (~> 0.5) i18n (~> 0.7) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) - kramdown (~> 1.14) + kramdown (>= 1.17, < 3) liquid (~> 4.0) mercenary (~> 0.3.3) pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.6.0) - jekyll (~> 3.0) + jekyll-avatar (0.7.0) + jekyll (>= 3.0, < 5.0) jekyll-coffeescript (1.1.1) coffee-script (~> 2.2) coffee-script-source (~> 1.11.1) - jekyll-commonmark (1.2.0) + jekyll-commonmark (1.3.1) commonmarker (~> 0.14) - jekyll (>= 3.0, < 4.0) - jekyll-commonmark-ghpages (0.1.5) + jekyll (>= 3.7, < 5.0) + jekyll-commonmark-ghpages (0.1.6) commonmarker (~> 0.17.6) - jekyll-commonmark (~> 1) - rouge (~> 2) + jekyll-commonmark (~> 1.2) + rouge (>= 2.0, < 4.0) jekyll-default-layout (0.1.4) jekyll (~> 3.0) - jekyll-feed (0.10.0) - jekyll (~> 3.3) + jekyll-feed (0.15.0) + jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.9.4) - jekyll (~> 3.1) + jekyll-github-metadata (2.13.0) + jekyll (>= 3.4, < 5.0) octokit (~> 4.0, != 4.4.0) - jekyll-mentions (1.4.1) + jekyll-mentions (1.6.0) html-pipeline (~> 2.3) - jekyll (~> 3.0) - jekyll-optional-front-matter (0.3.0) - jekyll (~> 3.0) + jekyll (>= 3.7, < 5.0) + jekyll-optional-front-matter (0.3.2) + jekyll (>= 3.0, < 5.0) jekyll-paginate (1.1.0) - jekyll-readme-index (0.2.0) - jekyll (~> 3.0) - jekyll-redirect-from (0.14.0) - jekyll (~> 3.3) - jekyll-relative-links (0.5.3) - jekyll (~> 3.3) - jekyll-remote-theme (0.3.1) - jekyll (~> 3.5) - rubyzip (>= 1.2.1, < 3.0) + jekyll-readme-index (0.3.0) + jekyll (>= 3.0, < 5.0) + jekyll-redirect-from (0.16.0) + jekyll (>= 3.3, < 5.0) + jekyll-relative-links (0.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-remote-theme (0.4.2) + addressable (~> 2.0) + jekyll (>= 3.5, < 5.0) + jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) + rubyzip (>= 1.3.0, < 3.0) jekyll-sass-converter (1.5.2) sass (~> 3.4) - jekyll-scholar (5.14.0) + jekyll-scholar (5.16.0) bibtex-ruby (~> 4.0, >= 4.0.13) citeproc-ruby (~> 1.0) csl-styles (~> 1.0) jekyll (~> 3.0) - jekyll-seo-tag (2.5.0) - jekyll (~> 3.3) - jekyll-sitemap (1.2.0) - jekyll (~> 3.3) - jekyll-swiss (0.4.0) + jekyll-seo-tag (2.6.1) + jekyll (>= 3.3, < 5.0) + jekyll-sitemap (1.4.0) + jekyll (>= 3.7, < 5.0) + jekyll-swiss (1.0.0) jekyll-theme-architect (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) @@ -168,8 +170,8 @@ GEM jekyll-theme-dinky (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-hacker (0.1.1) - jekyll (~> 3.5) + jekyll-theme-hacker (0.1.2) + jekyll (> 3.5, < 5.0) jekyll-seo-tag (~> 2.0) jekyll-theme-leap-day (0.1.1) jekyll (~> 3.5) @@ -186,8 +188,8 @@ GEM jekyll-theme-modernist (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-theme-primer (0.5.3) - jekyll (~> 3.5) + jekyll-theme-primer (0.5.4) + jekyll (> 3.5, < 5.0) jekyll-github-metadata (~> 2.9) jekyll-seo-tag (~> 2.0) jekyll-theme-slate (0.1.1) @@ -199,62 +201,71 @@ GEM 
jekyll-theme-time-machine (0.1.1) jekyll (~> 3.5) jekyll-seo-tag (~> 2.0) - jekyll-titles-from-headings (0.5.1) - jekyll (~> 3.3) - jekyll-watch (2.0.0) + jekyll-titles-from-headings (0.5.3) + jekyll (>= 3.3, < 5.0) + jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.10.1) + jemoji (0.12.0) gemoji (~> 3.0) html-pipeline (~> 2.2) - jekyll (~> 3.0) - kramdown (1.17.0) + jekyll (>= 3.0, < 5.0) + kramdown (2.3.0) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) latex-decode (0.3.1) - liquid (4.0.0) - listen (3.1.5) - rb-fsevent (~> 0.9, >= 0.9.4) - rb-inotify (~> 0.9, >= 0.9.7) - ruby_dep (~> 1.2) + liquid (4.0.3) + listen (3.2.1) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) mercenary (0.3.6) mini_portile2 (2.4.0) - minima (2.5.0) - jekyll (~> 3.5) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.11.3) - multipart-post (2.0.0) + minitest (5.14.2) + multipart-post (2.1.1) namae (1.0.1) - nokogiri (1.10.8) + nokogiri (1.10.10) mini_portile2 (~> 2.4.0) - octokit (4.10.0) + octokit (4.18.0) + faraday (>= 0.9) sawyer (~> 0.8.0, >= 0.5.3) - pathutil (0.16.1) + pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (2.0.5) - rb-fsevent (0.10.3) - rb-inotify (0.9.10) - ffi (>= 0.5.0, < 2) - rouge (2.2.1) - ruby-enum (0.7.2) + public_suffix (3.1.1) + rb-fsevent (0.10.4) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.4) + rouge (3.23.0) + ruby-enum (0.8.0) i18n - ruby_dep (1.5.0) - rubyzip (2.0.0) - safe_yaml (1.0.4) - sass (3.5.7) + rubyzip (2.3.0) + safe_yaml (1.0.5) + sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.8.1) - addressable (>= 2.3.5, < 2.6) - faraday (~> 0.8, < 1.0) + sawyer (0.8.2) + addressable (>= 2.3.5) + faraday (> 0.8, < 2.0) + simpleidn (0.1.1) + unf (~> 0.1.4) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) thread_safe (0.3.6) - typhoeus (1.3.0) + typhoeus (1.4.0) ethon (>= 0.9.0) - tzinfo (1.2.5) + tzinfo (1.2.7) thread_safe (~> 0.1) - unicode-display_width (1.4.0) + unf (0.1.4) + unf_ext + unf_ext (0.0.7.7) + unicode-display_width (1.7.0) + zeitwerk (2.4.0) PLATFORMS ruby @@ -264,4 +275,4 @@ DEPENDENCIES jekyll-scholar BUNDLED WITH - 1.16.4 + 2.1.4 diff --git a/_bibliography/references.bib b/_bibliography/references.bib new file mode 100644 index 0000000..774f2ff --- /dev/null +++ b/_bibliography/references.bib @@ -0,0 +1,90 @@ +--- +--- +References +========== + + +@article{gnmt, + author = {Yonghui Wu and + Mike Schuster and + Zhifeng Chen and + Quoc V. 
Le and + Mohammad Norouzi and + Wolfgang Macherey and + Maxim Krikun and + Yuan Cao and + Qin Gao and + Klaus Macherey and + Jeff Klingner and + Apurva Shah and + Melvin Johnson and + Xiaobing Liu and + Lukasz Kaiser and + Stephan Gouws and + Yoshikiyo Kato and + Taku Kudo and + Hideto Kazawa and + Keith Stevens and + George Kurian and + Nishant Patil and + Wei Wang and + Cliff Young and + Jason Smith and + Jason Riesa and + Alex Rudnick and + Oriol Vinyals and + Greg Corrado and + Macduff Hughes and + Jeffrey Dean}, + title = {Google's Neural Machine Translation System: Bridging the Gap between + Human and Machine Translation}, + journal = {CoRR}, + volume = {abs/1609.08144}, + year = {2016}, + url = {http://arxiv.org/abs/1609.08144}, + archivePrefix = {arXiv}, + eprint = {1609.08144}, + timestamp = {Thu, 14 Mar 2019 09:34:18 +0100}, + biburl = {https://dblp.org/rec/journals/corr/WuSCLNMKCGMKSJL16.bib}, + bibsource = {dblp computer science bibliography, https://dblp.org} +} + +@misc{bahdanau2014neural, + abstract = {Neural machine translation is a recently proposed approach to machine +translation. Unlike the traditional statistical machine translation, the neural +machine translation aims at building a single neural network that can be +jointly tuned to maximize the translation performance. The models proposed +recently for neural machine translation often belong to a family of +encoder-decoders and consists of an encoder that encodes a source sentence into +a fixed-length vector from which a decoder generates a translation. In this +paper, we conjecture that the use of a fixed-length vector is a bottleneck in +improving the performance of this basic encoder-decoder architecture, and +propose to extend this by allowing a model to automatically (soft-)search for +parts of a source sentence that are relevant to predicting a target word, +without having to form these parts as a hard segment explicitly. With this new +approach, we achieve a translation performance comparable to the existing +state-of-the-art phrase-based system on the task of English-to-French +translation. Furthermore, qualitative analysis reveals that the +(soft-)alignments found by the model agree well with our intuition.}, + added-at = {2020-06-07T20:24:58.000+0200}, + author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua}, + biburl = {https://www.bibsonomy.org/bibtex/2713375898fd7d2477f6ab6dc3dd66c2c/jan.hofmann1}, + description = {[1409.0473] Neural Machine Translation by Jointly Learning to Align and Translate}, + interhash = {bb2ca011eeafccb0bd2505c9476dcd10}, + intrahash = {713375898fd7d2477f6ab6dc3dd66c2c}, + keywords = {thema:pyramid_scene_parsing}, + note = {cite arxiv:1409.0473Comment: Accepted at ICLR 2015 as oral presentation}, + timestamp = {2020-06-07T20:24:58.000+0200}, + title = {Neural Machine Translation by Jointly Learning to Align and Translate}, + url = {http://arxiv.org/abs/1409.0473}, + year = 2014 +} + +@misc{attention, + title={Attention Is All You Need}, + author={Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. 
Gomez and Lukasz Kaiser and Illia Polosukhin}, + year={2017}, + eprint={1706.03762}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} diff --git a/_config.yml b/_config.yml index 9b82b05..efd05fc 100644 --- a/_config.yml +++ b/_config.yml @@ -24,3 +24,5 @@ markdown: kramdown kramdown: toc_levels: 1..2 + +plugins: ['jekyll/scholar'] \ No newline at end of file diff --git a/_debug.yml b/_debug.yml index 20c27e7..66717f1 100644 --- a/_debug.yml +++ b/_debug.yml @@ -27,3 +27,5 @@ markdown: kramdown kramdown: toc_levels: 1..2 + +plugins: ['jekyll/scholar'] \ No newline at end of file diff --git a/_drafts/Benchmark1-scaling.md b/_drafts/Benchmark1-scaling.md index 1deefd9..89b2621 100644 --- a/_drafts/Benchmark1-scaling.md +++ b/_drafts/Benchmark1-scaling.md @@ -101,12 +101,12 @@ The definitions helm chart values can be found [here](https://mlbench.readthedoc ## Results * Epochs to Top-1 Validation Accuracy - - Validation Accuracy @ 1 + + Validation Accuracy @ 1 * Time to Top-1 Validation Accuracy - - Validation Accuracy @ 1 + + Validation Accuracy @ 1 diff --git a/_drafts/MLBench-benchmarking-mpi-speed.md b/_drafts/MLBench-benchmarking-mpi-speed.md index e308461..b2b61bb 100644 --- a/_drafts/MLBench-benchmarking-mpi-speed.md +++ b/_drafts/MLBench-benchmarking-mpi-speed.md @@ -17,8 +17,8 @@ In this experiment, we compare MPI P2P communication for CUDA Inter-Process Communication (IPC) improves communication between GPUs on the same node. In openmpi, one can use `--mca btl_smcuda_use_cuda_ipc` to turn on/off this functionality. We demostrate the influence of CUDA-IPC by sending/receiving a vector on a node with two GPUs. ### Results - - MPI Speed P2P + + MPI Speed P2P - The P2P communication between two nodes is bounded by network bandwidth (`7.5 Gbit/s` measured by `iperf`). Communicating large vectors on CPU/GPU have similar throughput. @@ -41,8 +41,8 @@ The connection between GPUs are PHB which traverss PCIe as well as a PCIe Host B ### Results The results of experiments are shown below. Note that the bandwidth here is calculated by dividing the vector size by the time it spent. The actual bandwidth depends on the implementation of all reduce. - - MPI Speed Collective + + MPI Speed Collective - The NCCL all reduce does not give better performance when the GPU per machine is 1 or 2. diff --git a/_drafts/MLBench-limits-lie-00.md b/_drafts/MLBench-limits-lie-00.md index 8c2aff4..b36c2e0 100644 --- a/_drafts/MLBench-limits-lie-00.md +++ b/_drafts/MLBench-limits-lie-00.md @@ -15,22 +15,22 @@ First create a cluster of two `n1-standard-4` instances with `limits.cpu=1000m` ### master/worker-0 node - - lie-00-resource-node1 + + lie-00-resource-node1 Only 2 pods are for mlbench: `release1-mlbench-master-6448bfb454-sxm2l` (`100m` CPU) and `release1-mlbench-worker-0` (`1000m` CPU). The rest of pods request `1161m` of CPU and `750MB` memory. The summary of resources on this node is (requests `2261m` CPU in total ) - - The MLBench Dashboard + + The MLBench Dashboard ### worker-1 node On worker-1 node, there are much less pods. - - lie-00-resource-node2 + + lie-00-resource-node2 So the amount of resources available is limited to the master node. In the previous setting we can allocate at most `3920-1161-100=2659m` for each worker. 
diff --git a/_layouts/default.html b/_layouts/default.html
index c99f6e6..aa76969 100644
--- a/_layouts/default.html
+++ b/_layouts/default.html
@@ -13,3 +13,13 @@
+
+
+
\ No newline at end of file
diff --git a/_posts/2018-09-07-introducing-mlbench.md b/_posts/2018-09-07-introducing-mlbench.md
index ff1ae9f..69f892a 100644
--- a/_posts/2018-09-07-introducing-mlbench.md
+++ b/_posts/2018-09-07-introducing-mlbench.md
@@ -8,8 +8,8 @@ excerpt_separator: 
 ---
 MLBench is a framework for distributed machine learning. Its purpose is to improve transparency, reproducibility, robustness, and to provide fair performance measures as well as reference implementations, helping adoption of distributed machine learning methods both in industry and in the academic community.
-
- The MLBench Dashboard
+
+ The MLBench Dashboard
diff --git a/_posts/2020-09-08-communication-backend-comparison.md b/_posts/2020-09-08-communication-backend-comparison.md
index 40212ca..6de3f26 100644
--- a/_posts/2020-09-08-communication-backend-comparison.md
+++ b/_posts/2020-09-08-communication-backend-comparison.md
@@ -73,8 +73,8 @@ and can be sped up using distributed training.
 #### CPU
 In the graph below, we compare the speeds taken to perform an `all reduce` operation between 2, 4 and 8 workers, of `Float16` and `Float32` CPU tensors.
-
- Backend performance comparison (CPU tensors)
+
+ Backend performance comparison (CPU tensors)
 ##### Key differences
@@ -88,8 +88,8 @@ In the graph below, we compare the speeds taken to perform an `all reduce` opera
 We now compare the speeds for GPU tensors. Here, we have the addition of NCCL in the comparison.
-
- Backend performance comparison (GPU tensors)
+
+ Backend performance comparison (GPU tensors)
 ##### Key differences
diff --git a/_posts/2020-10-02-nlp-translation.md b/_posts/2020-10-02-nlp-translation.md
new file mode 100644
index 0000000..2405544
--- /dev/null
+++ b/_posts/2020-10-02-nlp-translation.md
@@ -0,0 +1,234 @@
+---
+layout: post
+title: NLP Translation tasks, results discussion
+author: e_hoelzl
+published: true
+tags: [performance, results]
+excerpt_separator: 
+---
+
+The popularity and relevance of Natural Language Processing (NLP) may come from the
+fascination of teaching machines to understand and assimilate human language,
+and of using them as tools to complement and facilitate our everyday lives.
+
+Machine translation is one branch of NLP, and consists of building automated models capable of
+translating text from one language to another almost instantaneously.
+
+In this blog post, we analyze how distributed learning improves the training time of two different machine translation models:
+an LSTM variant (GNMT) and an attention-based model (Transformer).
+
+
+
+These models present two main limitations that make training very time-consuming:
+ - They need millions of data points to reach acceptable performance.
+ - The models are quite large (hundreds of millions of parameters), and computations take significant time compared to simpler models.
+
+Each of these problems can be addressed through distribution:
+ - Distribute the data over multiple machines (data-parallel).
+ - Distribute the computations for one data point over multiple cores (compute-parallel); this requires the model to be parallelizable.
+
+Based on these limitations, we can divide the processing of data points over multiple workers,
+or even subdivide the computations required to process a single data point.
+In our experiments, we focus on dividing the data (data-parallel).
+We plan to extend these results to model-parallel training in the future.
+
+
+## Models
+
+First, let's have a quick look at the models' architectures to understand their scale.
+
+### LSTM
+The LSTM variant we implemented was designed by Google {% cite gnmt %} and is called Google Neural Machine Translation (GNMT).
+The architecture is shown in the figure below.
+
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/gnmt.png)
+ *GNMT Architecture*
+
+The left side is the encoder network and the right side is the decoder, connected via the attention module.
+The first encoder LSTM layer is bi-directional, and the others are uni-directional.
+Residual connections start from the layer third from the bottom in the encoder and decoder.
+
+This model follows the sequence-to-sequence learning framework, and uses stacked LSTM layers with residual connections in the encoder and decoder modules.
+The residual connections allow for deeper LSTM stacks; without residuals, the stack typically suffers from vanishing/exploding
+gradients when too many layers are used.
+The attention module is based on the one described in {% cite bahdanau2014neural %}.
+
+In our implementation, the encoder and decoder each have 4 stacked LSTM layers with residual connections and a hidden size of 1024.
+This gives a model with a total of 160,671,297 trainable parameters.
+
+### Transformer
+
+This model was first published in {% cite attention %}, and completely forgoes recurrence, relying entirely on self-attention
+mechanisms to perform sequence modelling and translation.
+
+The Transformer uses multi-head attention: instead of computing the attention once, it runs the scaled dot-product attention multiple times
+in parallel.
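+To make this concrete, here is a minimal PyTorch sketch of the scaled dot-product attention computed by each head. This is an illustration only, not the MLBench implementation; the names and shapes are our own choices for the example.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def scaled_dot_product_attention(q, k, v, mask=None):
+    """One attention head: softmax(Q K^T / sqrt(d_k)) V.
+
+    q, k, v: tensors of shape (batch, seq_len, d_k).
+    """
+    d_k = q.size(-1)
+    # Similarity of every query with every key, scaled by sqrt(d_k).
+    scores = q @ k.transpose(-2, -1) / d_k ** 0.5
+    if mask is not None:
+        # Masked positions (e.g. future tokens in the decoder) are excluded.
+        scores = scores.masked_fill(mask == 0, float("-inf"))
+    weights = F.softmax(scores, dim=-1)
+    return weights @ v
+
+# Multi-head attention runs this on several learned projections of the
+# same inputs in parallel (16 heads in our case), then concatenates the results.
+q = k = v = torch.randn(2, 10, 64)           # (batch, seq_len, d_k)
+out = scaled_dot_product_attention(q, k, v)  # shape (2, 10, 64)
+```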
+The figure below shows an overview of the architecture.
+
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/transformer.png)
+ *Transformer Architecture*
+
+Our implementation follows the original one described in the paper: the encoder and decoder each have 6 identical layers.
+The layers are composed of:
+- Encoder layers: multi-head attention, followed by a position-wise feed-forward layer (with residual connections).
+- Decoder layers: similar to the encoder layers, but with an additional multi-head attention layer that attends over the encoder output.
+
+All multi-head attention modules have 16 heads, for both encoder and decoder layers. This results in a model with a total of 210,808,832 parameters.
+
+
+## Training
+
+### Loss Function
+For both models, we use the Negative Log-Likelihood (NLL) loss with label smoothing.
+The models output a probability for each word of the vocabulary for the translated sentence.
+From this, we can compute $$\text{NLLLoss}(\mathbf{\hat{y}}, \mathbf{y})$$, where $$\mathbf{\hat{y}}$$ is the model output and $$\mathbf{y}$$ is the target.
+
+$$ \text{SmoothLoss} = -\,\text{mean}\big(\log \text{softmax}(\mathbf{\hat{y}})\big) $$
+
+$$ \text{Loss} = \text{confidence} \cdot \text{NLLLoss} + \text{smoothing} \cdot \text{SmoothLoss} $$
+
+where $$\text{confidence} = 1 - \text{smoothing}$$. The smoothing is set to 0.1 for both tasks.
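+In code, this loss corresponds to the following minimal PyTorch sketch. It only illustrates the two formulas above and is not the MLBench implementation; in particular, padding tokens would need to be masked out in practice.
+
+```python
+import torch
+import torch.nn.functional as F
+
+def smoothed_nll_loss(logits, target, smoothing=0.1):
+    """Label-smoothed loss: confidence * NLLLoss + smoothing * SmoothLoss.
+
+    logits: (batch, vocab_size) raw model outputs.
+    target: (batch,) indices of the reference words.
+    """
+    log_probs = F.log_softmax(logits, dim=-1)
+    nll = F.nll_loss(log_probs, target)  # NLLLoss(y_hat, y)
+    smooth = -log_probs.mean()           # SmoothLoss = -mean(log softmax(y_hat))
+    confidence = 1.0 - smoothing
+    return confidence * nll + smoothing * smooth
+
+logits = torch.randn(8, 32000)           # e.g. a 32k-word vocabulary
+target = torch.randint(0, 32000, (8,))
+loss = smoothed_nll_loss(logits, target)
+```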
+### Optimization
+As we have seen above, both models have a very large number of parameters to train.
+This can be an issue when using GPUs, as the model needs to fit in memory, and back-propagation requires at least twice the model's size in memory to work.
+
+For example:
+- The Transformer has roughly 200 million trainable parameters. In full precision (`Float32`), this amounts to 800 MB just for storing the weights.
+- The forward pass needs to store each layer's outputs for back-propagation, adding another 800 MB.
+- Each gradient tensor sent to or received from another worker is also 800 MB; for 4 workers this results in 3.2 GB.
+- These large tensors also take longer to transmit.
+- Back-propagation requires at least 3 to 4 times the memory footprint of the model, so another ~3.2 GB.
+
+Adding these numbers up already gives a memory usage of ~8 GB, and in practice it is even higher, as CUDA and cuDNN also need their share of memory.
+In our experiments, 16 GB of GPU memory was far from enough to train these models in full precision.
+
+For this reason, instead of full precision, we use
+mixed-precision training, where most computations are done in `Float16`. We use a synchronous data-parallel version of `Adam`,
+where gradients are aggregated among all workers before the weights are updated.
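+The aggregation step can be sketched with `torch.distributed` as follows. This is a simplified illustration (no mixed precision or gradient fusion), not the actual MLBench implementation:
+
+```python
+import torch.distributed as dist
+
+# Assumes dist.init_process_group(...) was called on every worker.
+def synchronous_step(model, optimizer, loss):
+    """One synchronous data-parallel step: average gradients, then update."""
+    optimizer.zero_grad()
+    loss.backward()
+    world_size = dist.get_world_size()
+    for p in model.parameters():
+        if p.grad is not None:
+            # Sum this gradient over all workers, then divide to average it.
+            dist.all_reduce(p.grad, op=dist.ReduceOp.SUM)
+            p.grad /= world_size
+    optimizer.step()  # every worker applies the same averaged update
+```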
+### Datasets
+Both tasks use the same test set, but are trained on slightly different data sets:
+- The LSTM is trained on the English-to-German WMT16 dataset, comprising 3,975,116 translated sentences.
+- The Transformer is trained on the English-to-German WMT17 dataset, comprising 4,590,101 translated sentences.
+
+
+More details on both tasks can be found in our [documentation](https://mlbench.readthedocs.io/en/latest/benchmark-tasks.html#task-4-machine-translation).
+
+## Results
+
+Let us now get to the fun part: the results. As previously discussed, these models have long training times, and the aim of MLBench is to study the benefit of distribution.
+For reproducibility purposes, here is the hardware and software we used:
+- Cloud service: Google Cloud
+- Machine type: `n1-standard-4`
+- PyTorch 1.5.1
+- NVIDIA Tesla T4 GPU (1 per node)
+- 4 cores and 15 GB of RAM
+- NCCL communication backend
+
+The goal for both models is determined by the Bilingual Evaluation Understudy (BLEU) score:
+- The LSTM task stops when reaching a BLEU score of 24.0.
+- The Transformer task stops when reaching a BLEU score of 25.0.
+
+The models are trained on 1, 2, 4, 8 and 16 workers, and all step times are precisely measured to obtain an accurate speedup quantification.
+Speedups are computed with respect to the 1-worker case, and are intended to illustrate how well the task distributes.
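+Concretely, the two speedup definitions used in the plots below are computed as follows. The numbers in this snippet are made up for illustration; they are not measurements:
+
+```python
+# Hypothetical per-epoch times in seconds; `comm` is the aggregation
+# (communication) share of the total.
+total = {1: 5400.0, 2: 3000.0, 4: 1800.0, 8: 1200.0, 16: 900.0}
+comm = {1: 0.0, 2: 300.0, 4: 450.0, 8: 500.0, 16: 520.0}
+
+for n in sorted(total):
+    overall = total[1] / total[n]  # absolute speedup vs. one worker
+    # Compute-only speedup: communication time removed on both sides.
+    compute_only = (total[1] - comm[1]) / (total[n] - comm[n])
+    print(f"{n:2d} workers: overall {overall:.2f}x, compute-only {compute_only:.2f}x")
+```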
+### Overall Speedups
+
+The graphs below show the time speedups for the LSTM model and the Transformer model (respectively).
+
+ *GNMT Speedups*
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4a_speedups.png)
+
+ *Transformer Speedups*
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4b_speedups.png)
+
+In each figure, the left graph shows the absolute speedups with respect to one worker, and the right one omits
+communication times from the speedup. This lets us isolate the effect of communication.
+
+A few interesting points:
+- Overall speedups follow a sub-linear pattern, while compute-only speedups are roughly linear.
+- Scaling the number of compute nodes gives nearly perfect scaling for both tasks (right plot).
+- Using more powerful hardware with faster interconnects (e.g. Tesla V100 nodes) would positively affect speedups. We currently have around 10 Gbps
+ connection speed between the workers, and such hardware could increase it by a factor of at least 10.
+
+As the level of distribution increases, communication becomes heavier and heavier, and attenuates the speedups quite significantly.
+
+### Step times
+
+The next figures show the total time spent in each step of training.
+
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4a_times.png)
+ *Step times for GNMT*
+
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4b_times.png)
+ *Step times for Transformer*
+
+- The top left graph in each figure shows the total training time: `total = compute + communication`.
+- Computation times are `compute = forward + backward + optimization + loss computation + init + end`.
+- Communication covers only the `aggregation` step, and is measured precisely so that it only accounts for the communication of tensors between workers.
+
+As expected, the compute steps take less time as we increase the number of nodes,
+while communication takes more and more time, growing sub-linearly. Looking at both graphs,
+we can see that `aggregation` times increase, but slowly, and reach a plateau quite quickly: the time spent communicating with 8 and 16 workers doesn't differ much.
+
+The compute steps follow the inverse pattern: a fast decrease in the beginning, then a slow plateau. The steps that benefit the most from distribution are
+back-propagation and the loss computation. This makes sense, as the batches get smaller on each machine.
+
+### Performance comparison
+
+Finally, the following figures show the share of time spent in each step of training. The *Aggregation* step corresponds to the aggregation of gradients between the workers,
+and is the only step where communication happens.
+
+#### LSTM
+
+ *Step shares for GNMT*
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4a_step_shares.png)
+
+Communication takes up a huge part of training as we increase distribution: around 80% of the time is spent sending tensors with 16 workers!
+This could be made faster with more appropriate connectivity between the workers (currently around 10 Gbps), which could reduce communication times by a factor of 10 or more.
+
+We can clearly see the limits of the hardware used here: communication quickly becomes the bottleneck, as very large tensors are shared between an increasing number of workers.
+Here, *All Reduce* aggregation of gradients is performed before the optimization step, which yields a lot of exchanged messages. It would be interesting to see how the time spent communicating
+tensors could be reduced by a more advanced aggregation technique (e.g. sharing with neighbors in a pre-defined topology).
+
+#### Transformer
+
+ *Step shares for Transformer*
+ ![test]({{ site.baseurl }}public/images/blog/2020-10-02-nlp-translation/task4b_step_shares.png)
+
+Compared to the LSTM model, the communication time ratio follows a similar path. However, as this model does not use LSTM layers, the overall time is lower.
+
+## Conclusion
+Both models solve an identical task, with almost identical datasets and a similar training algorithm, but use very different architectures. It is hence interesting to see how both react to distribution.
+The results show that both models benefit similarly from multiple workers, and both are very quickly bottlenecked by the communication hardware. Here, nodes communicate over a regular high-speed
+network, which mimics a real "distributed" training environment, where machines could be in different locations. With direct or higher-performance communication between the nodes (e.g. NVLink, or Google's Virtual NIC),
+we would expect overall speedups close to the compute-only speedups, i.e. close to linear speedups for both models.
+
+-----
+
+## References
+
+
+{% bibliography --cited %}
\ No newline at end of file
diff --git a/index.md b/index.md
index e6007fb..830ce24 100644
--- a/index.md
+++ b/index.md
@@ -66,10 +66,10 @@ Check out our blog!

Sponsors

diff --git a/public/css/lightbox.css b/public/css/lightbox.css index a95d45a..320badf 100644 --- a/public/css/lightbox.css +++ b/public/css/lightbox.css @@ -76,7 +76,7 @@ html.lb-disable-scrolling { width: 32px; height: 32px; margin: 0 auto; - background: url(../images/loading.gif) no-repeat; + background: url(../images/assets/loading.gif) no-repeat; } .lb-nav { @@ -107,7 +107,7 @@ html.lb-disable-scrolling { width: 34%; left: 0; float: left; - background: url(../images/prev.png) left 48% no-repeat; + background: url(../images/assets/prev.png) left 48% no-repeat; filter: progid:DXImageTransform.Microsoft.Alpha(Opacity=0); opacity: 0; -webkit-transition: opacity 0.6s; @@ -125,7 +125,7 @@ html.lb-disable-scrolling { width: 64%; right: 0; float: right; - background: url(../images/next.png) right 48% no-repeat; + background: url(../images/assets/next.png) right 48% no-repeat; filter: progid:DXImageTransform.Microsoft.Alpha(Opacity=0); opacity: 0; -webkit-transition: opacity 0.6s; @@ -189,7 +189,7 @@ html.lb-disable-scrolling { float: right; width: 30px; height: 30px; - background: url(../images/close.png) top right no-repeat; + background: url(../images/assets/close.png) top right no-repeat; text-align: right; outline: none; filter: progid:DXImageTransform.Microsoft.Alpha(Opacity=70); diff --git a/public/images/Create_Run.png b/public/images/Create_Run.png deleted file mode 100644 index 24651e9..0000000 Binary files a/public/images/Create_Run.png and /dev/null differ diff --git a/public/images/New_Run.png b/public/images/New_Run.png deleted file mode 100644 index f5be1b4..0000000 Binary files a/public/images/New_Run.png and /dev/null differ diff --git a/public/images/Pytorch_New_Run.png b/public/images/Pytorch_New_Run.png deleted file mode 100644 index 0776867..0000000 Binary files a/public/images/Pytorch_New_Run.png and /dev/null differ diff --git a/public/images/Run_Loss.png b/public/images/Run_Loss.png deleted file mode 100644 index a24d305..0000000 Binary files a/public/images/Run_Loss.png and /dev/null differ diff --git a/public/images/Run_Stdout.png b/public/images/Run_Stdout.png deleted file mode 100644 index 4b6cdf9..0000000 Binary files a/public/images/Run_Stdout.png and /dev/null differ diff --git a/public/images/Worker_Details.png b/public/images/Worker_Details.png deleted file mode 100644 index ca37d53..0000000 Binary files a/public/images/Worker_Details.png and /dev/null differ diff --git a/public/images/Facebook-Wordmark-Gray.png b/public/images/assets/Facebook-Wordmark-Gray.png similarity index 100% rename from public/images/Facebook-Wordmark-Gray.png rename to public/images/assets/Facebook-Wordmark-Gray.png diff --git a/public/images/Logo_EPFL.png b/public/images/assets/Logo_EPFL.png similarity index 100% rename from public/images/Logo_EPFL.png rename to public/images/assets/Logo_EPFL.png diff --git a/public/images/close.png b/public/images/assets/close.png similarity index 100% rename from public/images/close.png rename to public/images/assets/close.png diff --git a/public/images/google.png b/public/images/assets/google.png similarity index 100% rename from public/images/google.png rename to public/images/assets/google.png diff --git a/public/images/loading.gif b/public/images/assets/loading.gif similarity index 100% rename from public/images/loading.gif rename to public/images/assets/loading.gif diff --git a/public/images/next.png b/public/images/assets/next.png similarity index 100% rename from public/images/next.png rename to public/images/assets/next.png diff --git 
a/public/images/prev.png b/public/images/assets/prev.png similarity index 100% rename from public/images/prev.png rename to public/images/assets/prev.png diff --git a/public/images/pwc_logo.png b/public/images/assets/pwc_logo.png similarity index 100% rename from public/images/pwc_logo.png rename to public/images/assets/pwc_logo.png diff --git a/public/images/Dashboard_Index.png b/public/images/blog/2018-09-07-introducing-mlbench/Dashboard_Index.png similarity index 100% rename from public/images/Dashboard_Index.png rename to public/images/blog/2018-09-07-introducing-mlbench/Dashboard_Index.png diff --git a/public/images/backends_comparison_by_workers.png b/public/images/blog/2020-09-08-communication-backend-comparison/backends_comparison_by_workers.png similarity index 100% rename from public/images/backends_comparison_by_workers.png rename to public/images/blog/2020-09-08-communication-backend-comparison/backends_comparison_by_workers.png diff --git a/public/images/backends_comparison_by_workers_CUDA.png b/public/images/blog/2020-09-08-communication-backend-comparison/backends_comparison_by_workers_CUDA.png similarity index 100% rename from public/images/backends_comparison_by_workers_CUDA.png rename to public/images/blog/2020-09-08-communication-backend-comparison/backends_comparison_by_workers_CUDA.png diff --git a/public/images/blog/2020-10-02-nlp-translation/gnmt.png b/public/images/blog/2020-10-02-nlp-translation/gnmt.png new file mode 100644 index 0000000..77bb597 Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/gnmt.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4a_speedups.png b/public/images/blog/2020-10-02-nlp-translation/task4a_speedups.png new file mode 100644 index 0000000..b9a07b6 Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4a_speedups.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4a_step_shares.png b/public/images/blog/2020-10-02-nlp-translation/task4a_step_shares.png new file mode 100644 index 0000000..0a89a52 Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4a_step_shares.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4a_times.png b/public/images/blog/2020-10-02-nlp-translation/task4a_times.png new file mode 100644 index 0000000..c85bfaf Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4a_times.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4b_speedups.png b/public/images/blog/2020-10-02-nlp-translation/task4b_speedups.png new file mode 100644 index 0000000..dfd5176 Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4b_speedups.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4b_step_shares.png b/public/images/blog/2020-10-02-nlp-translation/task4b_step_shares.png new file mode 100644 index 0000000..41b5743 Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4b_step_shares.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/task4b_times.png b/public/images/blog/2020-10-02-nlp-translation/task4b_times.png new file mode 100644 index 0000000..dbc66cc Binary files /dev/null and b/public/images/blog/2020-10-02-nlp-translation/task4b_times.png differ diff --git a/public/images/blog/2020-10-02-nlp-translation/transformer.png b/public/images/blog/2020-10-02-nlp-translation/transformer.png new file mode 100644 index 0000000..b18ca7b Binary files /dev/null 
and b/public/images/blog/2020-10-02-nlp-translation/transformer.png differ diff --git a/public/images/lie-00-resource-node1-summary.png b/public/images/blog/drafts/lie-00-resource-node1-summary.png similarity index 100% rename from public/images/lie-00-resource-node1-summary.png rename to public/images/blog/drafts/lie-00-resource-node1-summary.png diff --git a/public/images/lie-00-resource-node1.png b/public/images/blog/drafts/lie-00-resource-node1.png similarity index 100% rename from public/images/lie-00-resource-node1.png rename to public/images/blog/drafts/lie-00-resource-node1.png diff --git a/public/images/lie-00-resource-node2.png b/public/images/blog/drafts/lie-00-resource-node2.png similarity index 100% rename from public/images/lie-00-resource-node2.png rename to public/images/blog/drafts/lie-00-resource-node2.png diff --git a/public/images/mpi-speed-collective.png b/public/images/blog/drafts/mpi-speed-collective.png similarity index 100% rename from public/images/mpi-speed-collective.png rename to public/images/blog/drafts/mpi-speed-collective.png diff --git a/public/images/mpi-speed-p2p.png b/public/images/blog/drafts/mpi-speed-p2p.png similarity index 100% rename from public/images/mpi-speed-p2p.png rename to public/images/blog/drafts/mpi-speed-p2p.png diff --git a/public/images/scaling-epoch-prec1.png b/public/images/blog/drafts/scaling-epoch-prec1.png similarity index 100% rename from public/images/scaling-epoch-prec1.png rename to public/images/blog/drafts/scaling-epoch-prec1.png diff --git a/public/images/scaling-time-prec1.png b/public/images/blog/drafts/scaling-time-prec1.png similarity index 100% rename from public/images/scaling-time-prec1.png rename to public/images/blog/drafts/scaling-time-prec1.png diff --git a/public/images/pytorch-tutorial-result.png b/public/images/pytorch-tutorial-result.png deleted file mode 100644 index d15383f..0000000 Binary files a/public/images/pytorch-tutorial-result.png and /dev/null differ diff --git a/public/images/scaling-prec1.png b/public/images/scaling-prec1.png deleted file mode 100644 index f0d2ba7..0000000 Binary files a/public/images/scaling-prec1.png and /dev/null differ diff --git a/public/images/scaling-throughput.png b/public/images/scaling-throughput.png deleted file mode 100644 index 482fd20..0000000 Binary files a/public/images/scaling-throughput.png and /dev/null differ diff --git a/public/images/scaling-time-cost.png b/public/images/scaling-time-cost.png deleted file mode 100644 index c082850..0000000 Binary files a/public/images/scaling-time-cost.png and /dev/null differ diff --git a/public/images/scaling-time-loss.png b/public/images/scaling-time-loss.png deleted file mode 100644 index 2e4721c..0000000 Binary files a/public/images/scaling-time-loss.png and /dev/null differ