@comment{book.bib.backup -- generated from slds-lmu/seminar_website_skeleton. GitHub page furniture and the copied line-number gutter were removed during review; only bibliography entries follow.}
@article{Glide2021,
  author     = {Alex Nichol and Prafulla Dhariwal and Aditya Ramesh and Pranav Shyam and Pamela Mishkin and Bob McGrew and Ilya Sutskever and Mark Chen},
  title      = {{GLIDE:} Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models},
  journal    = {CoRR},
  volume     = {abs/2112.10741},
  year       = {2021},
  eprinttype = {arXiv},
  eprint     = {2112.10741},
  url        = {https://arxiv.org/abs/2112.10741},
  timestamp  = {Tue, 04 Jan 2022 15:59:27 +0100},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2112-10741.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org},
}
@article{Lu2020,
  author       = {Yu, Jun and Li, Jing and Yu, Zhou and Huang, Qingming},
  title        = {Multimodal Transformer With Multi-View Visual Representation for Image Captioning},
  journaltitle = {IEEE Transactions on Circuits and Systems for Video Technology},
  volume       = {30},
  number       = {12},
  pages        = {4467--4480},
  year         = {2020},
  doi          = {10.1109/TCSVT.2019.2947482},
}
@misc{Fedus2021,
  author    = {Fedus, William and Zoph, Barret and Shazeer, Noam},
  title     = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
  year      = {2021},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2101.03961},
  url       = {https://arxiv.org/abs/2101.03961},
  keywords  = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
}
@misc{Mustafa2022,
  author    = {Mustafa, Basil and Riquelme, Carlos and Puigcerver, Joan and Jenatton, Rodolphe and Houlsby, Neil},
  title     = {Multimodal Contrastive Learning with {LIMoE}: the Language-Image Mixture of Experts},
  year      = {2022},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2206.02770},
  url       = {https://arxiv.org/abs/2206.02770},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), FOS: Computer and information sciences},
}
@article{Carion2020,
  author       = {Carion, Nicolas and Massa, Francisco and Synnaeve, Gabriel and Usunier, Nicolas and Kirillov, Alexander and Zagoruyko, Sergey},
  title        = {End-to-End Object Detection with Transformers},
  journaltitle = {CoRR},
  year         = {2020},
  eprinttype   = {arXiv},
  eprint       = {2005.12872},
  url          = {https://arxiv.org/abs/2005.12872},
}
@misc{Crawshaw2020,
  author    = {Crawshaw, Michael},
  title     = {Multi-Task Learning with Deep Neural Networks: A Survey},
  year      = {2020},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2009.09796},
  url       = {https://arxiv.org/abs/2009.09796},
  keywords  = {Machine Learning (cs.LG), Computer Vision and Pattern Recognition (cs.CV), Machine Learning (stat.ML), FOS: Computer and information sciences},
}
@article{Baltrusaitis2019,
  author       = {Baltrušaitis, Tadas and Ahuja, Chaitanya and Morency, Louis-Philippe},
  title        = {Multimodal Machine Learning: A Survey and Taxonomy},
  journaltitle = {IEEE Transactions on Pattern Analysis and Machine Intelligence},
  volume       = {41},
  number       = {2},
  pages        = {423--443},
  year         = {2019},
  doi          = {10.1109/TPAMI.2018.2798607},
}
@misc{Kaiser2017,
  author     = {Kaiser, Lukasz and Gomez, Aidan N. and Shazeer, Noam and Vaswani, Ashish and Parmar, Niki and Jones, Llion and Uszkoreit, Jakob},
  title      = {One Model To Learn Them All},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1706.05137},
  url        = {https://arxiv.org/pdf/1706.05137.pdf},
}
@inproceedings{Hu2021,
  author    = {Hu, Ronghang and Singh, Amanpreet},
  title     = {UniT: Multimodal Multitask Learning with a Unified Transformer},
  booktitle = {2021 IEEE/CVF International Conference on Computer Vision (ICCV)},
  pages     = {1419--1429},
  year      = {2021},
  doi       = {10.1109/ICCV48922.2021.00147},
}
@misc{Li2019,
  author    = {Li, Liunian Harold and Yatskar, Mark and Yin, Da and Hsieh, Cho-Jui and Chang, Kai-Wei},
  title     = {{VisualBERT}: A Simple and Performant Baseline for Vision and Language},
  year      = {2019},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1908.03557},
  url       = {https://arxiv.org/abs/1908.03557},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), Computation and Language (cs.CL), Machine Learning (cs.LG), FOS: Computer and information sciences},
}
@online{Dean21,
  author = {Dean, Jeff},
  title  = {Introducing Pathways: A next-generation AI architecture},
  year   = {2021},
  url    = {https://blog.google/technology/ai/introducing-pathways-next-generation-ai-architecture/},
}
@article{Krishna2017,
  author       = {Krishna, Ranjay and Zhu, Yuke and Groth, Oliver and Johnson, Justin and Hata, Kenji and Kravitz, Joshua and Chen, Stephanie and Kalantidis, Yannis and Li, Li-Jia and Shamma, David A. and Bernstein, Michael S. and Fei-Fei, Li},
  title        = {Visual Genome: Connecting Language and Vision Using Crowdsourced Dense Image Annotations},
  journaltitle = {Int. J. Comput. Vision},
  volume       = {123},
  number       = {1},
  pages        = {32--73},
  publisher    = {Kluwer Academic Publishers},
  location     = {USA},
  year         = {2017},
  issn         = {0920-5691},
  doi          = {10.1007/s11263-016-0981-7},
  url          = {https://doi.org/10.1007/s11263-016-0981-7},
  keywords     = {Language,Relationships,Attributes,Question answering,Scene graph,Crowdsourcing,Computer vision,Knowledge,Image,Objects,Dataset},
}
@inproceedings{Wang2022,
  author    = {Wang, Peng and Yang, An and Men, Rui and Lin, Junyang and Bai, Shuai and Li, Zhikang and Ma, Jianxin and Zhou, Chang and Zhou, Jingren and Yang, Hongxia},
  editor    = {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},
  title     = {{OFA}: Unifying Architectures, Tasks, and Modalities Through a Simple Sequence-to-Sequence Learning Framework},
  booktitle = {Proceedings of the 39th International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {162},
  pages     = {23318--23340},
  publisher = {PMLR},
  year      = {2022},
  url       = {https://proceedings.mlr.press/v162/wang22al.html},
  file      = {https://proceedings.mlr.press/v162/wang22al/wang22al.pdf},
}
@misc{Reed2022,
  author    = {Reed, Scott and Zolna, Konrad and Parisotto, Emilio and Colmenarejo, Sergio Gomez and Novikov, Alexander and Barth-Maron, Gabriel and Gimenez, Mai and Sulsky, Yury and Kay, Jackie and Springenberg, Jost Tobias and Eccles, Tom and Bruce, Jake and Razavi, Ali and Edwards, Ashley and Heess, Nicolas and Chen, Yutian and Hadsell, Raia and Vinyals, Oriol and Bordbar, Mahyar and de Freitas, Nando},
  title     = {A Generalist Agent},
  year      = {2022},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2205.06175},
  url       = {https://arxiv.org/abs/2205.06175},
  keywords  = {Artificial Intelligence (cs.AI), Computation and Language (cs.CL), Machine Learning (cs.LG), Robotics (cs.RO), FOS: Computer and information sciences},
}
@article{Chowdhery2022,
  author       = {Chowdhery, Aakanksha and Narang, Sharan and Devlin, Jacob and Bosma, Maarten and Mishra, Gaurav and Roberts, Adam and Barham, Paul and Chung, Hyung Won and Sutton, Charles and Gehrmann, Sebastian and Schuh, Parker and Shi, Kensen and Tsvyashchenko, Sasha and Maynez, Joshua and Rao, Abhishek and Barnes, Parker and Tay, Yi and Shazeer, Noam and Prabhakaran, Vinodkumar and Reif, Emily and Du, Nan and Hutchinson, Ben and Pope, Reiner and Bradbury, James and Austin, Jacob and Isard, Michael and Gur-Ari, Guy and Yin, Pengcheng and Duke, Toju and Levskaya, Anselm and Ghemawat, Sanjay and Dev, Sunipa and Michalewski, Henryk and Garcia, Xavier and Misra, Vedant and Robinson, Kevin and Fedus, Liam and Zhou, Denny and Ippolito, Daphne and Luan, David and Lim, Hyeontaek and Zoph, Barret and Spiridonov, Alexander and Sepassi, Ryan and Dohan, David and Agrawal, Shivani and Omernick, Mark and Dai, Andrew M. and Pillai, Thanumalayan Sankaranarayana and Pellat, Marie and Lewkowycz, Aitor and Moreira, Erica and Child, Rewon and Polozov, Oleksandr and Lee, Katherine and Zhou, Zongwei and Wang, Xuezhi and Saeta, Brennan and Diaz, Mark and Firat, Orhan and Catasta, Michele and Wei, Jason and Meier-Hellstern, Kathy and Eck, Douglas and Dean, Jeff and Petrov, Slav and Fiedel, Noah},
  title        = {{PaLM}: Scaling Language Modeling with Pathways},
  journaltitle = {arXiv preprint arXiv:2204.02311},
  year         = {2022},
  eprinttype   = {arXiv},
  eprint       = {2204.02311},
  url          = {https://arxiv.org/abs/2204.02311},
}
@misc{Yu2022,
  author    = {Yu, Jiahui and Xu, Yuanzhong and Koh, Jing Yu and Luong, Thang and Baid, Gunjan and Wang, Zirui and Vasudevan, Vijay and Ku, Alexander and Yang, Yinfei and Ayan, Burcu Karagol and Hutchinson, Ben and Han, Wei and Parekh, Zarana and Li, Xin and Zhang, Han and Baldridge, Jason and Wu, Yonghui},
  title     = {Scaling Autoregressive Models for Content-Rich Text-to-Image Generation},
  year      = {2022},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2206.10789},
  url       = {https://arxiv.org/abs/2206.10789},
  keywords  = {Computer Vision and Pattern Recognition (cs.CV), Machine Learning (cs.LG), FOS: Computer and information sciences},
}
@misc{Fernando2017,
  author     = {Fernando, Chrisantha and Banarse, Dylan and Blundell, Charles and Zwols, Yori and Ha, David and Rusu, Andrei A. and Pritzel, Alexander and Wierstra, Daan},
  title      = {PathNet: Evolution Channels Gradient Descent in Super Neural Networks},
  year       = {2017},
  eprinttype = {arXiv},
  eprint     = {1701.08734},
  url        = {https://arxiv.org/abs/1701.08734},
}
@inproceedings{He2016b,
  author    = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  editor    = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
  title     = {Identity Mappings in Deep Residual Networks},
  booktitle = {Computer Vision -- ECCV 2016},
  pages     = {630--645},
  publisher = {Springer International Publishing},
  location  = {Cham},
  year      = {2016},
  isbn      = {978-3-319-46493-0},
}
@inproceedings{Dean20,
  author    = {Dean, Jeffrey},
  title     = {1.1 The Deep Learning Revolution and Its Implications for Computer Architecture and Chip Design},
  booktitle = {2020 IEEE International Solid- State Circuits Conference - (ISSCC)},
  pages     = {8--14},
  year      = {2020},
  doi       = {10.1109/ISSCC19947.2020.9063049},
}
@report{Lewkowycz2022,
  author     = {Lewkowycz, Aitor and Andreassen, Anders and Dohan, David Martin and Dyer, Ethan S and Michalewski, Henryk and Ramasesh, Vinay and Slone, Ambrose and Anil, Cem and Schlag, Imanol and Gutman-Solo, Theo and Wu, Yuhuai and Neyshabur, Behnam and Gur-Ari, Guy and Misra, Vedant},
  title      = {Solving Quantitative Reasoning Problems with Language Models},
  type       = {techreport},
  year       = {2022},
  eprinttype = {arXiv},
  eprint     = {2206.14858},
  url        = {https://arxiv.org/abs/2206.14858},
}
@inproceedings{Riquelme2021,
  author    = {Riquelme, Carlos and Puigcerver, Joan and Mustafa, Basil and Neumann, Maxim and Jenatton, Rodolphe and Susano Pinto, André and Keysers, Daniel and Houlsby, Neil},
  editor    = {Ranzato, M. and Beygelzimer, A. and Dauphin, Y. and Liang, P.S. and Vaughan, J. Wortman},
  title     = {Scaling Vision with Sparse Mixture of Experts},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {34},
  pages     = {8583--8595},
  publisher = {Curran Associates, Inc.},
  year      = {2021},
  url       = {https://proceedings.neurips.cc/paper/2021/file/48237d9f2dea8c74c2a72126cf63d933-Paper.pdf},
}
@misc{Gesmundo2022a,
  author    = {Gesmundo, Andrea and Dean, Jeff},
  title     = {{muNet}: Evolving Pretrained Deep Neural Networks into Scalable Auto-tuning Multitask Systems},
  year      = {2022},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.2205.10937},
  url       = {https://arxiv.org/abs/2205.10937},
  keywords  = {Machine Learning (cs.LG), Artificial Intelligence (cs.AI), Computer Vision and Pattern Recognition (cs.CV), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
}
@misc{Steiner2021,
  author     = {Steiner, Andreas and Kolesnikov, Alexander and Zhai, Xiaohua and Wightman, Ross and Uszkoreit, Jakob and Beyer, Lucas},
  title      = {How to train your {ViT}? Data, Augmentation, and Regularization in Vision Transformers},
  year       = {2021},
  publisher  = {arXiv},
  doi        = {10.48550/ARXIV.2106.10270},
  eprinttype = {arXiv},
  eprint     = {2106.10270},
  url        = {https://arxiv.org/abs/2106.10270},
  keywords   = {Computer Vision and Pattern Recognition (cs.CV), Artificial Intelligence (cs.AI), Machine Learning (cs.LG), FOS: Computer and information sciences},
}
@inproceedings{Houlsby2019,
  author    = {Houlsby, Neil and Giurgiu, Andrei and Jastrzebski, Stanislaw and Morrone, Bruna and De Laroussilhe, Quentin and Gesmundo, Andrea and Attariyan, Mona and Gelly, Sylvain},
  editor    = {Chaudhuri, Kamalika and Salakhutdinov, Ruslan},
  title     = {Parameter-Efficient Transfer Learning for {NLP}},
  booktitle = {Proceedings of the 36th International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {97},
  pages     = {2790--2799},
  publisher = {PMLR},
  year      = {2019},
  url       = {https://proceedings.mlr.press/v97/houlsby19a.html},
  file      = {http://proceedings.mlr.press/v97/houlsby19a/houlsby19a.pdf},
  abstract  = {Fine-tuning large pretrained models is an effective transfer mechanism in NLP. However, in the presence of many downstream tasks, fine-tuning is parameter inefficient: an entire new model is required for every task. As an alternative, we propose transfer with adapter modules. Adapter modules yield a compact and extensible model; they add only a few trainable parameters per task, and new tasks can be added without revisiting previous ones. The parameters of the original network remain fixed, yielding a high degree of parameter sharing. To demonstrate adapter’s effectiveness, we transfer the recently proposed BERT Transformer model to $26$ diverse text classification tasks, including the GLUE benchmark. Adapters attain near state-of-the-art performance, whilst adding only a few parameters per task. On GLUE, we attain within $0.8\%$ of the performance of full fine-tuning, adding only $3.6\%$ parameters per task. By contrast, fine-tuning trains $100\%$ of the parameters per task.},
}
@inproceedings{Rebuffi2017,
  author    = {Rebuffi, Sylvestre-Alvise and Bilen, Hakan and Vedaldi, Andrea},
  editor    = {Guyon, I. and von Luxburg, U. and Bengio, S. and Wallach, H. and Fergus, R. and Vishwanathan, S. and Garnett, R.},
  title     = {Learning multiple visual domains with residual adapters},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {30},
  publisher = {Curran Associates, Inc.},
  year      = {2017},
  url       = {https://proceedings.neurips.cc/paper/2017/file/e7b24b112a44fdd9ee93bdf998c6ca0e-Paper.pdf},
}
@misc{Bilen2017,
  author = {Bilen, Hakan and Rebuffi, Sylvestre-Alvise and Jakab, Tomas},
  title  = {Visual domain decathlon},
  year   = {2017},
}
@article{Doerr2021,
  author       = {Doerr, Benjamin and Neumann, Frank},
  title        = {A Survey on Recent Progress in the Theory of Evolutionary Algorithms for Discrete Optimization},
  journaltitle = {ACM Trans. Evol. Learn. Optim.},
  volume       = {1},
  number       = {4},
  publisher    = {Association for Computing Machinery},
  location     = {New York, NY, USA},
  year         = {2021},
  issn         = {2688-299X},
  doi          = {10.1145/3472304},
  url          = {https://doi.org/10.1145/3472304},
  keywords     = {parameterized complexity,discrete optimization,evolutionary algorithms,estimation of distribution algorithms,Theory},
}
@article{Baeck1993,
  author       = {B{\"a}ck, Thomas and Schwefel, Hans-Paul},
  title        = {An Overview of Evolutionary Algorithms for Parameter Optimization},
  journaltitle = {Evolutionary Computation},
  volume       = {1},
  number       = {1},
  pages        = {1--23},
  year         = {1993},
  doi          = {10.1162/evco.1993.1.1.1},
}
@misc{Hinton2015,
  author    = {Hinton, Geoffrey and Vinyals, Oriol and Dean, Jeff},
  title     = {Distilling the Knowledge in a Neural Network},
  year      = {2015},
  publisher = {arXiv},
  doi       = {10.48550/ARXIV.1503.02531},
  url       = {https://arxiv.org/abs/1503.02531},
  keywords  = {Machine Learning (stat.ML), Machine Learning (cs.LG), Neural and Evolutionary Computing (cs.NE), FOS: Computer and information sciences},
}
@inproceedings{Shaazer2017,
  author    = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
  title     = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
  booktitle = {International Conference on Learning Representations (ICLR)},
  year      = {2017},
  url       = {https://openreview.net/pdf?id=B1ckMDqlg},
}
@article{Jordan1994,
  author       = {Jordan, Michael I. and Jacobs, Robert A.},
  title        = {Hierarchical Mixtures of Experts and the EM Algorithm},
  journaltitle = {Neural Computation},
  volume       = {6},
  number       = {2},
  pages        = {181--214},
  year         = {1994},
  doi          = {10.1162/neco.1994.6.2.181},
}
@article{Jacobs1991,
  author       = {Jacobs, Robert A. and Jordan, Michael I. and Nowlan, Steven J. and Hinton, Geoffrey E.},
  title        = {Adaptive Mixtures of Local Experts},
  journaltitle = {Neural Computation},
  volume       = {3},
  number       = {1},
  pages        = {79--87},
  year         = {1991},
  doi          = {10.1162/neco.1991.3.1.79},
}
@inproceedings{sennrich-etal-2016-neural,
  author    = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
  title     = {Neural Machine Translation of Rare Words with Subword Units},
  booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages     = {1715--1725},
  publisher = {Association for Computational Linguistics},
  location  = {Berlin, Germany},
  year      = {2016},
  doi       = {10.18653/v1/P16-1162},
  url       = {https://aclanthology.org/P16-1162},
}
@inproceedings{pmlr-v139-ramesh21a,
  author    = {Ramesh, Aditya and Pavlov, Mikhail and Goh, Gabriel and Gray, Scott and Voss, Chelsea and Radford, Alec and Chen, Mark and Sutskever, Ilya},
  editor    = {Meila, Marina and Zhang, Tong},
  title     = {Zero-Shot Text-to-Image Generation},
  booktitle = {Proceedings of the 38th International Conference on Machine Learning},
  series    = {Proceedings of Machine Learning Research},
  volume    = {139},
  pages     = {8821--8831},
  publisher = {PMLR},
  year      = {2021},
  url       = {https://proceedings.mlr.press/v139/ramesh21a.html},
  file      = {http://proceedings.mlr.press/v139/ramesh21a/ramesh21a.pdf},
}
@article{ResNet,
  author       = {He, Kaiming and Zhang, Xiangyu and Ren, Shaoqing and Sun, Jian},
  title        = {Deep Residual Learning for Image Recognition},
  journaltitle = {CoRR},
  year         = {2015},
  eprinttype   = {arXiv},
  eprint       = {1512.03385},
  url          = {http://arxiv.org/abs/1512.03385},
}
@inproceedings{mccoco,
  author    = {Lin, Tsung-Yi and Maire, Michael and Belongie, Serge and Hays, James and Perona, Pietro and Ramanan, Deva and Dollár, Piotr and Zitnick, C. Lawrence},
  title     = {Microsoft COCO: Common Objects in Context},
  booktitle = {Computer Vision -- ECCV 2014},
  pages     = {740--755},
  publisher = {Springer International Publishing},
  year      = {2014},
  isbn      = {978-3-319-10602-1},
}
@inproceedings{kudo-richardson-2018-sentencepiece,
  author    = {Kudo, Taku and Richardson, John},
  title     = {{S}entence{P}iece: A simple and language independent subword tokenizer and detokenizer for Neural Text Processing},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  pages     = {66--71},
  publisher = {Association for Computational Linguistics},
  location  = {Brussels, Belgium},
  year      = {2018},
  doi       = {10.18653/v1/D18-2012},
  url       = {https://aclanthology.org/D18-2012},
}
@article{Devlin2018,
  author       = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title        = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
  journaltitle = {arXiv preprint arXiv:1810.04805},
  year         = {2018},
  eprinttype   = {arXiv},
  eprint       = {1810.04805},
  eprintclass  = {cs.CL},
  file         = {:http\://arxiv.org/pdf/1810.04805v2:PDF},
  keywords     = {cs.CL},
}
@article{brown2020language,
  author       = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and others},
  title        = {Language models are few-shot learners},
  journaltitle = {Advances in neural information processing systems},
  volume       = {33},
  pages        = {1877--1901},
  year         = {2020},
}
@article{ImageNet,
  author       = {Russakovsky, Olga and Deng, Jia and Su, Hao and Krause, Jonathan and Satheesh, Sanjeev and Ma, Sean and Huang, Zhiheng and Karpathy, Andrej and Khosla, Aditya and Bernstein, Michael and Berg, Alexander C. and Fei-Fei, Li},
  title        = {ImageNet Large Scale Visual Recognition Challenge},
  journaltitle = {Int. J. Comput. Vision},
  volume       = {115},
  number       = {3},
  pages        = {211--252},
  publisher    = {Kluwer Academic Publishers},
  location     = {USA},
  year         = {2015},
  issn         = {0920-5691},
  doi          = {10.1007/s11263-015-0816-y},
  url          = {https://doi.org/10.1007/s11263-015-0816-y},
  keywords     = {Benchmark,Object detection,Large-scale,Object recognition,Dataset},
}
@article{dosovitskiy2020image,
  author       = {Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and others},
  title        = {An image is worth 16x16 words: Transformers for image recognition at scale},
  journaltitle = {arXiv preprint arXiv:2010.11929},
  year         = {2020},
}
@article{vaswani2017attention,
  author       = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {Ł}ukasz and Polosukhin, Illia},
  title        = {Attention is all you need},
  journaltitle = {Advances in neural information processing systems},
  volume       = {30},
  year         = {2017},
}
@inproceedings{deng2009imagenet,
  author       = {Deng, Jia and Dong, Wei and Socher, Richard and Li, Li-Jia and Li, Kai and Fei-Fei, Li},
  title        = {{ImageNet}: A large-scale hierarchical image database},
  booktitle    = {2009 IEEE conference on computer vision and pattern recognition},
  pages        = {248--255},
  organization = {IEEE},
  year         = {2009},
}
@article{parti,
  author       = {Yu, Jiahui and Xu, Yuanzhong and Koh, Jing and Luong, Thang and Baid, Gunjan and Vasudevan, Vijay and Ku, Alexander and Yang, Yinfei and Ayan, Burcu and Hutchinson, Ben and Han, Wei and Parekh, Zarana and Li, Xin and Zhang, Han and Baldridge, Jason and Wu, Yonghui},
  title        = {Scaling Autoregressive Models for Content-Rich Text-to-Image Generation},
  journaltitle = {arXiv preprint arXiv:2206.10789},
  year         = {2022},
  doi          = {10.48550/arXiv.2206.10789},
  eprinttype   = {arXiv},
  eprint       = {2206.10789},
  annotation   = {NOTE(review): same title and DOI as entry Yu2022 -- apparent duplicate; consider merging},
}
@inproceedings{lewis-etal-2020-bart,
  author    = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Veselin and Zettlemoyer, Luke},
  title     = {{BART}: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  pages     = {7871--7880},
  publisher = {Association for Computational Linguistics},
  location  = {Online},
  year      = {2020},
  doi       = {10.18653/v1/2020.acl-main.703},
  url       = {https://aclanthology.org/2020.acl-main.703},
}
@article{atari,
  author       = {Bellemare, Marc G. and Naddaf, Yavar and Veness, Joel and Bowling, Michael},
  title        = {The Arcade Learning Environment: An Evaluation Platform for General Agents},
  journaltitle = {J. Artif. Int. Res.},
  volume       = {47},
  number       = {1},
  pages        = {253--279},
  publisher    = {AI Access Foundation},
  location     = {El Segundo, CA, USA},
  year         = {2013},
  issn         = {1076-9757},
}
@online{darkMatter,
  author  = {LeCun, Yann and Misra, Ishan},
  title   = {Self-supervised learning: The dark matter of intelligence},
  year    = {2021},
  url     = {https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/},
  urldate = {2022-06-26},
}
@online{redditUsers,
  author  = {Barthel, Michael and Stocking, Galen and Holcomb, Jesse and Mitchell, Amy},
  title   = {Reddit news users more likely to be male, young and digital in their news preferences},
  year    = {2016},
  url     = {https://www.pewresearch.org/journalism/2016/02/25/reddit-news-users-more-likely-to-be-male-young-and-digital-in-their-news-preferences/},
  urldate = {2022-08-07},
}
@online{coco_eval,
  author  = {{Microsoft}},
  title   = {Evaluate:Detection},
  year    = {2019},
  url     = {https://cocodataset.org/#detection-eval},
  urldate = {2022-07-09},
}
@online{unsupBrain,
  author  = {Mineault, Patrick},
  title   = {Unsupervised models of the brain},
  year    = {2021},
  url     = {https://xcorr.net/2021/12/31/2021-in-review-unsupervised-brain-models/},
  urldate = {2022-06-26},
}
@article{zhuang2021unsupervised,
  author       = {Zhuang, Chengxu and Yan, Siming and Nayebi, Aran and Schrimpf, Martin and Frank, Michael C and DiCarlo, James J and Yamins, Daniel LK},
  title        = {Unsupervised neural network models of the ventral visual stream},
  journaltitle = {Proceedings of the National Academy of Sciences},
  volume       = {118},
  number       = {3},
  pages        = {e2014196118},
  publisher    = {National Acad Sciences},
  year         = {2021},
}
@article{liu2019roberta,
  author       = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin},
  title        = {{RoBERTa}: A robustly optimized {BERT} pretraining approach},
  journaltitle = {arXiv preprint arXiv:1907.11692},
  year         = {2019},
}
@article{bromley1993signature,
  author       = {Bromley, Jane and Guyon, Isabelle and LeCun, Yann and S{\"a}ckinger, Eduard and Shah, Roopak},
  title        = {Signature verification using a ``{Siamese}'' time delay neural network},
  journaltitle = {Advances in neural information processing systems},
  volume       = {6},
  year         = {1993},
}
@inproceedings{caron2021emerging,
  author    = {Caron, Mathilde and Touvron, Hugo and Misra, Ishan and Jégou, Hervé and Mairal, Julien and Bojanowski, Piotr and Joulin, Armand},
  title     = {Emerging properties in self-supervised vision transformers},
  booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
  pages     = {9650--9660},
  year      = {2021},
}
@inproceedings{mahajan2018exploring,
  author    = {Mahajan, Dhruv and Girshick, Ross and Ramanathan, Vignesh and He, Kaiming and Paluri, Manohar and Li, Yixuan and Bharambe, Ashwin and Van Der Maaten, Laurens},
  title     = {Exploring the limits of weakly supervised pretraining},
  booktitle = {Proceedings of the European conference on computer vision (ECCV)},
  pages     = {181--196},
  year      = {2018},
}
@article{kolesnikov2019large,
  author       = {Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Puigcerver, Joan and Yung, Jessica and Gelly, Sylvain and Houlsby, Neil},
  title        = {Large scale learning of general visual representations for transfer},
  journaltitle = {arXiv preprint arXiv:1912.11370},
  volume       = {2},
  number       = {8},
  publisher    = {arXiv},
  year         = {2019},
}
@article{rajpurkar2016squad,
  author       = {Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
  title        = {{SQuAD}: 100,000+ questions for machine comprehension of text},
  journaltitle = {arXiv preprint arXiv:1606.05250},
  year         = {2016},
}
@article{rajpurkar2018know,
  author       = {Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
  title        = {Know what you don't know: Unanswerable questions for {SQuAD}},
  journaltitle = {arXiv preprint arXiv:1806.03822},
  year         = {2018},
}
@article{srivastava2022beyond,
  author       = {Srivastava, Aarohi and Rastogi, Abhinav and Rao, Abhishek and Shoeb, Abu Awal Md and Abid, Abubakar and Fisch, Adam and Brown, Adam R and Santoro, Adam and Gupta, Aditya and Garriga-Alonso, Adrià and others},
  title        = {Beyond the Imitation Game: Quantifying and extrapolating the capabilities of language models},
  journaltitle = {arXiv preprint arXiv:2206.04615},
  year         = {2022},
}
@article{bowman2021will,
  author       = {Bowman, Samuel R and Dahl, George E},
  title        = {What Will it Take to Fix Benchmarking in Natural Language Understanding?},
  journaltitle = {arXiv preprint arXiv:2104.02145},
  year         = {2021},
}
@article{goodfellow2014explaining,
  author       = {Goodfellow, Ian J and Shlens, Jonathon and Szegedy, Christian},
  title        = {Explaining and harnessing adversarial examples},
  journaltitle = {arXiv preprint arXiv:1412.6572},
  year         = {2014},
}
@inproceedings{recht2019imagenet,
  author       = {Recht, Benjamin and Roelofs, Rebecca and Schmidt, Ludwig and Shankar, Vaishaal},
  title        = {Do {ImageNet} classifiers generalize to {ImageNet}?},
  booktitle    = {International Conference on Machine Learning},
  pages        = {5389--5400},
  organization = {PMLR},
  year         = {2019},
}
@article{beyer2020we,
  author       = {Beyer, Lucas and Hénaff, Olivier J and Kolesnikov, Alexander and Zhai, Xiaohua and van den Oord, A{\"a}ron},
  title        = {Are we done with {ImageNet}?},
  journaltitle = {arXiv preprint arXiv:2006.07159},
  year         = {2020},
}
@ARTICLE{li2022mask,
AUTHOR = {Li, Feng and Zhang, Hao and Liu, Shilong and Zhang, Lei and Ni, Lionel M and Shum, Heung-Yeung and others},
YEAR = {2022},
JOURNALTITLE = {arXiv preprint arXiv:2206.02777},
TITLE = {Mask DINO: Towards A Unified Transformer-based Framework for Object Detection and Segmentation},
}
@INPROCEEDINGS{koehn2005europarl,
YEAR = {2005},
AUTHOR = {Koehn, Philipp},
BOOKTITLE = {Proceedings of machine translation summit x: papers},
PAGES = {79--86},
TITLE = {Europarl: A parallel corpus for statistical machine translation},
}
@MISC{Gokaslan2019OpenWeb,
YEAR = {2019},
AUTHOR = {Gokaslan, Aaron and Cohen, Vanya},
TITLE = {OpenWebText Corpus},
}
@ARTICLE{xue2020mt5,
YEAR = {2020},
AUTHOR = {Xue, Linting and Constant, Noah and Roberts, Adam and Kale, Mihir and Al-Rfou, Rami and Siddhant, Aditya and Barua, Aditya and Raffel, Colin},
JOURNALTITLE = {arXiv preprint arXiv:2010.11934},
TITLE = {{mT5}: A massively multilingual pre-trained text-to-text transformer},
}
@ARTICLE{wenzek2019ccnet,
YEAR = {2019},
AUTHOR = {Wenzek, Guillaume and Lachaux, Marie-Anne and Conneau, Alexis and Chaudhary, Vishrav and Guzmán, Francisco and Joulin, Armand and Grave, Edouard},
JOURNALTITLE = {arXiv preprint arXiv:1911.00359},
TITLE = {{CCNet}: Extracting high quality monolingual datasets from web crawl data},
}
@ARTICLE{bandy2021addressing,
YEAR = {2021},
AUTHOR = {Bandy, Jack and Vincent, Nicholas},
JOURNALTITLE = {arXiv preprint arXiv:2105.05241},
TITLE = {Addressing ``documentation debt'' in machine learning research: A retrospective datasheet for {BookCorpus}},
}
@ARTICLE{gao2017knowledge,
YEAR = {2017},
AUTHOR = {Gao, Jiyang and Li, Zhen and Nevatia, Ram and others},
JOURNALTITLE = {arXiv preprint arXiv:1711.07607},
TITLE = {Knowledge concentration: Learning 100k object classifiers in a single {CNN}},
}
@INPROCEEDINGS{shao2019objects365,
YEAR = {2019},
AUTHOR = {Shao, Shuai and Li, Zeming and Zhang, Tianyuan and Peng, Chao and Yu, Gang and Zhang, Xiangyu and Li, Jing and Sun, Jian},
BOOKTITLE = {Proceedings of the IEEE/CVF international conference on computer vision},
PAGES = {8430--8439},
TITLE = {Objects365: A large-scale, high-quality dataset for object detection},
}
@ARTICLE{yuan2022wudaomm,
YEAR = {2022},
AUTHOR = {Yuan, Sha and Zhao, Shuai and Leng, Jiahong and Zhao, Xue and Zhao, Hanyu and Tang, Jie},
JOURNALTITLE = {arXiv preprint arXiv:2203.11480},
TITLE = {WuDaoMM: A large-scale Multi-Modal Dataset for Pre-training models},
}
@INPROCEEDINGS{srinivasan2021wit,
YEAR = {2021},
AUTHOR = {Srinivasan, Krishna and Raman, Karthik and Chen, Jiecao and Bendersky, Michael and Najork, Marc},
BOOKTITLE = {Proceedings of the 44th International ACM SIGIR Conference on Research and Development in Information Retrieval},
PAGES = {2443--2449},
TITLE = {{WIT}: Wikipedia-based image text dataset for multimodal multilingual machine learning},
}
@ARTICLE{tiedemann2018emerging,
AUTHOR = {Tiedemann, J{\"o}rg},
YEAR = {2018},
JOURNALTITLE = {arXiv preprint arXiv:1802.00273},
TITLE = {Emerging language spaces learned from massively multilingual corpora},
}
@ARTICLE{mayer2014creating,
AUTHOR = {Mayer, Thomas and Cysouw, Michael},
YEAR = {2014},
JOURNALTITLE = {Oceania},
NUMBER = {273},
PAGES = {40},
TITLE = {Creating a massively parallel Bible corpus},
VOLUME = {135},
}
@INPROCEEDINGS{zellers2019recognition,
YEAR = {2019},
AUTHOR = {Zellers, Rowan and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
BOOKTITLE = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
PAGES = {6720--6731},
TITLE = {From recognition to cognition: Visual commonsense reasoning},
}
@INPROCEEDINGS{antol2015vqa,
YEAR = {2015},
AUTHOR = {Antol, Stanislaw and Agrawal, Aishwarya and Lu, Jiasen and Mitchell, Margaret and Batra, Dhruv and Zitnick, C Lawrence and Parikh, Devi},
BOOKTITLE = {Proceedings of the IEEE international conference on computer vision},
PAGES = {2425--2433},
TITLE = {{VQA}: Visual question answering},
}
@INPROCEEDINGS{zhang2016yin,
YEAR = {2016},
AUTHOR = {Zhang, Peng and Goyal, Yash and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi},
BOOKTITLE = {Proceedings of the IEEE conference on computer vision and pattern recognition},
PAGES = {5014--5022},
TITLE = {Yin and yang: Balancing and answering binary visual questions},
}
@INPROCEEDINGS{goyal2017making,
YEAR = {2017},
AUTHOR = {Goyal, Yash and Khot, Tejas and Summers-Stay, Douglas and Batra, Dhruv and Parikh, Devi},
BOOKTITLE = {Proceedings of the IEEE conference on computer vision and pattern recognition},
PAGES = {6904--6913},
TITLE = {Making the {V} in {VQA} matter: Elevating the role of image understanding in visual question answering},
}
@INPROCEEDINGS{hudson2019gqa,
YEAR = {2019},
AUTHOR = {Hudson, Drew A and Manning, Christopher D},
BOOKTITLE = {Proceedings of the IEEE/CVF conference on computer vision and pattern recognition},
PAGES = {6700--6709},
TITLE = {{GQA}: A new dataset for real-world visual reasoning and compositional question answering},
}
@ARTICLE{shekhar2017foil,
YEAR = {2017},
AUTHOR = {Shekhar, Ravi and Pezzelle, Sandro and Klimovich, Yauhen and Herbelot, Aurélie and Nabi, Moin and Sangineto, Enver and Bernardi, Raffaella},
JOURNALTITLE = {arXiv preprint arXiv:1705.01359},
TITLE = {Foil it! find one mismatch between image and language caption},
}
@ARTICLE{ribeiro2020beyond,
YEAR = {2020},
AUTHOR = {Ribeiro, Marco Tulio and Wu, Tongshuang and Guestrin, Carlos and Singh, Sameer},
JOURNALTITLE = {arXiv preprint arXiv:2005.04118},
TITLE = {Beyond accuracy: Behavioral testing of {NLP} models with {CheckList}},
}
@INPROCEEDINGS{parcalabescu-etal-2022-valse,
YEAR = {2022},
AUTHOR = {Parcalabescu, Letitia and Cafagna, Michele and Muradjan, Lilitta and Frank, Anette and Calixto, Iacer and Gatt, Albert},
PUBLISHER = {Association for Computational Linguistics},
URL = {https://aclanthology.org/2022.acl-long.567},
BOOKTITLE = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
PAGES = {8253--8280},
TITLE = {{VALSE}: A Task-Independent Benchmark for Vision and Language Models Centered on Linguistic Phenomena},
}
@ARTICLE{sheng2019woman,
YEAR = {2019},
AUTHOR = {Sheng, Emily and Chang, Kai-Wei and Natarajan, Premkumar and Peng, Nanyun},
JOURNALTITLE = {arXiv preprint arXiv:1909.01326},
TITLE = {The woman worked as a babysitter: On biases in language generation},
}
@INPROCEEDINGS{dhamala2021bold,
YEAR = {2021},
AUTHOR = {Dhamala, Jwala and Sun, Tony and Kumar, Varun and Krishna, Satyapriya and Pruksachatkun, Yada and Chang, Kai-Wei and Gupta, Rahul},
BOOKTITLE = {Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency},
PAGES = {862--872},
TITLE = {{BOLD}: Dataset and metrics for measuring biases in open-ended language generation},
}
@ARTICLE{prabhu2020large,
YEAR = {2020},
AUTHOR = {Prabhu, Vinay Uday and Birhane, Abeba},
JOURNALTITLE = {arXiv preprint arXiv:2006.16923},
TITLE = {Large image datasets: A pyrrhic win for computer vision?},
}
@ARTICLE{birhane2021multimodal,
YEAR = {2021},
AUTHOR = {Birhane, Abeba and Prabhu, Vinay Uday and Kahembwe, Emmanuel},
JOURNALTITLE = {arXiv preprint arXiv:2110.01963},
TITLE = {Multimodal datasets: misogyny, pornography, and malignant stereotypes},
}
@ARTICLE{strubell2019energy,
YEAR = {2019},
AUTHOR = {Strubell, Emma and Ganesh, Ananya and McCallum, Andrew},
JOURNALTITLE = {arXiv preprint arXiv:1906.02243},
TITLE = {Energy and policy considerations for deep learning in {NLP}},
}
@ARTICLE{lottick2019energy,
YEAR = {2019},
AUTHOR = {Lottick, Kadan and Susai, Silvia and Friedler, Sorelle A and Wilson, Jonathan P},
JOURNALTITLE = {arXiv preprint arXiv:1911.08354},
TITLE = {Energy Usage Reports: Environmental awareness as part of algorithmic accountability},
}
@ARTICLE{henderson2020towards,
YEAR = {2020},
AUTHOR = {Henderson, Peter and Hu, Jieru and Romoff, Joshua and Brunskill, Emma and Jurafsky, Dan and Pineau, Joelle},
JOURNALTITLE = {Journal of Machine Learning Research},
NUMBER = {248},
PAGES = {1--43},
TITLE = {Towards the systematic reporting of the energy and carbon footprints of machine learning},
VOLUME = {21},
}
@INPROCEEDINGS{guo2016ms,
YEAR = {2016},
AUTHOR = {Guo, Yandong and Zhang, Lei and Hu, Yuxiao and He, Xiaodong and Gao, Jianfeng},
ORGANIZATION = {Springer},
BOOKTITLE = {European conference on computer vision},
PAGES = {87--102},
TITLE = {{MS-Celeb-1M}: A dataset and benchmark for large-scale face recognition},
}
@INPROCEEDINGS{sun,
YEAR = {2010},
AUTHOR = {Xiao, Jianxiong and Hays, James and Ehinger, Krista A. and Oliva, Aude and Torralba, Antonio},
BOOKTITLE = {2010 IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
DOI = {10.1109/CVPR.2010.5539970},
PAGES = {3485--3492},
TITLE = {{SUN} database: Large-scale scene recognition from abbey to zoo},
}
@ARTICLE{pascalvoc,
YEAR = {2010},
AUTHOR = {Everingham, Mark and {van Gool}, Luc and Williams, {Christopher K. I.} and Winn, John and Zisserman, Andrew},
LANGUAGE = {English},
PUBLISHER = {Springer Netherlands},
DOI = {10.1007/s11263-009-0275-4},
ISSN = {0920-5691},
JOURNALTITLE = {International Journal of Computer Vision},
KEYWORDS = {Benchmark,Database,Object detection,Object recognition},
NUMBER = {2},
PAGES = {303--338},
TITLE = {The PASCAL Visual Object Classes (VOC) Challenge},
VOLUME = {88},
}
@ARTICLE{WordNet,
YEAR = {2000},
AUTHOR = {Fellbaum, Christiane D.},
JOURNALTITLE = {Language},
PAGES = {706},
TITLE = {{WordNet}: An electronic lexical database},
VOLUME = {76},
}
@INPROCEEDINGS{Socher10connectingmodalities,
YEAR = {2010},
AUTHOR = {Socher, Richard and Fei-fei, Li},
BOOKTITLE = {IEEE Computer Society Conference on Computer Vision and Pattern Recognition},
TITLE = {Connecting modalities: Semi-supervised segmentation and annotation of images using unaligned text corpora},
}
@ARTICLE{5487377,
YEAR = {2010},
AUTHOR = {Yao, Benjamin Z. and Yang, Xiong and Lin, Liang and Lee, Mun Wai and Zhu, Song-Chun},
DOI = {10.1109/JPROC.2010.2050411},
JOURNALTITLE = {Proceedings of the IEEE},
NUMBER = {8},
PAGES = {1485--1508},
TITLE = {I2T: Image Parsing to Text Description},
VOLUME = {98},
}
@INPROCEEDINGS{vinyals,
YEAR = {2015},
AUTHOR = {Vinyals, Oriol and Toshev, Alexander and Bengio, Samy and Erhan, Dumitru},
DOI = {10.1109/CVPR.2015.7298935},
PAGES = {3156--3164},
TITLE = {Show and tell: A neural image caption generator},
}
@MISC{karpthy1,
YEAR = {2014},
AUTHOR = {Karpathy, Andrej and Fei-Fei, Li},
PUBLISHER = {arXiv},
URL = {https://arxiv.org/abs/1412.2306},
DOI = {10.48550/ARXIV.1412.2306},
KEYWORDS = {Computer Vision and Pattern Recognition (cs.CV),FOS: Computer and information sciences,FOS: Computer and information sciences},
TITLE = {Deep Visual-Semantic Alignments for Generating Image Descriptions},
}
@MISC{xu1,
YEAR = {2015},
AUTHOR = {Xu, Kelvin and Ba, Jimmy and Kiros, Ryan and Cho, Kyunghyun and Courville, Aaron and Salakhutdinov, Ruslan and Zemel, Richard and Bengio, Yoshua},
PUBLISHER = {arXiv},
URL = {https://arxiv.org/abs/1502.03044},
DOI = {10.48550/ARXIV.1502.03044},
KEYWORDS = {Machine Learning (cs.LG),Computer Vision and Pattern Recognition (cs.CV),FOS: Computer and information sciences,FOS: Computer and information sciences},
TITLE = {Show, Attend and Tell: Neural Image Caption Generation with Visual Attention},
}
@MISC{yao1,
YEAR = {2018},
AUTHOR = {Yao, Ting and Pan, Yingwei and Li, Yehao and Mei, Tao},
PUBLISHER = {arXiv},
URL = {https://arxiv.org/abs/1809.07041},
DOI = {10.48550/ARXIV.1809.07041},
KEYWORDS = {Computer Vision and Pattern Recognition (cs.CV),FOS: Computer and information sciences,FOS: Computer and information sciences},
TITLE = {Exploring Visual Relationship for Image Captioning},
}
@INPROCEEDINGS{devlin-etal-2019-bert,
YEAR = {2019},
AUTHOR = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
LOCATION = {Minneapolis, Minnesota},
PUBLISHER = {Association for Computational Linguistics},
URL = {https://aclanthology.org/N19-1423},
DOI = {10.18653/v1/N19-1423},
PAGES = {4171--4186},
TITLE = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
}
@INPROCEEDINGS{HerdadeKBS19,
YEAR = {2019},
AUTHOR = {Herdade, Simao and Kappeler, Armin and Boakye, Kofi and Soares, Joao},
URL = {http://papers.nips.cc/paper/9293-image-captioning-transforming-objects-into-words},
PAGES = {11135--11145},
TITLE = {Image Captioning: Transforming Objects into Words},
}
@INPROCEEDINGS{huang1,
YEAR = {2019},
AUTHOR = {Huang, Lun and Wang, Wenmin and Chen, Jie and Wei, Xiao-Yong},
DOI = {10.1109/ICCV.2019.00473},
PAGES = {4633--4642},
TITLE = {Attention on Attention for Image Captioning},
}
@INPROCEEDINGS{NIPS2017_3f5ee243,
YEAR = {2017},
AUTHOR = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, Łukasz and Polosukhin, Illia},
EDITOR = {Guyon, I. and von Luxburg, U. and Bengio, S. and Wallach, H. and Fergus, R. and Vishwanathan, S. and Garnett, R.},
PUBLISHER = {Curran Associates, Inc.},
URL = {https://proceedings.neurips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf},
BOOKTITLE = {Advances in Neural Information Processing Systems},
TITLE = {Attention is All you Need},
VOLUME = {30},
}
@INPROCEEDINGS{spice,
YEAR = {2016},
AUTHOR = {Anderson, Peter and Fernando, Basura and Johnson, Mark and Gould, Stephen},
EDITOR = {Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max},
LOCATION = {Cham},
PUBLISHER = {Springer International Publishing},
BOOKTITLE = {Computer Vision -- ECCV 2016},
ISBN = {978-3-319-46454-1},
PAGES = {382--398},
TITLE = {SPICE: Semantic Propositional Image Caption Evaluation},
}
@INPROCEEDINGS{meteor,
YEAR = {2005},
AUTHOR = {Banerjee, Satanjeev and Lavie, Alon},
LOCATION = {Ann Arbor, Michigan},
PUBLISHER = {Association for Computational Linguistics},
URL = {https://aclanthology.org/W05-0909},
BOOKTITLE = {Proceedings of the {ACL} Workshop on Intrinsic and Extrinsic Evaluation Measures for Machine Translation and/or Summarization},
PAGES = {65--72},
TITLE = {{METEOR}: An Automatic Metric for {MT} Evaluation with Improved Correlation with Human Judgments},
}
@INPROCEEDINGS{lin-2004-rouge,
YEAR = {2004},
AUTHOR = {Lin, Chin-Yew},
LOCATION = {Barcelona, Spain},
PUBLISHER = {Association for Computational Linguistics},
URL = {https://aclanthology.org/W04-1013},
BOOKTITLE = {Text Summarization Branches Out},
PAGES = {74--81},
TITLE = {{ROUGE}: A Package for Automatic Evaluation of Summaries},