diff --git a/docs/PRETRAINED.md b/docs/PRETRAINED.md index 2f899863e..670d71dcd 100644 --- a/docs/PRETRAINED.md +++ b/docs/PRETRAINED.md @@ -24,7 +24,7 @@ We replicate OpenAI's results on ViT-B/32, reaching a top-1 ImageNet-1k zero-sho -__Zero-shot comparison (courtesy of Andreas Fürst)__ +**Zero-shot comparison (courtesy of Andreas Fürst)** ViT-B/32 was trained with 128 A100 (40 GB) GPUs for ~36 hours, 4600 GPU-hours. The per-GPU batch size was 256 for a global batch size of 32768. 256 is much lower than it could have been (~320-384) due to being sized initially before moving to 'local' contrastive loss. @@ -44,9 +44,10 @@ ViT-B/16 was trained with 176 A100 (40 GB) GPUS for ~61 hours, 10700 GPU-hours. The B/16+ 240x240 LAION400M training reached a top-1 ImageNet-1k zero-shot validation score of 69.21. This model is the same depth as the B/16, but increases the - * vision width from 768 -> 896 - * text width from 512 -> 640 - * the resolution 224x224 -> 240x240 (196 -> 225 tokens) + +- vision width from 768 -> 896 +- text width from 512 -> 640 +- the resolution 224x224 -> 240x240 (196 -> 225 tokens) @@ -67,6 +68,7 @@ ViT-L/14 was trained with 400 A100 (40 GB) GPUS for ~127 hours, 50800 GPU-hours. A ~2B sample subset of LAION-5B with english captions (https://huggingface.co/datasets/laion/laion2B-en) #### ViT-B/32 224x224 + A ViT-B/32 trained on LAION-2B, reaching a top-1 ImageNet-1k zero-shot accuracy of 65.62%. @@ -91,7 +93,6 @@ A ViT-g/14 with a 76.6% top-1 ImageNet-1k zero-shot was trained on JUWELS Booste This model was trained with a shorter schedule than other LAION-2B models with 12B samples seen instead of 32+B. It matches LAION-400M training in samples seen. Many zero-shot results are lower as a result, but despite this it performs very well in some OOD zero-shot and retrieval tasks. - #### ViT-B/32 roberta base A ViT-B/32 with roberta base encoder with a 61.7% top-1 ImageNet-1k zero-shot was trained on stability. 
See model details here https://huggingface.co/laion/CLIP-ViT-B-32-roberta-base-laion2B-s12B-b32k @@ -113,22 +114,20 @@ See full english [metrics](https://huggingface.co/laion/CLIP-ViT-H-14-frozen-xlm On zero shot classification on imagenet with translated prompts this model reaches: -* 56% in italian (vs 21% for https://github.com/clip-italian/clip-italian) -* 53% in japanese (vs 54.6% for https://github.com/rinnakk/japanese-clip) -* 55.7% in chinese (to be compared with https://github.com/OFA-Sys/Chinese-CLIP) - +- 56% in italian (vs 21% for https://github.com/clip-italian/clip-italian) +- 53% in japanese (vs 54.6% for https://github.com/rinnakk/japanese-clip) +- 55.7% in chinese (to be compared with https://github.com/OFA-Sys/Chinese-CLIP) #### YFCC-15M Below are checkpoints of models trained on YFCC-15M, along with their zero-shot top-1 accuracies on ImageNet and ImageNetV2. These models were trained using 8 GPUs and the same hyperparameters described in the "Sample running code" section, with the exception of `lr=5e-4` and `epochs=32`. 
-* [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt) (32.7% / 27.9%) -* [ResNet-101](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt) (34.8% / 30.0%) +- [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-yfcc15m-455df137.pt) (32.7% / 27.9%) +- [ResNet-101](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn101-quickgelu-yfcc15m-3e04b30e.pt) (34.8% / 30.0%) #### CC12M - https://github.com/google-research-datasets/conceptual-12m -* [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt) (36.45%) - +- [ResNet-50](https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/rn50-quickgelu-cc12m-f000538c.pt) (36.45%) ### CommonPool and DataComp models @@ -138,14 +137,13 @@ The best performing models are specified below for the xlarge scale, see our pap Additional models and more information can be found at [/docs/datacomp_models.md](/docs/datacomp_models.md). +- `datacomp_xl_s13b_b90k`: A ViT-L/14 trained on DataComp-1B for 12.8B steps and batch size 90k. Achieves 79.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K. -* `datacomp_xl_s13b_b90k`: A ViT-L/14 trained on DataComp-1B for 12.8B steps and batch size 90k. Achieves 79.2% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K. - -* `commonpool_xl_clip_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using CLIP scores, for 12.8B steps and batch size 90k. Achieves 76.4% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K. 
+- `commonpool_xl_clip_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using CLIP scores, for 12.8B steps and batch size 90k. Achieves 76.4% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K. -* `commonpool_xl_laion_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using the LAION-2B filtering scheme, for 12.8B steps and batch size 90k. Achieves 75.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K. +- `commonpool_xl_laion_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL filtered using the LAION-2B filtering scheme, for 12.8B steps and batch size 90k. Achieves 75.5% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL.laion-s13B-b90K. -* `commonpool_xl_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL without any filtering, for 12.8B steps and batch size 90k. Achieves 72.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K. +- `commonpool_xl_s13b_b90k`: A ViT-L/14 trained on CommonPool-XL without any filtering, for 12.8B steps and batch size 90k. Achieves 72.3% zero-shot accuracy on ImageNet. Available at https://huggingface.co/laion/CLIP-ViT-L-14-CommonPool.XL-s13B-b90K. If you use models trained on DataComp-1B or CommonPool variations, please consider citing the following: @@ -158,15 +156,13 @@ If you use models trained on DataComp-1B or CommonPool variations, please consid } ``` - ### MetaCLIP MetaCLIP models are described in the paper [Demystifying CLIP Data](https://arxiv.org/abs/2309.16671). These models were developed by Hu Xu, Saining Xie, Xiaoqing Ellen Tan, Po-Yao Huang, Russell Howes, Vasu Sharma, Shang-Wen Li, Gargi Ghosh, Luke Zettlemoyer and Christoph Feichtenhofer from Meta, New York University and the University of Washington. Models are licensed under CC-BY-NC. 
-More details are available at https://github.com/facebookresearch/MetaCLIP. - +More details are available at https://github.com/facebookresearch/MetaCLIP. If you use MetaCLIP models, please cite the following: @@ -179,7 +175,6 @@ If you use MetaCLIP models, please cite the following: } ``` - ### EVA-CLIP EVA-CLIP models are described in the paper [EVA-CLIP: Improved Training Techniques for CLIP at Scale](https://arxiv.org/abs/2303.15389). @@ -188,7 +183,6 @@ These models were developed by Quan Sun, Yuxin Fang, Ledell Wu, Xinlong Wang and Models are licensed under the MIT License. More details are available at https://github.com/baaivision/EVA/tree/master/EVA-CLIP. - If you use EVA models, please cite the following: ```bibtex @@ -200,15 +194,21 @@ If you use EVA models, please cite the following: } ``` +### NLLB-CLIP + +NLLB-CLIP models are described in the paper [NLLB-CLIP - train performant multilingual image retrieval model on a budget](https://arxiv.org/abs/2309.01859) by Alexander Visheratin. + +The model was trained following the [LiT](https://arxiv.org/abs/2111.07991) methodology: the image tower was frozen, the text tower was initialized from the [NLLB](https://arxiv.org/abs/2207.04672) encoder and unfrozen. -### NLLB +The model was trained on the [LAION-COCO-NLLB](https://huggingface.co/datasets/visheratin/laion-coco-nllb) dataset. -NLLB models are described in the paper [NLLB-CLIP -- train performant multilingual image retrieval model on a budget -](https://arxiv.org/abs/2309.01859) by Alexander Visheratin. +The first version of the model (`nllb-clip`) described in the paper was trained using the OpenAI CLIP image encoder. + +The second version of the model (`nllb-clip-siglip`) was trained using the [SigLIP](https://arxiv.org/abs/2303.15343) image encoder. Models are licensed under CC-BY-NC. 
-If you use NLLB models, please cite the following: +If you use NLLB-CLIP models, please cite the following: ```bibtex @article{visheratin2023nllb, @@ -219,7 +219,6 @@ If you use NLLB models, please cite the following: } ``` - ### CLIPA CLIPA models are described in the following papers by Xianhang Li, Zeyu Wang, Cihang Xie from UC Santa Cruz: @@ -230,12 +229,11 @@ CLIPA models are described in the following papers by Xianhang Li, Zeyu Wang, Ci Models are licensed under Apache 2.0. More details are available at https://github.com/UCSC-VLAA/CLIPA and [here](clipa.md). - If you use CLIPA models, please cite the following: ```bibtex @inproceedings{li2023clipa, - title={An Inverse Scaling Law for CLIP Training}, + title={An Inverse Scaling Law for CLIP Training}, author={Xianhang Li and Zeyu Wang and Cihang Xie}, booktitle={NeurIPS}, year={2023}, @@ -244,7 +242,7 @@ If you use CLIPA models, please cite the following: ```bibtex @article{li2023clipav2, - title={CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy}, + title={CLIPA-v2: Scaling CLIP Training with 81.1% Zero-shot ImageNet Accuracy within a $10,000 Budget; An Extra $4,000 Unlocks 81.8% Accuracy}, author={Xianhang Li and Zeyu Wang and Cihang Xie}, journal={arXiv preprint arXiv:2306.15658}, year={2023}, @@ -259,7 +257,6 @@ These models were developed by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov Models are licensed under the Apache 2 license. More details are available at https://github.com/google-research/big_vision. 
- If you use SigLIP models, please cite the following: ```bibtex diff --git a/docs/model_profile.csv b/docs/model_profile.csv index b5376adfb..b561623f5 100644 --- a/docs/model_profile.csv +++ b/docs/model_profile.csv @@ -65,6 +65,7 @@ EVA02-L-14-336,336,768,768,768,428.08,304.43,123.65,395.16,381.86,13.3 ViT-L-14-336,336,1024,768,768,427.94,304.29,123.65,395.22,381.92,13.3 ViT-L-16-SigLIP-384,384,768,1024,1024,652.48,316.28,336.19,422.91,383.85,39.06 convnext_xxlarge,256,768,1024,1024,1200.58,846.54,354.03,443.03,395.94,47.09 +nllb-clip-base-siglip,384,768,512,768,507.47,93.18,414.3,472.91,112.13,360.78 mt5-xl-ViT-H-14,224,1280,512,1024,2306.75,632.08,1674.68,514.04,334.59,179.45 EVA01-g-14,224,768,768,1024,1136.44,1012.59,123.85,547.36,534.06,13.3 RN50x64,448,128,1024,1024,623.26,420.38,202.88,552.65,529.11,23.55 @@ -78,6 +79,7 @@ ViT-bigG-14-CLIPA,224,1664,1280,1280,2517.22,1844.9,672.32,1007.93,967.5,40.44 ViT-H-14-378-quickgelu,378,1280,1024,1024,986.71,632.68,354.03,1054.05,1006.96,47.09 ViT-bigG-14,224,1664,1280,1280,2539.57,1844.91,694.66,1065.36,967.5,97.86 nllb-clip-large,224,1280,512,1024,1399.22,632.08,767.14,1468.46,334.59,1133.87 +nllb-clip-large-siglip,384,768,512,1152,1195.5,428.23,767.27,1804.22,670.35,1133.87 ViT-e-14,224,1792,1280,1280,4581.09,3807.72,773.37,2091.45,1981.35,110.1 ViT-bigG-14-CLIPA-336,336,1664,1280,1280,2517.76,1845.44,672.32,2271.58,2231.15,40.44 EVA02-E-14,224,768,1024,1024,4704.59,4350.56,354.03,2311.42,2264.33,47.09 diff --git a/docs/openclip_classification_results.csv b/docs/openclip_classification_results.csv index b135810a1..25cdce1e5 100644 --- a/docs/openclip_classification_results.csv +++ b/docs/openclip_classification_results.csv @@ -84,6 +84,7 @@ ViT-B-32-quickgelu,laion400m_e31,151.28,14.78,0.5273,0.6294,0.9121,0.9060,0.7021 
ViT-B-32,openai,151.28,14.78,0.5265,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.0732,0.6061,0.1676,0.5386,0.8217 ViT-B-32-quickgelu,openai,151.28,14.78,0.5265,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.0732,0.6061,0.1676,0.5386,0.8217 RN50x4,openai,178.3,51.82,0.5191,0.6627,0.8661,0.7943,0.4514,0.2045,0.0905,0.2039,0.4862,0.3354,0.2102,0.8640,0.3622,0.4468,0.5944,0.4145,0.4955,0.7274,0.2335,0.4903,0.5141,0.6766,0.8829,0.6814,0.5675,0.6716,0.5338,0.6673,0.9658,0.6089,0.3190,0.0870,0.5435,0.1130,0.5654,0.8376 +nllb-clip-large-siglip,v1,1195.5,1804.22,0.5148,0.5175,0.8392,0.9651,0.7626,0.1737,0.2211,0.1549,0.4394,0.4941,0.0451,0.6312,0.4700,0.5050,0.4631,0.5611,0.1825,0.8325,0.4290,0.6203,0.6492,0.2846,0.4082,0.7823,0.5004,0.5601,0.5656,0.6451,0.9939,0.6355,0.4258,0.0950,0.5000,0.1415,0.6390,0.8855 ViT-B-32,laion400m_e31,151.28,14.78,0.5070,0.6022,0.8916,0.8825,0.6781,0.1549,0.2261,0.1356,0.5218,0.4694,0.1437,0.7814,0.4082,0.4648,0.5234,0.1957,0.5085,0.7079,0.1224,0.4108,0.4281,0.6319,0.8541,0.7312,0.5495,0.5162,0.5108,0.7436,0.9494,0.6508,0.2891,0.0745,0.4975,0.1076,0.5491,0.8328 ViT-B-32,laion400m_e32,151.28,14.78,0.5067,0.6024,0.8918,0.8840,0.6773,0.1536,0.2261,0.1349,0.5229,0.4754,0.1467,0.7817,0.4070,0.4646,0.5237,0.1953,0.5080,0.7084,0.1181,0.4000,0.4292,0.6323,0.8513,0.7328,0.5490,0.5206,0.5094,0.7454,0.9498,0.6509,0.2759,0.0741,0.5084,0.1068,0.5444,0.8326 RN101,openai,119.69,25.5,0.5036,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.0888,0.4723,0.1615,0.5631,0.8164 @@ 
-95,6 +96,7 @@ ViT-B-16,commonpool_l_image_s1b_b8k,149.62,41.09,0.4812,0.5719,0.8856,0.9321,0.6 ViT-B-16,commonpool_l_text_s1b_b8k,149.62,41.09,0.4758,0.5605,0.8720,0.9391,0.7054,0.1843,0.2373,0.0995,0.3941,0.3830,0.0451,0.7724,0.2317,0.4437,0.4835,0.2220,0.4770,0.6708,0.2686,0.2593,0.4911,0.5164,0.7049,0.7669,0.4857,0.4931,0.4663,0.6525,0.9523,0.6088,0.2122,0.0623,0.5697,0.0000,0.5643,0.8564 ViT-B-16,commonpool_l_basic_s1b_b8k,149.62,41.09,0.4566,0.5155,0.8444,0.8289,0.5251,0.2061,0.2277,0.1173,0.4133,0.3820,0.0481,0.7461,0.2021,0.3932,0.4325,0.1913,0.4600,0.6087,0.3333,0.2809,0.4493,0.4357,0.6956,0.7151,0.5899,0.5387,0.4313,0.7216,0.9373,0.5974,0.1173,0.0436,0.5712,0.0000,0.5421,0.8384 ViT-B-16,commonpool_l_s1b_b8k,149.62,41.09,0.4386,0.4593,0.8089,0.9133,0.6421,0.1594,0.2203,0.1177,0.3383,0.3348,0.0316,0.6735,0.2766,0.3448,0.3914,0.1592,0.4335,0.5265,0.2686,0.3603,0.4126,0.3681,0.5587,0.7093,0.5516,0.5118,0.4154,0.6060,0.9339,0.5713,0.3047,0.0399,0.5102,0.0000,0.5654,0.8305 +nllb-clip-base-siglip,v1,507.47,472.91,0.4377,0.3909,0.7507,0.9043,0.5939,0.1453,0.2254,0.0583,0.3617,0.3744,0.0090,0.4961,0.3429,0.3886,0.3439,0.3165,0.1695,0.6846,0.1927,0.5007,0.5001,0.1567,0.1868,0.7599,0.6692,0.5859,0.5049,0.4703,0.9818,0.5640,0.4033,0.0694,0.6500,0.0956,0.6320,0.8392 nllb-clip-large,v1,1399.22,1468.46,0.4163,0.3672,0.7234,0.9634,0.6797,0.2389,0.2254,0.0691,0.3447,0.5454,0.0216,0.4447,0.2462,0.3316,0.3233,0.2632,0.1725,0.5624,0.3727,0.2716,0.5268,0.0978,0.1283,0.7551,0.5417,0.5585,0.4983,0.3865,0.9811,0.5512,0.1725,0.0403,0.5181,0.1419,0.6752,0.8305 ViT-B-32,datacomp_m_s128m_b4k,151.28,14.78,0.3364,0.2972,0.7159,0.8252,0.5476,0.1365,0.2249,0.0453,0.2133,0.3393,0.0304,0.4168,0.1366,0.1930,0.2440,0.0493,0.4085,0.3402,0.2110,0.1147,0.1971,0.2965,0.4311,0.5459,0.5862,0.5316,0.2778,0.2803,0.8365,0.3637,0.1500,0.0142,0.6669,0.0000,0.4498,0.6559 
ViT-B-32,commonpool_m_clip_s128m_b4k,151.28,14.78,0.3344,0.2725,0.6678,0.8405,0.5549,0.1402,0.2238,0.0458,0.2176,0.2589,0.0215,0.3999,0.1586,0.1844,0.2247,0.0420,0.3925,0.3297,0.3235,0.1778,0.2093,0.2551,0.3828,0.6074,0.5210,0.5014,0.2641,0.4123,0.8370,0.3875,0.1931,0.0154,0.5369,0.0000,0.4451,0.6610 diff --git a/docs/openclip_multilingual_retrieval_results.csv b/docs/openclip_multilingual_retrieval_results.csv new file mode 100644 index 000000000..a0704239f --- /dev/null +++ b/docs/openclip_multilingual_retrieval_results.csv @@ -0,0 +1,4 @@ +model,version,avg,crossmodal3600 image_retrieval_recall@1 avg,crossmodal3600 image_retrieval_recall@1 ar,crossmodal3600 image_retrieval_recall@1 bn,crossmodal3600 image_retrieval_recall@1 cs,crossmodal3600 image_retrieval_recall@1 da,crossmodal3600 image_retrieval_recall@1 de,crossmodal3600 image_retrieval_recall@1 el,crossmodal3600 image_retrieval_recall@1 en,crossmodal3600 image_retrieval_recall@1 es,crossmodal3600 image_retrieval_recall@1 fa,crossmodal3600 image_retrieval_recall@1 fi,crossmodal3600 image_retrieval_recall@1 fil,crossmodal3600 image_retrieval_recall@1 fr,crossmodal3600 image_retrieval_recall@1 he,crossmodal3600 image_retrieval_recall@1 hi,crossmodal3600 image_retrieval_recall@1 hr,crossmodal3600 image_retrieval_recall@1 hu,crossmodal3600 image_retrieval_recall@1 id,crossmodal3600 image_retrieval_recall@1 it,crossmodal3600 image_retrieval_recall@1 ja,crossmodal3600 image_retrieval_recall@1 ko,crossmodal3600 image_retrieval_recall@1 mi,crossmodal3600 image_retrieval_recall@1 nl,crossmodal3600 image_retrieval_recall@1 no,crossmodal3600 image_retrieval_recall@1 pl,crossmodal3600 image_retrieval_recall@1 pt,crossmodal3600 image_retrieval_recall@1 quz,crossmodal3600 image_retrieval_recall@1 ro,crossmodal3600 image_retrieval_recall@1 ru,crossmodal3600 image_retrieval_recall@1 sv,crossmodal3600 image_retrieval_recall@1 sw,crossmodal3600 image_retrieval_recall@1 te,crossmodal3600 image_retrieval_recall@1 
th,crossmodal3600 image_retrieval_recall@1 tr,crossmodal3600 image_retrieval_recall@1 uk,crossmodal3600 image_retrieval_recall@1 vi,crossmodal3600 image_retrieval_recall@1 zh,crossmodal3600 image_retrieval_recall@10 avg,crossmodal3600 image_retrieval_recall@10 ar,crossmodal3600 image_retrieval_recall@10 bn,crossmodal3600 image_retrieval_recall@10 cs,crossmodal3600 image_retrieval_recall@10 da,crossmodal3600 image_retrieval_recall@10 de,crossmodal3600 image_retrieval_recall@10 el,crossmodal3600 image_retrieval_recall@10 en,crossmodal3600 image_retrieval_recall@10 es,crossmodal3600 image_retrieval_recall@10 fa,crossmodal3600 image_retrieval_recall@10 fi,crossmodal3600 image_retrieval_recall@10 fil,crossmodal3600 image_retrieval_recall@10 fr,crossmodal3600 image_retrieval_recall@10 he,crossmodal3600 image_retrieval_recall@10 hi,crossmodal3600 image_retrieval_recall@10 hr,crossmodal3600 image_retrieval_recall@10 hu,crossmodal3600 image_retrieval_recall@10 id,crossmodal3600 image_retrieval_recall@10 it,crossmodal3600 image_retrieval_recall@10 ja,crossmodal3600 image_retrieval_recall@10 ko,crossmodal3600 image_retrieval_recall@10 mi,crossmodal3600 image_retrieval_recall@10 nl,crossmodal3600 image_retrieval_recall@10 no,crossmodal3600 image_retrieval_recall@10 pl,crossmodal3600 image_retrieval_recall@10 pt,crossmodal3600 image_retrieval_recall@10 quz,crossmodal3600 image_retrieval_recall@10 ro,crossmodal3600 image_retrieval_recall@10 ru,crossmodal3600 image_retrieval_recall@10 sv,crossmodal3600 image_retrieval_recall@10 sw,crossmodal3600 image_retrieval_recall@10 te,crossmodal3600 image_retrieval_recall@10 th,crossmodal3600 image_retrieval_recall@10 tr,crossmodal3600 image_retrieval_recall@10 uk,crossmodal3600 image_retrieval_recall@10 vi,crossmodal3600 image_retrieval_recall@10 zh,crossmodal3600 image_retrieval_recall@5 avg,crossmodal3600 image_retrieval_recall@5 ar,crossmodal3600 image_retrieval_recall@5 bn,crossmodal3600 image_retrieval_recall@5 cs,crossmodal3600 
image_retrieval_recall@5 da,crossmodal3600 image_retrieval_recall@5 de,crossmodal3600 image_retrieval_recall@5 el,crossmodal3600 image_retrieval_recall@5 en,crossmodal3600 image_retrieval_recall@5 es,crossmodal3600 image_retrieval_recall@5 fa,crossmodal3600 image_retrieval_recall@5 fi,crossmodal3600 image_retrieval_recall@5 fil,crossmodal3600 image_retrieval_recall@5 fr,crossmodal3600 image_retrieval_recall@5 he,crossmodal3600 image_retrieval_recall@5 hi,crossmodal3600 image_retrieval_recall@5 hr,crossmodal3600 image_retrieval_recall@5 hu,crossmodal3600 image_retrieval_recall@5 id,crossmodal3600 image_retrieval_recall@5 it,crossmodal3600 image_retrieval_recall@5 ja,crossmodal3600 image_retrieval_recall@5 ko,crossmodal3600 image_retrieval_recall@5 mi,crossmodal3600 image_retrieval_recall@5 nl,crossmodal3600 image_retrieval_recall@5 no,crossmodal3600 image_retrieval_recall@5 pl,crossmodal3600 image_retrieval_recall@5 pt,crossmodal3600 image_retrieval_recall@5 quz,crossmodal3600 image_retrieval_recall@5 ro,crossmodal3600 image_retrieval_recall@5 ru,crossmodal3600 image_retrieval_recall@5 sv,crossmodal3600 image_retrieval_recall@5 sw,crossmodal3600 image_retrieval_recall@5 te,crossmodal3600 image_retrieval_recall@5 th,crossmodal3600 image_retrieval_recall@5 tr,crossmodal3600 image_retrieval_recall@5 uk,crossmodal3600 image_retrieval_recall@5 vi,crossmodal3600 image_retrieval_recall@5 zh,crossmodal3600 text_retrieval_recall@1 avg,crossmodal3600 text_retrieval_recall@1 ar,crossmodal3600 text_retrieval_recall@1 bn,crossmodal3600 text_retrieval_recall@1 cs,crossmodal3600 text_retrieval_recall@1 da,crossmodal3600 text_retrieval_recall@1 de,crossmodal3600 text_retrieval_recall@1 el,crossmodal3600 text_retrieval_recall@1 en,crossmodal3600 text_retrieval_recall@1 es,crossmodal3600 text_retrieval_recall@1 fa,crossmodal3600 text_retrieval_recall@1 fi,crossmodal3600 text_retrieval_recall@1 fil,crossmodal3600 text_retrieval_recall@1 fr,crossmodal3600 text_retrieval_recall@1 
he,crossmodal3600 text_retrieval_recall@1 hi,crossmodal3600 text_retrieval_recall@1 hr,crossmodal3600 text_retrieval_recall@1 hu,crossmodal3600 text_retrieval_recall@1 id,crossmodal3600 text_retrieval_recall@1 it,crossmodal3600 text_retrieval_recall@1 ja,crossmodal3600 text_retrieval_recall@1 ko,crossmodal3600 text_retrieval_recall@1 mi,crossmodal3600 text_retrieval_recall@1 nl,crossmodal3600 text_retrieval_recall@1 no,crossmodal3600 text_retrieval_recall@1 pl,crossmodal3600 text_retrieval_recall@1 pt,crossmodal3600 text_retrieval_recall@1 quz,crossmodal3600 text_retrieval_recall@1 ro,crossmodal3600 text_retrieval_recall@1 ru,crossmodal3600 text_retrieval_recall@1 sv,crossmodal3600 text_retrieval_recall@1 sw,crossmodal3600 text_retrieval_recall@1 te,crossmodal3600 text_retrieval_recall@1 th,crossmodal3600 text_retrieval_recall@1 tr,crossmodal3600 text_retrieval_recall@1 uk,crossmodal3600 text_retrieval_recall@1 vi,crossmodal3600 text_retrieval_recall@1 zh,crossmodal3600 text_retrieval_recall@10 avg,crossmodal3600 text_retrieval_recall@10 ar,crossmodal3600 text_retrieval_recall@10 bn,crossmodal3600 text_retrieval_recall@10 cs,crossmodal3600 text_retrieval_recall@10 da,crossmodal3600 text_retrieval_recall@10 de,crossmodal3600 text_retrieval_recall@10 el,crossmodal3600 text_retrieval_recall@10 en,crossmodal3600 text_retrieval_recall@10 es,crossmodal3600 text_retrieval_recall@10 fa,crossmodal3600 text_retrieval_recall@10 fi,crossmodal3600 text_retrieval_recall@10 fil,crossmodal3600 text_retrieval_recall@10 fr,crossmodal3600 text_retrieval_recall@10 he,crossmodal3600 text_retrieval_recall@10 hi,crossmodal3600 text_retrieval_recall@10 hr,crossmodal3600 text_retrieval_recall@10 hu,crossmodal3600 text_retrieval_recall@10 id,crossmodal3600 text_retrieval_recall@10 it,crossmodal3600 text_retrieval_recall@10 ja,crossmodal3600 text_retrieval_recall@10 ko,crossmodal3600 text_retrieval_recall@10 mi,crossmodal3600 text_retrieval_recall@10 nl,crossmodal3600 
text_retrieval_recall@10 no,crossmodal3600 text_retrieval_recall@10 pl,crossmodal3600 text_retrieval_recall@10 pt,crossmodal3600 text_retrieval_recall@10 quz,crossmodal3600 text_retrieval_recall@10 ro,crossmodal3600 text_retrieval_recall@10 ru,crossmodal3600 text_retrieval_recall@10 sv,crossmodal3600 text_retrieval_recall@10 sw,crossmodal3600 text_retrieval_recall@10 te,crossmodal3600 text_retrieval_recall@10 th,crossmodal3600 text_retrieval_recall@10 tr,crossmodal3600 text_retrieval_recall@10 uk,crossmodal3600 text_retrieval_recall@10 vi,crossmodal3600 text_retrieval_recall@10 zh,crossmodal3600 text_retrieval_recall@5 avg,crossmodal3600 text_retrieval_recall@5 ar,crossmodal3600 text_retrieval_recall@5 bn,crossmodal3600 text_retrieval_recall@5 cs,crossmodal3600 text_retrieval_recall@5 da,crossmodal3600 text_retrieval_recall@5 de,crossmodal3600 text_retrieval_recall@5 el,crossmodal3600 text_retrieval_recall@5 en,crossmodal3600 text_retrieval_recall@5 es,crossmodal3600 text_retrieval_recall@5 fa,crossmodal3600 text_retrieval_recall@5 fi,crossmodal3600 text_retrieval_recall@5 fil,crossmodal3600 text_retrieval_recall@5 fr,crossmodal3600 text_retrieval_recall@5 he,crossmodal3600 text_retrieval_recall@5 hi,crossmodal3600 text_retrieval_recall@5 hr,crossmodal3600 text_retrieval_recall@5 hu,crossmodal3600 text_retrieval_recall@5 id,crossmodal3600 text_retrieval_recall@5 it,crossmodal3600 text_retrieval_recall@5 ja,crossmodal3600 text_retrieval_recall@5 ko,crossmodal3600 text_retrieval_recall@5 mi,crossmodal3600 text_retrieval_recall@5 nl,crossmodal3600 text_retrieval_recall@5 no,crossmodal3600 text_retrieval_recall@5 pl,crossmodal3600 text_retrieval_recall@5 pt,crossmodal3600 text_retrieval_recall@5 quz,crossmodal3600 text_retrieval_recall@5 ro,crossmodal3600 text_retrieval_recall@5 ru,crossmodal3600 text_retrieval_recall@5 sv,crossmodal3600 text_retrieval_recall@5 sw,crossmodal3600 text_retrieval_recall@5 te,crossmodal3600 text_retrieval_recall@5 th,crossmodal3600 
text_retrieval_recall@5 tr,crossmodal3600 text_retrieval_recall@5 uk,crossmodal3600 text_retrieval_recall@5 vi,crossmodal3600 text_retrieval_recall@5 zh,multilingual_mscoco_captions image_retrieval_recall@1 avg,multilingual_mscoco_captions image_retrieval_recall@1 de,multilingual_mscoco_captions image_retrieval_recall@1 en,multilingual_mscoco_captions image_retrieval_recall@1 es,multilingual_mscoco_captions image_retrieval_recall@1 fr,multilingual_mscoco_captions image_retrieval_recall@1 it,multilingual_mscoco_captions image_retrieval_recall@1 jp,multilingual_mscoco_captions image_retrieval_recall@1 ko,multilingual_mscoco_captions image_retrieval_recall@1 pl,multilingual_mscoco_captions image_retrieval_recall@1 ru,multilingual_mscoco_captions image_retrieval_recall@1 tr,multilingual_mscoco_captions image_retrieval_recall@1 zh,multilingual_mscoco_captions image_retrieval_recall@10 avg,multilingual_mscoco_captions image_retrieval_recall@10 de,multilingual_mscoco_captions image_retrieval_recall@10 en,multilingual_mscoco_captions image_retrieval_recall@10 es,multilingual_mscoco_captions image_retrieval_recall@10 fr,multilingual_mscoco_captions image_retrieval_recall@10 it,multilingual_mscoco_captions image_retrieval_recall@10 jp,multilingual_mscoco_captions image_retrieval_recall@10 ko,multilingual_mscoco_captions image_retrieval_recall@10 pl,multilingual_mscoco_captions image_retrieval_recall@10 ru,multilingual_mscoco_captions image_retrieval_recall@10 tr,multilingual_mscoco_captions image_retrieval_recall@10 zh,multilingual_mscoco_captions image_retrieval_recall@5 avg,multilingual_mscoco_captions image_retrieval_recall@5 de,multilingual_mscoco_captions image_retrieval_recall@5 en,multilingual_mscoco_captions image_retrieval_recall@5 es,multilingual_mscoco_captions image_retrieval_recall@5 fr,multilingual_mscoco_captions image_retrieval_recall@5 it,multilingual_mscoco_captions image_retrieval_recall@5 jp,multilingual_mscoco_captions image_retrieval_recall@5 
ko,multilingual_mscoco_captions image_retrieval_recall@5 pl,multilingual_mscoco_captions image_retrieval_recall@5 ru,multilingual_mscoco_captions image_retrieval_recall@5 tr,multilingual_mscoco_captions image_retrieval_recall@5 zh,multilingual_mscoco_captions text_retrieval_recall@1 avg,multilingual_mscoco_captions text_retrieval_recall@1 de,multilingual_mscoco_captions text_retrieval_recall@1 en,multilingual_mscoco_captions text_retrieval_recall@1 es,multilingual_mscoco_captions text_retrieval_recall@1 fr,multilingual_mscoco_captions text_retrieval_recall@1 it,multilingual_mscoco_captions text_retrieval_recall@1 jp,multilingual_mscoco_captions text_retrieval_recall@1 ko,multilingual_mscoco_captions text_retrieval_recall@1 pl,multilingual_mscoco_captions text_retrieval_recall@1 ru,multilingual_mscoco_captions text_retrieval_recall@1 tr,multilingual_mscoco_captions text_retrieval_recall@1 zh,multilingual_mscoco_captions text_retrieval_recall@10 avg,multilingual_mscoco_captions text_retrieval_recall@10 de,multilingual_mscoco_captions text_retrieval_recall@10 en,multilingual_mscoco_captions text_retrieval_recall@10 es,multilingual_mscoco_captions text_retrieval_recall@10 fr,multilingual_mscoco_captions text_retrieval_recall@10 it,multilingual_mscoco_captions text_retrieval_recall@10 jp,multilingual_mscoco_captions text_retrieval_recall@10 ko,multilingual_mscoco_captions text_retrieval_recall@10 pl,multilingual_mscoco_captions text_retrieval_recall@10 ru,multilingual_mscoco_captions text_retrieval_recall@10 tr,multilingual_mscoco_captions text_retrieval_recall@10 zh,multilingual_mscoco_captions text_retrieval_recall@5 avg,multilingual_mscoco_captions text_retrieval_recall@5 de,multilingual_mscoco_captions text_retrieval_recall@5 en,multilingual_mscoco_captions text_retrieval_recall@5 es,multilingual_mscoco_captions text_retrieval_recall@5 fr,multilingual_mscoco_captions text_retrieval_recall@5 it,multilingual_mscoco_captions text_retrieval_recall@5 
jp,multilingual_mscoco_captions text_retrieval_recall@5 ko,multilingual_mscoco_captions text_retrieval_recall@5 pl,multilingual_mscoco_captions text_retrieval_recall@5 ru,multilingual_mscoco_captions text_retrieval_recall@5 tr,multilingual_mscoco_captions text_retrieval_recall@5 zh +xlm-roberta-large-ViT-H-14,frozen_laion5b_s13b_b90k,0.7063,0.5022,0.4606,0.0542,0.4608,0.6678,0.7331,0.5225,0.5133,0.6217,0.5639,0.5711,0.0997,0.6853,0.6403,0.215,0.6542,0.6494,0.6431,0.6642,0.7261,0.5153,0.0044,0.5608,0.5564,0.6308,0.615,0.0361,0.7008,0.7028,0.5394,0.0267,0.0064,0.5433,0.5881,0.6703,0.6475,0.5906,0.7599,0.8122,0.1856,0.8228,0.9358,0.9594,0.8442,0.8514,0.9203,0.8783,0.88,0.2417,0.9444,0.9181,0.5225,0.9275,0.9272,0.9192,0.9361,0.96,0.8511,0.0183,0.8875,0.8883,0.9094,0.9103,0.1106,0.9494,0.9511,0.8592,0.0683,0.0283,0.8769,0.8997,0.9378,0.9303,0.8933,0.7049,0.7225,0.1297,0.7406,0.8953,0.9242,0.7728,0.7783,0.8631,0.8047,0.8142,0.1958,0.9044,0.8642,0.4183,0.8758,0.8758,0.8658,0.8892,0.9225,0.7681,0.0136,0.8208,0.8186,0.8533,0.8483,0.0844,0.9117,0.9131,0.7867,0.0567,0.0175,0.8011,0.8292,0.8881,0.8781,0.8317,0.5338,0.4869,0.1206,0.4989,0.7314,0.7694,0.5608,0.5267,0.6197,0.6028,0.6203,0.1344,0.7103,0.6822,0.2778,0.7061,0.6664,0.6756,0.6936,0.7556,0.5608,0.0097,0.5533,0.6272,0.645,0.6153,0.0539,0.7433,0.6992,0.5861,0.0442,0.0144,0.5869,0.5992,0.7078,0.6989,0.6322,0.7891,0.8478,0.3214,0.87,0.9611,0.975,0.8808,0.8697,0.9297,0.9056,0.9175,0.2861,0.9617,0.9372,0.6089,0.9489,0.945,0.9431,0.9542,0.9719,0.8875,0.0261,0.8894,0.9219,0.9303,0.92,0.1517,0.9714,0.9564,0.8947,0.0953,0.0625,0.9094,0.9189,0.9594,0.9572,0.9206,0.7372,0.7622,0.2508,0.7811,0.9233,0.9483,0.8117,0.7889,0.87,0.8417,0.8544,0.2436,0.9172,0.8878,0.5169,0.9139,0.8872,0.8947,0.9114,0.9444,0.8136,0.0211,0.8158,0.8692,0.8778,0.8631,0.1214,0.9394,0.915,0.8347,0.0767,0.0411,0.8456,0.8553,0.9161,0.9172,0.8667,0.6456,0.648,0.735,0.659,0.646,0.649,0.643,0.565,0.687,0.624,0.627,0.619,0.9325,0.935,0.951,0.947,0.928,0.948,0.923,0.9
03,0.948,0.916,0.924,0.934,0.8714,0.874,0.908,0.891,0.867,0.879,0.862,0.832,0.895,0.854,0.866,0.857,0.6447,0.656,0.715,0.682,0.645,0.664,0.623,0.528,0.68,0.627,0.635,0.637,0.9483,0.952,0.973,0.96,0.945,0.96,0.94,0.903,0.964,0.939,0.948,0.947,0.8836,0.896,0.924,0.899,0.885,0.897,0.871,0.813,0.913,0.86,0.878,0.884 +nllb-clip-base-siglip,v1,0.7103,0.5068,0.5019,0.4842,0.4314,0.6344,0.6447,0.4506,0.4486,0.5383,0.5058,0.5625,0.3708,0.6106,0.6147,0.3258,0.6247,0.5833,0.5978,0.5772,0.5953,0.5031,0.2183,0.5194,0.53,0.5397,0.5136,0.1642,0.6206,0.59,0.5003,0.4028,0.3422,0.5283,0.5483,0.5658,0.5669,0.4883,0.8352,0.8553,0.8264,0.8128,0.9217,0.9233,0.8108,0.8075,0.8728,0.8478,0.8839,0.7339,0.9142,0.9033,0.6858,0.915,0.8892,0.9047,0.9003,0.8925,0.8394,0.5389,0.8683,0.8672,0.8531,0.87,0.4286,0.9178,0.9025,0.8417,0.7706,0.7286,0.8583,0.8839,0.8819,0.8919,0.8242,0.7598,0.7694,0.7422,0.7183,0.8664,0.8725,0.7194,0.7222,0.7997,0.7669,0.8139,0.6478,0.8558,0.8392,0.5808,0.8628,0.8278,0.8417,0.8267,0.8314,0.7544,0.435,0.7883,0.7994,0.7861,0.795,0.3464,0.8617,0.8369,0.7617,0.6714,0.6208,0.7883,0.81,0.8192,0.8256,0.7469,0.467,0.4492,0.4606,0.3964,0.5958,0.6,0.4242,0.4156,0.4931,0.4581,0.5106,0.3531,0.5706,0.5478,0.3372,0.5736,0.5256,0.5533,0.5183,0.5478,0.4739,0.1956,0.4756,0.4944,0.4831,0.4833,0.1625,0.5564,0.5356,0.4583,0.3806,0.3206,0.4842,0.5031,0.5206,0.5211,0.4314,0.8172,0.8281,0.8317,0.7936,0.9117,0.9106,0.7808,0.7864,0.855,0.8194,0.8597,0.7139,0.8997,0.8864,0.7061,0.9056,0.8778,0.8897,0.8797,0.8875,0.8272,0.5081,0.8361,0.86,0.8336,0.8397,0.4128,0.9036,0.8803,0.8194,0.7447,0.6911,0.8472,0.8589,0.8592,0.8808,0.7931,0.7317,0.7369,0.7361,0.6833,0.8475,0.8444,0.6886,0.6908,0.7697,0.7289,0.7875,0.6117,0.8364,0.8114,0.5986,0.8447,0.8014,0.8192,0.7861,0.8117,0.7414,0.4056,0.7433,0.7703,0.7542,0.7492,0.3322,0.8361,0.8064,0.7222,0.6406,0.5753,0.7625,0.7767,0.7867,0.8003,0.7019,0.6258,0.627,0.686,0.652,0.648,0.643,0.546,0.599,0.654,0.607,0.638,0.584,0.9275,0.935,0.952,0.939,0.935,0.944,0.878,0
.922,0.944,0.886,0.943,0.924,0.8574,0.87,0.887,0.867,0.87,0.878,0.808,0.836,0.876,0.824,0.868,0.847,0.5701,0.588,0.624,0.574,0.582,0.591,0.494,0.547,0.58,0.553,0.59,0.548,0.9135,0.916,0.939,0.935,0.921,0.936,0.845,0.899,0.927,0.883,0.931,0.917,0.8383,0.843,0.875,0.851,0.844,0.856,0.765,0.826,0.866,0.812,0.855,0.828 +nllb-clip-large-siglip,v1,0.7391,0.5457,0.5203,0.4908,0.4636,0.6558,0.6872,0.5133,0.4586,0.5789,0.5356,0.6192,0.4308,0.6483,0.6569,0.3514,0.6636,0.6461,0.6375,0.6183,0.6258,0.5442,0.2606,0.5644,0.5769,0.5958,0.5767,0.1925,0.6883,0.6275,0.5442,0.4322,0.3731,0.5569,0.5769,0.6156,0.5964,0.5206,0.8605,0.8647,0.8256,0.8439,0.9303,0.94,0.8556,0.82,0.8964,0.8775,0.9172,0.7881,0.9331,0.9311,0.7033,0.9378,0.9275,0.9186,0.9158,0.9144,0.8631,0.5908,0.8936,0.8967,0.8894,0.8972,0.4731,0.9442,0.9256,0.8719,0.7939,0.745,0.8808,0.9011,0.9133,0.9139,0.8444,0.7912,0.7892,0.7511,0.7572,0.8714,0.8956,0.7789,0.7311,0.8264,0.8042,0.8644,0.6981,0.8789,0.8786,0.6028,0.8867,0.8789,0.8631,0.8594,0.8606,0.7872,0.4897,0.8169,0.8261,0.8275,0.8272,0.3906,0.9003,0.8644,0.7933,0.7094,0.6542,0.8111,0.8339,0.8544,0.8556,0.7656,0.5053,0.4789,0.4878,0.4311,0.6167,0.6364,0.4767,0.4353,0.5383,0.4928,0.5706,0.3833,0.6153,0.6064,0.36,0.6203,0.5869,0.5975,0.5644,0.5753,0.4919,0.2325,0.5161,0.5425,0.5411,0.5236,0.1853,0.6281,0.5872,0.4906,0.3906,0.3364,0.5142,0.5467,0.5697,0.5506,0.4708,0.8456,0.8392,0.8303,0.825,0.9236,0.9275,0.8222,0.8119,0.8825,0.8528,0.9114,0.7567,0.9281,0.9161,0.7125,0.9267,0.9142,0.9114,0.9014,0.8975,0.8511,0.5689,0.8711,0.8858,0.8739,0.875,0.4494,0.9414,0.9083,0.8494,0.7764,0.7203,0.8733,0.8794,0.9017,0.9028,0.8236,0.7691,0.7583,0.7558,0.7297,0.8658,0.875,0.7478,0.7078,0.8119,0.7639,0.8425,0.6594,0.8664,0.8567,0.6189,0.8694,0.8525,0.8433,0.835,0.8403,0.7675,0.4608,0.7928,0.8208,0.7958,0.7933,0.3633,0.8883,0.8464,0.7658,0.6772,0.6128,0.7956,0.8039,0.8322,0.8314,0.7383,0.6502,0.669,0.7,0.665,0.66,0.674,0.571,0.633,0.679,0.606,0.672,0.623,0.9395,0.942,0.965,0.946,0.94,0.956,
0.884,0.936,0.953,0.915,0.952,0.945,0.8698,0.882,0.894,0.882,0.876,0.893,0.809,0.852,0.894,0.835,0.883,0.868,0.5847,0.589,0.628,0.605,0.604,0.6,0.506,0.537,0.615,0.569,0.607,0.572,0.9267,0.931,0.958,0.939,0.921,0.952,0.867,0.919,0.945,0.892,0.938,0.932,0.8472,0.843,0.883,0.858,0.863,0.867,0.782,0.832,0.869,0.809,0.871,0.842 diff --git a/docs/openclip_results.csv b/docs/openclip_results.csv index 29626609f..2a32aefd5 100644 --- a/docs/openclip_results.csv +++ b/docs/openclip_results.csv @@ -84,6 +84,7 @@ ViT-B-32-quickgelu,laion400m_e31,151.28,14.78,0.5263,0.6294,0.9121,0.9060,0.7021 ViT-B-32,openai,151.28,14.78,0.5245,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.6884,0.4028,0.4125,0.0732,0.6061,0.1676,0.5386,0.8217 ViT-B-32-quickgelu,openai,151.28,14.78,0.5245,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.1720,0.4436,0.5044,0.1953,0.8400,0.3258,0.4229,0.5592,0.3155,0.4775,0.6933,0.2743,0.4839,0.4431,0.6670,0.8700,0.7640,0.6224,0.5865,0.5362,0.5963,0.9713,0.6248,0.3159,0.6884,0.4028,0.4125,0.0732,0.6061,0.1676,0.5386,0.8217 RN50x4,openai,178.3,51.82,0.5188,0.6627,0.8661,0.7943,0.4514,0.2045,0.0905,0.2039,0.4862,0.3354,0.2102,0.8640,0.3622,0.4468,0.5944,0.4145,0.4955,0.7274,0.2335,0.4903,0.5141,0.6766,0.8829,0.6814,0.5675,0.6716,0.5338,0.6673,0.9658,0.6089,0.3190,0.7234,0.4318,0.3912,0.0870,0.5435,0.1130,0.5654,0.8376 +nllb-clip-large-siglip,v1,1195.5,1804.22,0.5184,0.5175,0.8392,0.9651,0.7626,0.1737,0.2211,0.1549,0.4394,0.4941,0.0451,0.6312,0.4700,0.5050,0.4631,0.5611,0.1825,0.8325,0.4290,0.6203,0.6492,0.2846,0.4082,0.7823,0.5004,0.5601,0.5656,0.6451,0.9939,0.6355,0.4258,0.7803,0.4949,0.4035,0.0950,0.5000,0.1415,0.6390,0.8855 
ViT-B-32,laion400m_e31,151.28,14.78,0.5077,0.6022,0.8916,0.8825,0.6781,0.1549,0.2261,0.1356,0.5218,0.4694,0.1437,0.7814,0.4082,0.4648,0.5234,0.1957,0.5085,0.7079,0.1224,0.4108,0.4281,0.6319,0.8541,0.7312,0.5495,0.5162,0.5108,0.7436,0.9494,0.6508,0.2891,0.6890,0.4327,0.4262,0.0745,0.4975,0.1076,0.5491,0.8328 ViT-B-32,laion400m_e32,151.28,14.78,0.5074,0.6024,0.8918,0.8840,0.6773,0.1536,0.2261,0.1349,0.5229,0.4754,0.1467,0.7817,0.4070,0.4646,0.5237,0.1953,0.5080,0.7084,0.1181,0.4000,0.4292,0.6323,0.8513,0.7328,0.5490,0.5206,0.5094,0.7454,0.9498,0.6509,0.2759,0.6866,0.4337,0.4265,0.0741,0.5084,0.1068,0.5444,0.8326 RN101,openai,119.69,25.5,0.5033,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.6852,0.4025,0.4130,0.0888,0.4723,0.1615,0.5631,0.8164 @@ -94,6 +95,7 @@ RN50,openai,102.01,18.18,0.4810,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542 RN50-quickgelu,openai,102.01,18.18,0.4810,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.6868,0.3869,0.3622,0.0623,0.5624,0.0000,0.5222,0.8129 ViT-B-16,commonpool_l_text_s1b_b8k,149.62,41.09,0.4760,0.5605,0.8720,0.9391,0.7054,0.1843,0.2373,0.0995,0.3941,0.3830,0.0451,0.7724,0.2317,0.4437,0.4835,0.2220,0.4770,0.6708,0.2686,0.2593,0.4911,0.5164,0.7049,0.7669,0.4857,0.4931,0.4663,0.6525,0.9523,0.6088,0.2122,0.6078,0.3730,0.4570,0.0623,0.5697,0.0000,0.5643,0.8564 ViT-B-16,commonpool_l_basic_s1b_b8k,149.62,41.09,0.4585,0.5155,0.8444,0.8289,0.5251,0.2061,0.2277,0.1173,0.4133,0.3820,0.0481,0.7461,0.2021,0.3932,0.4325,0.1913,0.4600,0.6087,0.3333,0.2809,0.4493,0.4357,0.6956,0.7151,0.5899,0.5387,0.4313,0.7216,0.9373,0.5974,0.1173,0.6015,0.3583,0.4812,0.0436,0.5712,0.0000,0.5421,0.8384 
+nllb-clip-base-siglip,v1,507.47,472.91,0.4451,0.3909,0.7507,0.9043,0.5939,0.1453,0.2254,0.0583,0.3617,0.3744,0.0090,0.4961,0.3429,0.3886,0.3439,0.3165,0.1695,0.6846,0.1927,0.5007,0.5001,0.1567,0.1868,0.7599,0.6692,0.5859,0.5049,0.4703,0.9818,0.5640,0.4033,0.7421,0.4716,0.3815,0.0694,0.6500,0.0956,0.6320,0.8392 ViT-B-16,commonpool_l_s1b_b8k,149.62,41.09,0.4370,0.4593,0.8089,0.9133,0.6421,0.1594,0.2203,0.1177,0.3383,0.3348,0.0316,0.6735,0.2766,0.3448,0.3914,0.1592,0.4335,0.5265,0.2686,0.3603,0.4126,0.3681,0.5587,0.7093,0.5516,0.5118,0.4154,0.6060,0.9339,0.5713,0.3047,0.4948,0.2855,0.4777,0.0399,0.5102,0.0000,0.5654,0.8305 nllb-clip-large,v1,1399.22,1468.46,0.4227,0.3672,0.7234,0.9634,0.6797,0.2389,0.2254,0.0691,0.3447,0.5454,0.0216,0.4447,0.2462,0.3316,0.3233,0.2632,0.1725,0.5624,0.3727,0.2716,0.5268,0.0978,0.1283,0.7551,0.5417,0.5585,0.4983,0.3865,0.9811,0.5512,0.1725,0.6625,0.4004,0.4299,0.0403,0.5181,0.1419,0.6752,0.8305 nllb-clip-base,v1,501.89,369.6,0.3351,0.2432,0.5914,0.8435,0.4839,0.1531,0.2254,0.0312,0.2782,0.4104,0.0185,0.2962,0.1852,0.1838,0.2029,0.0921,0.2195,0.3656,0.3741,0.1821,0.2874,0.0850,0.0784,0.6802,0.5509,0.5420,0.3603,0.1921,0.9514,0.4708,0.1441,0.5200,0.3081,0.3904,0.0463,0.4873,0.0000,0.5456,0.7136 diff --git a/docs/openclip_retrieval_results.csv b/docs/openclip_retrieval_results.csv index 7b1f71832..65055f2e6 100644 --- a/docs/openclip_retrieval_results.csv +++ b/docs/openclip_retrieval_results.csv @@ -60,6 +60,7 @@ ViT-L-14,commonpool_xl_s13b_b90k,427.62,175.33,0.6817,0.6478,0.8656,0.9210,0.820 ViT-B-32,laion2b_e16,151.28,14.78,0.6777,0.6638,0.8830,0.9322,0.8440,0.9650,0.9840,0.3913,0.6467,0.7481,0.5624,0.7956,0.8708,0.5343,0.4800,0.4602,0.4404,0.5669,0.5577,0.5490 xlm-roberta-base-ViT-B-32,laion5b_s13b_b90k,366.12,105.87,0.6759,0.6448,0.8628,0.9168,0.8270,0.9640,0.9780,0.3778,0.6344,0.7348,0.5354,0.7798,0.8640,0.5519,0.4912,0.4827,0.4742,0.5834,0.5737,0.5645 
ViT-B-32,laion2b_s34b_b79k,151.28,14.78,0.6750,0.6678,0.8838,0.9310,0.8410,0.9620,0.9830,0.3934,0.6543,0.7561,0.5632,0.7984,0.8712,0.5254,0.4603,0.4479,0.4356,0.5603,0.5498,0.5398 +nllb-clip-large-siglip,v1,1195.5,1804.22,0.6738,0.7276,0.9224,0.9560,0.8330,0.9710,0.9910,0.4513,0.7084,0.8016,0.5386,0.7920,0.8712,0.4871,0.4138,0.4035,0.3933,0.5230,0.5135,0.5045 ViT-L-14,laion400m_e32,427.62,175.33,0.6734,0.7022,0.9094,0.9458,0.8760,0.9780,0.9950,0.4300,0.6803,0.7740,0.5974,0.8218,0.8938,0.4815,0.4006,0.3932,0.3858,0.5245,0.5094,0.4950 ViT-L-14,laion400m_e31,427.62,175.33,0.6728,0.7050,0.9068,0.9464,0.8720,0.9760,0.9950,0.4284,0.6797,0.7731,0.5974,0.8216,0.8934,0.4805,0.4022,0.3949,0.3877,0.5262,0.5075,0.4897 ViT-B-32-256,datacomp_s34b_b86k,151.29,17.46,0.6712,0.6492,0.8722,0.9216,0.8480,0.9670,0.9850,0.3993,0.6543,0.7524,0.5792,0.8056,0.8810,0.5165,0.4554,0.4300,0.4048,0.5428,0.5437,0.5446 @@ -74,6 +75,7 @@ ViT-B-16,laion400m_e31,149.62,41.09,0.6621,0.6572,0.8808,0.9314,0.8330,0.9660,0. ViT-B-32,datacomp_xl_s13b_b90k,151.28,14.78,0.6594,0.6108,0.8492,0.9092,0.7900,0.9390,0.9620,0.3714,0.6235,0.7268,0.5354,0.7778,0.8604,0.5317,0.4772,0.4594,0.4418,0.5589,0.5544,0.5502 ViT-L-14,openai,427.62,175.33,0.6588,0.6496,0.8724,0.9204,0.8520,0.9740,0.9920,0.3651,0.6106,0.7113,0.5634,0.7938,0.8660,0.5047,0.4307,0.4136,0.3966,0.5457,0.5334,0.5218 RN50x64,openai,623.26,552.65,0.6563,0.6898,0.8990,0.9432,0.8690,0.9820,0.9920,0.3524,0.5992,0.7033,0.5842,0.8046,0.8786,0.4779,0.4149,0.3936,0.3725,0.5280,0.5044,0.4820 +nllb-clip-base-siglip,v1,507.47,472.91,0.6545,0.6922,0.9002,0.9424,0.7920,0.9500,0.9820,0.4315,0.6913,0.7852,0.5116,0.7776,0.8628,0.4715,0.3776,0.3815,0.3854,0.5113,0.4999,0.4890 convnext_base,laion400m_s13b_b51k,151.52,36.67,0.6541,0.6496,0.8814,0.9304,0.8380,0.9710,0.9910,0.3760,0.6315,0.7337,0.5470,0.7990,0.8676,0.4811,0.4146,0.4045,0.3944,0.5145,0.5052,0.4964 
RN50x16,openai,290.98,162.69,0.6518,0.6534,0.8710,0.9178,0.8570,0.9700,0.9880,0.3541,0.6002,0.7014,0.5536,0.7876,0.8670,0.4957,0.4311,0.3946,0.3584,0.5419,0.5275,0.5138 ViT-B-16,openai,149.62,41.09,0.6507,0.6216,0.8572,0.9192,0.8220,0.9660,0.9900,0.3309,0.5842,0.6899,0.5242,0.7670,0.8462,0.5171,0.4487,0.4316,0.4146,0.5550,0.5441,0.5337 diff --git a/src/open_clip/model_configs/nllb-clip-base-siglip.json b/src/open_clip/model_configs/nllb-clip-base-siglip.json new file mode 100644 index 000000000..f7152d0bb --- /dev/null +++ b/src/open_clip/model_configs/nllb-clip-base-siglip.json @@ -0,0 +1,18 @@ +{ + "embed_dim": 768, + "custom_text": true, + "init_logit_bias": -10, + "vision_cfg": { + "image_size": 384, + "timm_model_name": "vit_base_patch16_siglip_384", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "hf_model_name": "facebook/nllb-200-distilled-600M", + "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", + "hf_proj_type": "linear", + "hf_pooler_type": "cls_pooler" + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/nllb-clip-large-siglip.json b/src/open_clip/model_configs/nllb-clip-large-siglip.json new file mode 100644 index 000000000..0ac348576 --- /dev/null +++ b/src/open_clip/model_configs/nllb-clip-large-siglip.json @@ -0,0 +1,18 @@ +{ + "embed_dim": 1152, + "custom_text": true, + "init_logit_bias": -10, + "vision_cfg": { + "image_size": 384, + "timm_model_name": "vit_so400m_patch14_siglip_384", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "hf_model_name": "facebook/nllb-200-distilled-1.3B", + "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", + "hf_proj_type": "linear", + "hf_pooler_type": "cls_pooler" + } +} \ No newline at end of file diff --git a/src/open_clip/pretrained.py b/src/open_clip/pretrained.py index 8cc8d41b4..e7cd74fe1 100644 --- a/src/open_clip/pretrained.py +++ b/src/open_clip/pretrained.py @@ -7,8 
+7,14 @@ from tqdm import tqdm -from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD, INCEPTION_MEAN, INCEPTION_STD, \ - IMAGENET_MEAN, IMAGENET_STD +from .constants import ( + IMAGENET_MEAN, + IMAGENET_STD, + INCEPTION_MEAN, + INCEPTION_STD, + OPENAI_DATASET_MEAN, + OPENAI_DATASET_STD, +) from .version import __version__ try: @@ -423,6 +429,13 @@ def _apcfg(url='', hf_hub='', **kwargs): ), "nllb-clip-large": dict( v1=_pcfg(hf_hub='visheratin/nllb-clip-large-oc/'), + ), + + "nllb-clip-base-siglip": dict( + v1=_slpcfg(hf_hub='visheratin/nllb-clip-base-siglip/'), + ), + "nllb-clip-large-siglip": dict( + v1=_slpcfg(hf_hub='visheratin/nllb-clip-large-siglip/'), ) } diff --git a/src/open_clip/tokenizer.py b/src/open_clip/tokenizer.py index 985c0e030..e24a3def5 100644 --- a/src/open_clip/tokenizer.py +++ b/src/open_clip/tokenizer.py @@ -8,7 +8,7 @@ import random import string from functools import lru_cache, partial -from typing import Callable, Optional, List, Union +from typing import Callable, List, Optional, Union import ftfy import numpy as np @@ -422,7 +422,7 @@ def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = assert context_length, 'Please set a valid context length in class init or call.' texts = [self.clean_fn(text) for text in texts] - input_ids = self.tokenizer( + input_ids = self.tokenizer.batch_encode_plus( texts, return_tensors='pt', max_length=context_length, @@ -459,8 +459,9 @@ def __init__( if tokenizer_name in self.VOCAB_FILES: # FIXME temporary hack? - import fsspec import tempfile + + import fsspec vocab_file = self.VOCAB_FILES[tokenizer_name] with tempfile.NamedTemporaryFile('wb') as dst: with fsspec.open(vocab_file, 'rb') as src: