From a5f3ae971ee62e6f18e7a62b54f8c35946752b4c Mon Sep 17 00:00:00 2001
From: Ross Wightman
Date: Fri, 20 Oct 2023 08:52:07 -0700
Subject: [PATCH] Combining CLIPA-v2 and SigLIP (both big_vision based) models (#660)

* merge changes for clipa inference
* update get_tokenizer to pass CI test; replace gelu_appoximate with act_kwargs
* Temporary, cannot have a forced tf dependency
* Supporting SigLIP and CLIPA-v2 models (both sourced from big_vision jax based modelling code).
* Fix some test failures, remove old v1 CLIPA configs, add 336 H14 CLIPA
* Fix torchscript
* Fix CoCa expand typo, force final LN after attentional pool
* Used wrong default clean fn in SimpleTokenizer, put lower case back
* Attempt to fix xlm roberta test w/ pretrained hf weight difference
* SigLIP weights working. More changes to support differing image preprocessing / text tokenization sensibly.
* A typo and unused import
* Fix two small issues, add hf_tokenizer_name to SigLIP models for non-hf-hub use
* CLIPA reference temporary rwightman/ models for testing
* Rename profile->profiler to avoid python naming conflict
* More tokenizer rework, add context_len as class attr set in factory, default __call__() arg to None. Clean up reduction masking logic and fix #680
* fix ViT-SO400M-14-SigLIP name
* Fix CoCa pool LN, improve clarity of ViT pooling logic
* Exclude first/last tokens from tokens output of text models, should match prev CoCa behaviour, but at odds with argmax which leaves special tokens in (not consistent)
* Add eval results for CLIPA + SigLIP models
* Fixup bigG CLIPA config, 83.03 top-1 IN-1k

---------

Co-authored-by: zw <26880977+zw615@users.noreply.github.com>
Co-authored-by: Gabriel Ilharco
---
 docs/openclip_results.csv | 29 +-
 scripts/clipav1_vit_l16_i37_t8.sh | 6 +
 ...vit_h14_i84_224_336_cl32_gap_datacomp1b.sh | 10 +
 src/open_clip/__init__.py | 3 +-
 src/open_clip/big_vision.py | 136 ++++
 src/open_clip/coca_model.py | 25 +-
 src/open_clip/constants.py | 4 +
 src/open_clip/factory.py | 147 +++---
 src/open_clip/hf_model.py | 8 +-
 src/open_clip/model.py | 121 ++++-
 .../model_configs/ViT-B-16-CL16.json | 16 -
 .../model_configs/ViT-B-16-SigLIP-256.json | 29 ++
 .../model_configs/ViT-B-16-SigLIP-384.json | 29 ++
 .../model_configs/ViT-B-16-SigLIP-512.json | 29 ++
 .../ViT-B-16-SigLIP-i18n-256.json | 29 ++
 .../model_configs/ViT-B-16-SigLIP.json | 29 ++
 .../model_configs/ViT-H-14-CL32-GAP.json | 18 -
 .../ViT-H-14-CL8-SyntaxMask-GAP.json | 19 -
 .../model_configs/ViT-H-14-CLIPA-336.json | 26 ++
 .../model_configs/ViT-H-14-CLIPA.json | 26 ++
 .../model_configs/ViT-L-14-CLIPA-336.json | 25 +
 .../model_configs/ViT-L-14-CLIPA.json | 25 +
 .../model_configs/ViT-L-16-CL16-GAP.json | 17 -
 .../ViT-L-16-CL8-Syntax-GAP.json | 18 -
 .../model_configs/ViT-L-16-SigLIP-256.json | 29 ++
 .../model_configs/ViT-L-16-SigLIP-384.json | 29 ++
 .../ViT-SO400M-14-SigLIP-384.json | 30 ++
 .../model_configs/ViT-SO400M-14-SigLIP.json | 30 ++
 .../model_configs/ViT-bigG-14-CLIPA-336.json | 27 ++
 .../model_configs/coca_roberta-ViT-B-32.json | 2 +-
 .../model_configs/mt5-base-ViT-B-32.json | 3 +-
 .../model_configs/mt5-xl-ViT-H-14.json | 3 +-
 .../model_configs/nllb-clip-base.json | 4 +-
 .../model_configs/nllb-clip-large.json | 4 +-
 .../model_configs/roberta-ViT-B-32.json | 3 +-
 .../xlm-roberta-base-ViT-B-32.json | 3 +-
 .../xlm-roberta-large-ViT-H-14.json | 3 +-
 src/open_clip/pos_embed.py | 96 ++++
 src/open_clip/pretrained.py | 103 ++++-
 src/open_clip/push_to_hf_hub.py | 45 +-
 src/open_clip/timm_model.py | 11 +-
 src/open_clip/tokenizer.py | 433 ++++++++++++------
 src/open_clip/transform.py | 302 ++++++++++--
 src/open_clip/transformer.py | 215 ++++++---
 src/training/main.py | 17 +-
 src/training/params.py | 11 +
 src/training/{profile.py => profiler.py} | 0
 src/training/train.py | 4 +-
 src/training/zero_shot.py | 6 +-
 tests/test_hf_model.py | 2 +-
 50 files changed, 1761 insertions(+), 478 deletions(-)
 create mode 100644 scripts/clipav1_vit_l16_i37_t8.sh
 create mode 100644 scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh
 create mode 100644 src/open_clip/big_vision.py
 delete mode 100644 src/open_clip/model_configs/ViT-B-16-CL16.json
 create mode 100644 src/open_clip/model_configs/ViT-B-16-SigLIP-256.json
 create mode 100644 src/open_clip/model_configs/ViT-B-16-SigLIP-384.json
 create mode 100644 src/open_clip/model_configs/ViT-B-16-SigLIP-512.json
 create mode 100644 src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json
 create mode 100644 src/open_clip/model_configs/ViT-B-16-SigLIP.json
 delete mode 100644 src/open_clip/model_configs/ViT-H-14-CL32-GAP.json
 delete mode 100644 src/open_clip/model_configs/ViT-H-14-CL8-SyntaxMask-GAP.json
 create mode 100644 src/open_clip/model_configs/ViT-H-14-CLIPA-336.json
 create mode 100644 src/open_clip/model_configs/ViT-H-14-CLIPA.json
 create mode 100644 src/open_clip/model_configs/ViT-L-14-CLIPA-336.json
 create mode 100644 src/open_clip/model_configs/ViT-L-14-CLIPA.json
 delete mode 100644 src/open_clip/model_configs/ViT-L-16-CL16-GAP.json
 delete mode 100644 src/open_clip/model_configs/ViT-L-16-CL8-Syntax-GAP.json
 create mode 100644 src/open_clip/model_configs/ViT-L-16-SigLIP-256.json
 create mode 100644 src/open_clip/model_configs/ViT-L-16-SigLIP-384.json
 create mode 100644 src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json
 create mode 100644 src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json
 create mode 100644 src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json
 create mode 100644 src/open_clip/pos_embed.py
 rename src/training/{profile.py => profiler.py} (100%)

diff --git a/docs/openclip_results.csv b/docs/openclip_results.csv
index f901e0ecc..8d27ffd32 100644
--- a/docs/openclip_results.csv
+++ b/docs/openclip_results.csv
@@ -1,35 +1,50 @@
name,pretrained,Average perf.
on 38 datasets,ImageNet 1k,Caltech-101,CIFAR-10,CIFAR-100,CLEVR Counts,CLEVR Distance,Country211,Describable Textures,EuroSAT,FGVC Aircraft,Food-101,GTSRB,ImageNet Sketch,ImageNet v2,ImageNet-A,ImageNet-O,ImageNet-R,KITTI Vehicle Distance,MNIST,ObjectNet,Oxford Flowers-102,Oxford-IIIT Pet,Pascal VOC 2007,PatchCamelyon,Rendered SST2,RESISC45,Stanford Cars,STL-10,SUN397,SVHN,Flickr,MSCOCO,WinoGAViL,iWildCam,Camelyon17,FMoW,Dollar Street,GeoDE EVA02-E-14-plus,laion2b_s9b_b144k,0.6930,0.8201,0.9535,0.9934,0.9316,0.2991,0.1998,0.3564,0.6777,0.7574,0.5360,0.9496,0.6740,0.7162,0.7564,0.8223,0.3540,0.9456,0.1842,0.7463,0.7937,0.8433,0.9567,0.8569,0.6442,0.6271,0.7490,0.9457,0.9926,0.7510,0.7560,0.8648,0.5991,0.4403,0.2591,0.6948,0.2668,0.6951,0.9244 +ViT-SO400M-14-SigLIP-384,webli,0.6921,0.8308,0.9599,0.9672,0.8357,0.4071,0.2246,0.3645,0.7303,0.6354,0.6069,0.9635,0.6429,0.7454,0.7717,0.8247,0.2775,0.9575,0.2082,0.8862,0.7694,0.9114,0.9680,0.7171,0.5268,0.7002,0.7211,0.9521,0.9930,0.7541,0.5151,0.8863,0.6331,0.5754,0.2294,0.6149,0.3309,0.7301,0.9328 +ViT-SO400M-14-SigLIP,webli,0.6808,0.8203,0.9600,0.9679,0.8417,0.4210,0.2213,0.3243,0.7106,0.6274,0.6029,0.9556,0.6382,0.7402,0.7607,0.7185,0.2960,0.9506,0.2489,0.8929,0.7061,0.8982,0.9522,0.7034,0.5057,0.6936,0.7257,0.9032,0.9939,0.7436,0.5670,0.8313,0.6071,0.5665,0.1915,0.6215,0.3163,0.7173,0.9278 +ViT-bigG-14-CLIPA-336,datacomp1b,0.6789,0.8230,0.9516,0.9901,0.9099,0.1593,0.2033,0.4041,0.7362,0.6398,0.5401,0.9591,0.6093,0.7359,0.7654,0.8536,0.3025,0.9489,0.3024,0.8513,0.7940,0.8715,0.9545,0.8210,0.5471,0.5261,0.7076,0.9517,0.9966,0.7524,0.6740,0.8450,0.5682,0.4600,0.2363,0.5842,0.1820,0.6998,0.9424 EVA02-E-14,laion2b_s4b_b115k,0.6690,0.8196,0.9541,0.9925,0.9258,0.1632,0.2499,0.3482,0.6878,0.7446,0.4892,0.9523,0.6729,0.7151,0.7566,0.8044,0.3340,0.9407,0.1294,0.7581,0.7674,0.8210,0.9569,0.8136,0.4972,0.5859,0.7324,0.9438,0.9926,0.7658,0.6381,0.8515,0.5892,0.4429,0.2289,0.4894,0.2801,0.6682,0.9182 +ViT-L-16-SigLIP-384,webli,0.6683,0.8207,0.9611,0.9605,0.8188,0.3275,0.2077,0.2470,0.7080,0.5817,0.5312,0.9564,0.6385,0.7360,0.7593,0.7663,0.3130,0.9507,0.2222,0.8525,0.7284,0.8934,0.9681,0.7172,0.5466,0.5634,0.6789,0.9493,0.9924,0.7250,0.5672,0.8756,0.6290,0.5550,0.2236,0.6637,0.1489,0.6916,0.9207 +ViT-H-14-CLIPA-336,datacomp1b,0.6677,0.8180,0.9467,0.9890,0.8968,0.1326,0.2254,0.3551,0.7197,0.6604,0.4718,0.9572,0.5816,0.7282,0.7562,0.8275,0.3115,0.9438,0.2574,0.8245,0.7742,0.8463,0.9573,0.8134,0.4979,0.6052,0.7114,0.9483,0.9955,0.7635,0.6599,0.8356,0.5822,0.4587,0.2239,0.4357,0.2500,0.6822,0.9278 ViT-H-14-quickgelu,metaclip_fullcc,0.6671,0.8051,0.9536,0.9804,0.8634,0.2115,0.1881,0.3716,0.7271,0.6450,0.5114,0.9423,0.6257,0.7052,0.7417,0.7533,0.3040,0.9342,0.2771,0.7266,0.7642,0.8448,0.9561,0.7495,0.6222,0.6925,0.7024,0.8990,0.9944,0.7440,0.5910,0.8507,0.5752,0.5312,0.1680,0.5782,0.2314,0.6811,0.9077 ViT-bigG-14,laion2b_s39b_b160k,0.6667,0.8009,0.9484,0.9824,0.8752,0.2989,0.2002,0.3379,0.6867,0.6919,0.4953,0.9309,0.6244,0.6894,0.7359,0.6933,0.3785,0.9213,0.1308,0.7157,0.7284,0.8163,0.9529,0.8077,0.6364,0.6535,0.7235,0.9460,0.9850,0.7450,0.6961,0.8623,0.5938,0.4488,0.1760,0.5905,0.2352,0.6857,0.9127 +ViT-H-14-CLIPA,datacomp1b,0.6653,0.8152,0.9458,0.9888,0.8991,0.1513,0.2255,0.3401,0.7090,0.7146,0.4751,0.9554,0.5538,0.7272,0.7498,0.7701,0.3135,0.9426,0.2461,0.8189,0.7423,0.8437,0.9559,0.8170,0.4958,0.6189,0.7098,0.9458,0.9948,0.7608,0.6622,0.8344,0.5804,0.4578,0.2160,0.4415,0.2684,0.6694,0.9236 
ViT-L-14,datacomp_xl_s13b_b90k,0.6627,0.7921,0.9465,0.9824,0.8736,0.3555,0.2443,0.3157,0.6649,0.7124,0.4750,0.9452,0.5853,0.6795,0.7205,0.6959,0.3255,0.9083,0.2785,0.8661,0.7425,0.8262,0.9506,0.8247,0.5118,0.6101,0.6941,0.9305,0.9925,0.7427,0.6769,0.8119,0.5451,0.4666,0.1614,0.5089,0.2403,0.6624,0.9152 EVA01-g-14-plus,merged2b_s11b_b114k,0.6624,0.7933,0.9506,0.9910,0.9008,0.2302,0.2293,0.3087,0.6734,0.7280,0.3947,0.9366,0.6644,0.6814,0.7214,0.7416,0.3415,0.9246,0.1491,0.7176,0.7491,0.7959,0.9490,0.8285,0.6244,0.5854,0.7079,0.9073,0.9949,0.7426,0.5951,0.8535,0.5925,0.4684,0.1882,0.7100,0.2283,0.6589,0.9148 ViT-L-14-quickgelu,metaclip_fullcc,0.6592,0.7917,0.9527,0.9759,0.8410,0.3107,0.2260,0.3394,0.6862,0.5894,0.4537,0.9352,0.5623,0.6896,0.7256,0.7231,0.3010,0.9205,0.2785,0.6444,0.7457,0.8143,0.9461,0.8030,0.6197,0.6678,0.7360,0.8868,0.9933,0.7355,0.4681,0.8326,0.5576,0.5357,0.1581,0.7551,0.2592,0.6752,0.9140 EVA02-L-14-336,merged2b_s6b_b61k,0.6583,0.8039,0.9525,0.9892,0.8980,0.3635,0.2485,0.3354,0.6473,0.7139,0.3758,0.9421,0.5759,0.6891,0.7380,0.8289,0.2850,0.9324,0.2377,0.6421,0.7789,0.7645,0.9424,0.8267,0.5487,0.6463,0.6910,0.9158,0.9966,0.7480,0.4575,0.8381,0.5605,0.5053,0.2105,0.5691,0.2198,0.6811,0.9136 +ViT-L-14-CLIPA-336,datacomp1b,0.6570,0.8026,0.9439,0.9864,0.8826,0.1566,0.2439,0.3066,0.6856,0.5811,0.4281,0.9456,0.5695,0.7087,0.7346,0.7771,0.3290,0.9329,0.1997,0.7667,0.7317,0.8100,0.9495,0.7979,0.6028,0.5316,0.6884,0.9407,0.9929,0.7560,0.6290,0.8251,0.5640,0.4449,0.1937,0.6783,0.2500,0.6752,0.9240 +ViT-L-16-SigLIP-256,webli,0.6557,0.8045,0.9593,0.9619,0.8191,0.4065,0.2150,0.2141,0.7027,0.5598,0.5259,0.9463,0.6115,0.7209,0.7376,0.6213,0.3265,0.9396,0.1983,0.8499,0.6526,0.8827,0.9604,0.7409,0.5458,0.6172,0.6817,0.9386,0.9911,0.7253,0.5211,0.8542,0.6154,0.5748,0.1796,0.5757,0.1296,0.6904,0.9173 +ViT-L-14-CLIPA,datacomp1b,0.6536,0.7957,0.9453,0.9866,0.8850,0.1857,0.2449,0.2941,0.6963,0.6044,0.4299,0.9415,0.5906,0.7061,0.7305,0.7125,0.3370,0.9288,0.1927,0.7374,0.6988,0.8101,0.9497,0.8067,0.5915,0.5387,0.6843,0.9366,0.9919,0.7528,0.6390,0.8188,0.5604,0.4388,0.1724,0.6760,0.2457,0.6647,0.9152 convnext_xxlarge,laion2b_s34b_b82k_augreg_soup,0.6530,0.7947,0.9448,0.9822,0.8687,0.1454,0.2365,0.3170,0.7053,0.6128,0.4434,0.9321,0.5508,0.6840,0.7260,0.6719,0.4060,0.9160,0.2363,0.8277,0.7273,0.8241,0.9445,0.8090,0.5142,0.6952,0.7190,0.9409,0.9810,0.7458,0.6254,0.8521,0.5867,0.4702,0.1730,0.6071,0.0000,0.6764,0.9215 convnext_xxlarge,laion2b_s34b_b82k_augreg_rewind,0.6521,0.7931,0.9452,0.9823,0.8686,0.1651,0.2534,0.3155,0.7016,0.6331,0.4398,0.9308,0.5491,0.6825,0.7228,0.6657,0.3975,0.9139,0.2419,0.7930,0.7252,0.8241,0.9438,0.8100,0.5014,0.6897,0.7168,0.9406,0.9801,0.7459,0.6137,0.8498,0.5871,0.4741,0.1735,0.6071,0.0000,0.6799,0.9228 xlm-roberta-large-ViT-H-14,frozen_laion5b_s13b_b90k,0.6515,0.7695,0.9422,0.9718,0.8430,0.3358,0.2050,0.3172,0.6926,0.6793,0.4673,0.9236,0.6239,0.6581,0.6944,0.5935,0.3390,0.8940,0.1364,0.7804,0.6911,0.7532,0.9431,0.7995,0.5792,0.6436,0.6825,0.9362,0.9889,0.7551,0.5950,0.8461,0.5758,0.5206,0.1392,0.6749,0.2098,0.6460,0.9111 ViT-L-14,commonpool_xl_clip_s13b_b90k,0.6501,0.7637,0.9502,0.9797,0.8615,0.2547,0.2451,0.2984,0.6521,0.6681,0.3860,0.9355,0.5980,0.6538,0.6953,0.6197,0.3525,0.8924,0.2982,0.9040,0.7165,0.8006,0.9424,0.8336,0.5688,0.6178,0.6978,0.9352,0.9875,0.7351,0.6853,0.7768,0.5156,0.4728,0.1439,0.5100,0.1705,0.6776,0.9056 
EVA02-L-14,merged2b_s4b_b131k,0.6488,0.7977,0.9512,0.9908,0.9071,0.3176,0.2462,0.3091,0.6319,0.6994,0.3638,0.9340,0.5718,0.6813,0.7295,0.7619,0.2880,0.9272,0.2518,0.6729,0.7489,0.7631,0.9398,0.8220,0.5431,0.6150,0.6968,0.9055,0.9961,0.7410,0.4793,0.8351,0.5556,0.5081,0.1886,0.5124,0.2017,0.6624,0.9073 convnext_xxlarge,laion2b_s34b_b82k_augreg,0.6479,0.7907,0.9429,0.9816,0.8677,0.1399,0.1195,0.3127,0.7096,0.6030,0.4250,0.9295,0.5454,0.6806,0.7223,0.6692,0.4025,0.9131,0.2616,0.8687,0.7235,0.8091,0.9455,0.8116,0.5340,0.6782,0.7100,0.9399,0.9824,0.7436,0.6379,0.8531,0.5834,0.4536,0.1616,0.5719,0.0000,0.6729,0.9228 +ViT-B-16-SigLIP-512,webli,0.6459,0.7914,0.9516,0.9265,0.7146,0.2411,0.2226,0.1927,0.6793,0.4007,0.4521,0.9394,0.5171,0.6990,0.7283,0.6769,0.3615,0.9264,0.3924,0.8288,0.6764,0.8677,0.9499,0.7139,0.6615,0.5722,0.6538,0.9249,0.9853,0.7152,0.5444,0.8578,0.5963,0.5696,0.1925,0.6606,0.1411,0.6928,0.9244 +ViT-H-14-CLIPA-336,laion2b,0.6439,0.7910,0.9438,0.9826,0.8643,0.1835,0.2158,0.3111,0.7160,0.6393,0.3437,0.9303,0.5007,0.6994,0.7241,0.7213,0.3655,0.9269,0.1561,0.6365,0.7022,0.8009,0.9444,0.7723,0.5787,0.6178,0.7029,0.9476,0.9894,0.7567,0.6255,0.8522,0.5883,0.4878,0.1853,0.5001,0.1666,0.6706,0.9257 ViT-g-14,laion2b_s34b_b88k,0.6427,0.7847,0.9452,0.9815,0.8465,0.3768,0.1870,0.3091,0.6856,0.6530,0.4441,0.9241,0.4964,0.6754,0.7158,0.6092,0.3705,0.9020,0.2700,0.7191,0.6908,0.8010,0.9379,0.8166,0.5384,0.5678,0.6960,0.9394,0.9893,0.7411,0.5611,0.8456,0.5758,0.4104,0.1524,0.4771,0.2090,0.6671,0.9090 ViT-H-14,laion2b_s32b_b79k,0.6419,0.7796,0.9421,0.9745,0.8473,0.2676,0.2358,0.2986,0.6782,0.7278,0.4265,0.9273,0.5832,0.6657,0.7090,0.5935,0.3825,0.8934,0.1097,0.7284,0.6941,0.7982,0.9438,0.7768,0.5430,0.6392,0.6995,0.9338,0.9848,0.7521,0.5252,0.8417,0.5770,0.4247,0.1528,0.5638,0.2264,0.6343,0.9086 convnext_large_d_320,laion2b_s29b_b131k_ft_soup,0.6387,0.7685,0.9348,0.9659,0.8304,0.4293,0.2010,0.2654,0.6830,0.7161,0.3621,0.9162,0.5822,0.6504,0.6944,0.6044,0.4410,0.8862,0.1027,0.7434,0.6898,0.7755,0.9358,0.8129,0.4814,0.5585,0.7078,0.9369,0.9856,0.7376,0.6712,0.8467,0.5665,0.4549,0.1786,0.4088,0.1901,0.6449,0.9094 +ViT-B-16-SigLIP-384,webli,0.6379,0.7849,0.9507,0.9276,0.7147,0.2195,0.2239,0.1858,0.6718,0.4307,0.4522,0.9362,0.5196,0.6955,0.7211,0.6233,0.3640,0.9214,0.3333,0.8088,0.6342,0.8624,0.9515,0.7162,0.7010,0.5607,0.6579,0.9245,0.9863,0.7096,0.5285,0.8559,0.5882,0.5719,0.1719,0.5931,0.1365,0.6846,0.9194 ViT-L-14,commonpool_xl_laion_s13b_b90k,0.6360,0.7545,0.9352,0.9796,0.8585,0.3819,0.2489,0.2503,0.6191,0.7378,0.2869,0.9200,0.6018,0.6352,0.6851,0.5747,0.3730,0.8708,0.1378,0.7740,0.6846,0.7435,0.9308,0.8107,0.5069,0.5986,0.7065,0.8912,0.9903,0.7327,0.5730,0.8130,0.5513,0.4966,0.1421,0.5671,0.2337,0.6600,0.9115 EVA01-g-14,laion400m_s11b_b41k,0.6358,0.7852,0.9477,0.9829,0.8865,0.1966,0.2467,0.2862,0.6144,0.7237,0.3226,0.9345,0.4913,0.6730,0.7152,0.7359,0.3285,0.9250,0.2405,0.6218,0.7200,0.7427,0.9414,0.8325,0.4987,0.5832,0.6976,0.9171,0.9889,0.7416,0.5889,0.8037,0.5293,0.4640,0.1975,0.4999,0.1859,0.6741,0.8969 convnext_large_d_320,laion2b_s29b_b131k_ft,0.6345,0.7660,0.9341,0.9647,0.8313,0.3688,0.1999,0.2673,0.6846,0.7131,0.3770,0.9160,0.5688,0.6472,0.6929,0.5933,0.4400,0.8823,0.1027,0.7695,0.6813,0.7696,0.9346,0.8002,0.4576,0.5623,0.6989,0.9348,0.9854,0.7355,0.6496,0.8415,0.5599,0.4558,0.1664,0.4342,0.1782,0.6355,0.9090 
-coca_ViT-L-14,laion2b_s13b_b90k,0.6305,0.7564,0.9433,0.9717,0.8318,0.3565,0.2365,0.2546,0.6271,0.6850,0.3622,0.9045,0.5572,0.6459,0.6794,0.5345,0.3540,0.8819,0.1899,0.7567,0.6414,0.7628,0.9400,0.8112,0.5278,0.6661,0.6883,0.9282,0.9905,0.7394,0.6205,0.8155,0.5431,0.4701,0.1348,0.4125,0.1917,0.6495,0.8969 +coca_ViT-L-14,laion2b_s13b_b90k,0.6327,0.7561,0.9430,0.9722,0.8318,0.3781,0.2446,0.2551,0.6239,0.6752,0.3590,0.9038,0.5624,0.6453,0.6798,0.5336,0.3540,0.8812,0.1899,0.7790,0.6405,0.7643,0.9402,0.8096,0.5500,0.6634,0.6878,0.9276,0.9894,0.7406,0.6237,0.8134,0.5428,0.4739,0.1375,0.4268,0.1932,0.6542,0.8960 ViT-g-14,laion2b_s12b_b42k,0.6299,0.7663,0.9415,0.9706,0.8392,0.3317,0.2225,0.2878,0.6824,0.6469,0.3768,0.9155,0.4985,0.6516,0.6956,0.5716,0.3785,0.8869,0.1350,0.6840,0.6761,0.7800,0.9431,0.8108,0.5624,0.6425,0.7176,0.9292,0.9865,0.7541,0.3930,0.8366,0.5647,0.4427,0.1486,0.4948,0.2040,0.6542,0.9132 convnext_large_d,laion2b_s26b_b102k_augreg,0.6294,0.7591,0.9365,0.9655,0.8309,0.3461,0.1997,0.2525,0.6739,0.6959,0.3610,0.9055,0.5299,0.6430,0.6826,0.5352,0.4425,0.8767,0.1027,0.8063,0.6618,0.7667,0.9282,0.7891,0.5309,0.5612,0.6768,0.9316,0.9829,0.7307,0.6812,0.8384,0.5550,0.4646,0.1549,0.3964,0.1793,0.6402,0.9019 ViT-L-14-336,openai,0.6284,0.7656,0.9225,0.9493,0.7436,0.2003,0.1895,0.3445,0.5559,0.6144,0.3346,0.9386,0.5239,0.6100,0.7089,0.7748,0.3265,0.8905,0.2616,0.7916,0.7183,0.7852,0.9369,0.7815,0.6073,0.7057,0.6379,0.7932,0.9943,0.6865,0.5560,0.7730,0.4751,0.4145,0.1490,0.6456,0.2325,0.6390,0.9015 ViT-L-14-quickgelu,metaclip_400m,0.6252,0.7620,0.9464,0.9544,0.7727,0.2271,0.2514,0.3085,0.6245,0.6033,0.3983,0.9073,0.4755,0.6505,0.6977,0.6640,0.2895,0.8889,0.2419,0.6186,0.6923,0.7648,0.9381,0.7440,0.7039,0.6551,0.6848,0.8477,0.9928,0.7073,0.3239,0.7981,0.5191,0.5175,0.1408,0.6916,0.1874,0.6741,0.8931 +ViT-B-16-SigLIP,webli,0.6232,0.7604,0.9518,0.9234,0.7223,0.2373,0.2409,0.1594,0.6468,0.4428,0.4377,0.9162,0.5164,0.6792,0.6893,0.4541,0.3815,0.9030,0.4093,0.8354,0.5510,0.8549,0.9420,0.7212,0.5953,0.5244,0.6454,0.9081,0.9821,0.7001,0.5586,0.8189,0.5676,0.5738,0.1309,0.6045,0.1265,0.6589,0.9106 +ViT-B-16-SigLIP-256,webli,0.6226,0.7653,0.9496,0.9334,0.7327,0.2276,0.2340,0.1581,0.6574,0.4606,0.4473,0.9200,0.4940,0.6810,0.6920,0.4877,0.3785,0.9076,0.3685,0.8457,0.5723,0.8521,0.9424,0.7254,0.5657,0.5739,0.6440,0.9106,0.9818,0.7026,0.5399,0.8272,0.5724,0.5715,0.1493,0.4966,0.1253,0.6589,0.9061 ViT-L-14,commonpool_xl_s13b_b90k,0.6207,0.7229,0.9327,0.9801,0.8410,0.1985,0.2461,0.2962,0.6202,0.6889,0.1957,0.9107,0.5467,0.6118,0.6511,0.5625,0.2855,0.8594,0.3390,0.9084,0.7022,0.6966,0.9060,0.8076,0.5248,0.5953,0.5756,0.8939,0.9890,0.7103,0.6589,0.7339,0.4652,0.5072,0.1229,0.5246,0.1948,0.6811,0.8990 ViT-L-14,laion2b_s32b_b82k,0.6205,0.7525,0.9388,0.9662,0.8332,0.3123,0.2234,0.2631,0.6293,0.6459,0.3652,0.9100,0.5618,0.6328,0.6780,0.5385,0.3870,0.8742,0.2293,0.5410,0.6529,0.7479,0.9309,0.8053,0.5641,0.5925,0.6687,0.9263,0.9885,0.7434,0.4087,0.8251,0.5493,0.4385,0.1257,0.5972,0.2007,0.6402,0.8919 ViT-L-14,openai,0.6173,0.7554,0.9249,0.9559,0.7582,0.1943,0.2021,0.3187,0.5537,0.6263,0.3181,0.9305,0.5055,0.5959,0.6983,0.7075,0.3235,0.8784,0.2180,0.7634,0.6889,0.7923,0.9323,0.7828,0.5204,0.6881,0.6337,0.7788,0.9936,0.6756,0.5840,0.7508,0.4642,0.4136,0.1211,0.6741,0.2229,0.6297,0.8839 
+coca_ViT-L-14,mscoco_finetuned_laion2b_s13b_b90k,0.6159,0.7204,0.9420,0.9630,0.7965,0.3765,0.2501,0.1800,0.6213,0.5867,0.2329,0.8436,0.5453,0.6114,0.6475,0.4548,0.3865,0.8574,0.3797,0.8292,0.6253,0.7074,0.9115,0.8106,0.4943,0.6107,0.6267,0.8865,0.9861,0.7398,0.5564,0.8373,0.6028,0.5146,0.1303,0.4294,0.1678,0.6636,0.8772 ViT-B-16,datacomp_xl_s13b_b90k,0.6147,0.7349,0.9380,0.9624,0.8212,0.3267,0.2461,0.2215,0.5793,0.5883,0.2970,0.9047,0.5523,0.6044,0.6598,0.4840,0.4285,0.8362,0.2883,0.7649,0.6350,0.7701,0.9254,0.8178,0.6002,0.5162,0.6535,0.8883,0.9811,0.7051,0.6272,0.7633,0.4880,0.4832,0.1181,0.4799,0.1504,0.6168,0.8990 -coca_ViT-L-14,mscoco_finetuned_laion2b_s13b_b90k,0.6138,0.7210,0.9459,0.9626,0.7966,0.3649,0.2488,0.1810,0.6218,0.5904,0.2344,0.8449,0.5532,0.6116,0.6486,0.4568,0.3905,0.8579,0.3502,0.8220,0.6257,0.7078,0.9104,0.8127,0.4687,0.6134,0.6232,0.8875,0.9864,0.7377,0.5317,0.8373,0.6038,0.5178,0.1309,0.4097,0.1682,0.6729,0.8768 ViT-B-32-256,datacomp_s34b_b86k,0.6087,0.7281,0.9348,0.9653,0.8287,0.2489,0.2271,0.1968,0.6064,0.6469,0.3645,0.8909,0.5152,0.6065,0.6481,0.3757,0.4635,0.8344,0.2658,0.7939,0.5960,0.7822,0.9115,0.7880,0.5880,0.5294,0.6505,0.8990,0.9731,0.7021,0.6708,0.7486,0.4892,0.4300,0.0910,0.6252,0.0000,0.6238,0.8923 +ViT-B-16-SigLIP-i18n-256,webli,0.6068,0.7513,0.9475,0.9118,0.7216,0.2552,0.1976,0.1593,0.6426,0.3826,0.3325,0.9171,0.5276,0.6588,0.6814,0.4585,0.3685,0.8920,0.3826,0.8301,0.5976,0.8387,0.9387,0.7536,0.5381,0.5700,0.5737,0.8926,0.9764,0.6978,0.4272,0.8088,0.5470,0.5710,0.1451,0.4899,0.1064,0.6472,0.9186 RN50x64,openai,0.6061,0.7391,0.9026,0.8510,0.5985,0.2254,0.1994,0.2981,0.5314,0.5765,0.3103,0.9205,0.4792,0.5593,0.6706,0.7077,0.3830,0.8441,0.3094,0.8583,0.6820,0.7745,0.9360,0.7398,0.5387,0.7106,0.6265,0.7581,0.9829,0.6661,0.6044,0.7794,0.4683,0.3936,0.1469,0.5280,0.1939,0.6472,0.8898 ViT-B-16-quickgelu,metaclip_fullcc,0.6041,0.7212,0.9328,0.9572,0.7891,0.2935,0.2260,0.2271,0.6223,0.5265,0.3059,0.8882,0.4659,0.6016,0.6505,0.4953,0.4150,0.8423,0.1871,0.6610,0.6138,0.7358,0.9175,0.7818,0.5915,0.5898,0.6744,0.8302,0.9841,0.6879,0.3909,0.7811,0.5035,0.5221,0.1227,0.6993,0.1932,0.6402,0.8868 ViT-L-14,laion400m_e32,0.5971,0.7277,0.9266,0.9464,0.7741,0.2421,0.2452,0.2302,0.6053,0.6233,0.2490,0.9007,0.4989,0.5964,0.6545,0.4647,0.4190,0.8467,0.1997,0.7612,0.5969,0.7306,0.9170,0.7561,0.4968,0.5601,0.6741,0.8962,0.9808,0.7258,0.4955,0.7891,0.5137,0.3932,0.1254,0.4555,0.1708,0.6168,0.8839 @@ -53,7 +68,7 @@ ViT-B-16,laion400m_e32,0.5621,0.6705,0.9131,0.9172,0.7116,0.2869,0.2451,0.1810,0 ViT-B-16,laion400m_e31,0.5617,0.6698,0.9159,0.9169,0.7130,0.2889,0.2451,0.1804,0.5138,0.5033,0.1742,0.8587,0.4353,0.5233,0.5943,0.3327,0.5035,0.7777,0.1997,0.6531,0.5128,0.6693,0.8911,0.7678,0.5925,0.5459,0.5849,0.8365,0.9703,0.6958,0.3388,0.7451,0.4674,0.4225,0.1056,0.5976,0.1546,0.5946,0.8534 ViT-B-32-quickgelu,metaclip_fullcc,0.5577,0.6766,0.9290,0.9518,0.7767,0.1871,0.2307,0.1764,0.5883,0.4991,0.2705,0.8309,0.3922,0.5599,0.5957,0.2993,0.4825,0.7805,0.1871,0.4272,0.5286,0.6935,0.9087,0.7652,0.5596,0.5310,0.6124,0.7738,0.9630,0.6689,0.3447,0.7295,0.4662,0.5238,0.0915,0.5656,0.1588,0.6051,0.8610 convnext_base,laion400m_s13b_b51k,0.5576,0.6627,0.9151,0.8899,0.6462,0.2386,0.2209,0.1700,0.5404,0.4850,0.1556,0.8515,0.4551,0.5196,0.5859,0.3092,0.4925,0.7575,0.2925,0.6114,0.5058,0.6900,0.8853,0.7528,0.6116,0.5376,0.5683,0.8409,0.9656,0.6845,0.4038,0.7438,0.4615,0.4045,0.1095,0.6565,0.1589,0.5537,0.8530 
-coca_ViT-B-32,laion2b_s13b_b90k,0.5547,0.6359,0.9115,0.9389,0.7396,0.1889,0.2057,0.1444,0.5388,0.4615,0.1882,0.7901,0.4474,0.5139,0.5569,0.2160,0.4995,0.7352,0.2686,0.7148,0.4518,0.6296,0.8875,0.7805,0.5974,0.5772,0.6010,0.8414,0.9634,0.6751,0.5519,0.7297,0.4560,0.4588,0.0943,0.5609,0.1088,0.5736,0.8447 +coca_ViT-B-32,laion2b_s13b_b90k,0.5533,0.6331,0.9078,0.9387,0.7378,0.1831,0.2175,0.1450,0.5367,0.4602,0.1783,0.7893,0.4532,0.5121,0.5522,0.2149,0.4920,0.7376,0.2644,0.7097,0.4470,0.6226,0.8875,0.7832,0.5938,0.5766,0.5994,0.8397,0.9626,0.6736,0.5503,0.7248,0.4537,0.4698,0.0876,0.5749,0.1010,0.5724,0.8430 ViT-B-32,laion2b_e16,0.5483,0.6565,0.9104,0.9403,0.7544,0.1923,0.2310,0.1652,0.5383,0.5030,0.2298,0.8166,0.3655,0.5287,0.5739,0.2615,0.5030,0.7588,0.1758,0.6347,0.4877,0.6732,0.8903,0.7877,0.5072,0.5437,0.6190,0.8437,0.9653,0.6851,0.4164,0.7539,0.4768,0.4602,0.0971,0.4648,0.0000,0.5724,0.8526 roberta-ViT-B-32,laion2b_s12b_b32k,0.5411,0.6171,0.9039,0.9325,0.7505,0.1472,0.2007,0.1472,0.5920,0.5215,0.1725,0.7812,0.4082,0.4912,0.5331,0.2120,0.5075,0.7224,0.3854,0.6636,0.4499,0.5893,0.8670,0.7804,0.4985,0.5420,0.6117,0.8315,0.9564,0.6627,0.4526,0.7302,0.4590,0.4583,0.0606,0.4098,0.1161,0.5549,0.8426 ViT-B-32-quickgelu,metaclip_400m,0.5387,0.6558,0.9171,0.9125,0.7006,0.2175,0.2448,0.1716,0.5255,0.5239,0.2680,0.8106,0.3576,0.5330,0.5760,0.2863,0.4680,0.7477,0.2588,0.4144,0.5046,0.6811,0.8877,0.7081,0.6426,0.5338,0.5954,0.7060,0.9543,0.6345,0.2056,0.7007,0.4386,0.5097,0.0819,0.6443,0.0000,0.5970,0.8539 @@ -66,12 +81,12 @@ ViT-B-32-quickgelu,openai,0.5245,0.6332,0.8758,0.8983,0.6423,0.2320,0.2335,0.172 RN50x4,openai,0.5188,0.6627,0.8661,0.7943,0.4514,0.2045,0.0905,0.2039,0.4862,0.3354,0.2102,0.8640,0.3622,0.4468,0.5944,0.4145,0.4955,0.7274,0.2335,0.4903,0.5141,0.6766,0.8829,0.6814,0.5675,0.6716,0.5338,0.6673,0.9658,0.6089,0.3190,0.7234,0.4318,0.3912,0.0870,0.5435,0.1130,0.5654,0.8376 ViT-B-32,laion400m_e31,0.5077,0.6022,0.8916,0.8825,0.6781,0.1549,0.2261,0.1356,0.5218,0.4694,0.1437,0.7814,0.4082,0.4648,0.5234,0.1957,0.5085,0.7079,0.1224,0.4108,0.4281,0.6319,0.8541,0.7312,0.5495,0.5162,0.5108,0.7436,0.9494,0.6508,0.2891,0.6890,0.4327,0.4262,0.0745,0.4975,0.1076,0.5491,0.8328 ViT-B-32,laion400m_e32,0.5074,0.6024,0.8918,0.8840,0.6773,0.1536,0.2261,0.1349,0.5229,0.4754,0.1467,0.7817,0.4070,0.4646,0.5237,0.1953,0.5080,0.7084,0.1181,0.4000,0.4292,0.6323,0.8513,0.7328,0.5490,0.5206,0.5094,0.7454,0.9498,0.6509,0.2759,0.6866,0.4337,0.4265,0.0741,0.5084,0.1068,0.5444,0.8326 -RN101-quickgelu,openai,0.5033,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.6852,0.4025,0.4130,0.0888,0.4723,0.1615,0.5631,0.8164 RN101,openai,0.5033,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.6852,0.4025,0.4130,0.0888,0.4723,0.1615,0.5631,0.8164 +RN101-quickgelu,openai,0.5033,0.6228,0.8527,0.8078,0.4764,0.2437,0.0923,0.1693,0.4335,0.3131,0.1853,0.8367,0.3753,0.4106,0.5612,0.2944,0.5085,0.6817,0.2644,0.5254,0.4515,0.6532,0.8652,0.6512,0.5819,0.6403,0.5476,0.6100,0.9680,0.5803,0.3185,0.6852,0.4025,0.4130,0.0888,0.4723,0.1615,0.5631,0.8164 
ViT-B-16,commonpool_l_laion_s1b_b8k,0.5011,0.5526,0.8766,0.9296,0.7184,0.2681,0.2173,0.1119,0.4144,0.4115,0.0714,0.7661,0.3296,0.4315,0.4790,0.2004,0.4930,0.6501,0.3432,0.4753,0.4638,0.5023,0.7769,0.7686,0.5158,0.5228,0.5314,0.6760,0.9409,0.6278,0.4301,0.6447,0.3924,0.4476,0.0490,0.5127,0.1026,0.5514,0.8463 ViT-B-16,commonpool_l_image_s1b_b8k,0.4812,0.5719,0.8856,0.9321,0.6955,0.2143,0.2453,0.1308,0.4170,0.3193,0.0735,0.7797,0.2514,0.4343,0.4872,0.2143,0.4725,0.6356,0.3826,0.2219,0.4793,0.4817,0.7784,0.7841,0.5002,0.4986,0.4622,0.6627,0.9489,0.6335,0.2673,0.6026,0.3622,0.4787,0.0424,0.5000,0.0000,0.5946,0.8422 -RN50-quickgelu,openai,0.4810,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.6868,0.3869,0.3622,0.0623,0.5624,0.0000,0.5222,0.8129 RN50,openai,0.4810,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.6868,0.3869,0.3622,0.0623,0.5624,0.0000,0.5222,0.8129 +RN50-quickgelu,openai,0.4810,0.5982,0.8329,0.7157,0.4030,0.2171,0.1623,0.1542,0.4154,0.4081,0.1703,0.8080,0.3510,0.3544,0.5284,0.2327,0.5720,0.6073,0.1730,0.5755,0.4141,0.6522,0.8529,0.6510,0.6393,0.5645,0.4521,0.5453,0.9419,0.5994,0.2883,0.6868,0.3869,0.3622,0.0623,0.5624,0.0000,0.5222,0.8129 ViT-B-16,commonpool_l_text_s1b_b8k,0.4760,0.5605,0.8720,0.9391,0.7054,0.1843,0.2373,0.0995,0.3941,0.3830,0.0451,0.7724,0.2317,0.4437,0.4835,0.2220,0.4770,0.6708,0.2686,0.2593,0.4911,0.5164,0.7049,0.7669,0.4857,0.4931,0.4663,0.6525,0.9523,0.6088,0.2122,0.6078,0.3730,0.4570,0.0623,0.5697,0.0000,0.5643,0.8564 ViT-B-16,commonpool_l_basic_s1b_b8k,0.4585,0.5155,0.8444,0.8289,0.5251,0.2061,0.2277,0.1173,0.4133,0.3820,0.0481,0.7461,0.2021,0.3932,0.4325,0.1913,0.4600,0.6087,0.3333,0.2809,0.4493,0.4357,0.6956,0.7151,0.5899,0.5387,0.4313,0.7216,0.9373,0.5974,0.1173,0.6015,0.3583,0.4812,0.0436,0.5712,0.0000,0.5421,0.8384 ViT-B-16,commonpool_l_s1b_b8k,0.4370,0.4593,0.8089,0.9133,0.6421,0.1594,0.2203,0.1177,0.3383,0.3348,0.0316,0.6735,0.2766,0.3448,0.3914,0.1592,0.4335,0.5265,0.2686,0.3603,0.4126,0.3681,0.5587,0.7093,0.5516,0.5118,0.4154,0.6060,0.9339,0.5713,0.3047,0.4948,0.2855,0.4777,0.0399,0.5102,0.0000,0.5654,0.8305 @@ -92,9 +107,9 @@ RN50-quickgelu,yfcc15m,0.2776,0.3275,0.5089,0.4919,0.2033,0.1305,0.1990,0.0637,0 ViT-B-32,commonpool_m_s128m_b4k,0.2580,0.1755,0.5231,0.7459,0.4391,0.1263,0.2265,0.0362,0.1606,0.2537,0.0115,0.2342,0.0869,0.0952,0.1440,0.0388,0.2780,0.1983,0.2743,0.0933,0.1574,0.1128,0.1676,0.5448,0.5048,0.5003,0.1810,0.1332,0.7690,0.3066,0.0933,0.1599,0.0974,0.3983,0.0127,0.5015,0.0000,0.4276,0.5942 ViT-B-32,commonpool_s_clip_s13m_b4k,0.1731,0.0505,0.2483,0.4768,0.1937,0.1529,0.2313,0.0119,0.0782,0.2067,0.0083,0.0801,0.0732,0.0200,0.0380,0.0181,0.1380,0.0655,0.2785,0.0874,0.0506,0.0539,0.0796,0.3379,0.6367,0.5014,0.0806,0.0276,0.5353,0.1126,0.1166,0.0343,0.0224,0.2994,0.0004,0.6874,0.0000,0.2605,0.2827 ViT-B-32,commonpool_s_text_s13m_b4k,0.1573,0.0460,0.2231,0.4679,0.1844,0.1350,0.1899,0.0121,0.0670,0.0896,0.0139,0.0618,0.0411,0.0175,0.0398,0.0187,0.1270,0.0606,0.3980,0.0771,0.0494,0.0428,0.0581,0.2942,0.5027,0.5008,0.1029,0.0204,0.5019,0.1051,0.0933,0.0424,0.0214,0.3120,0.0015,0.5000,0.0000,0.2745,0.2843 
-ViT-B-32,commonpool_s_image_s13m_b4k,0.1449,0.0392,0.2238,0.3176,0.1329,0.1121,0.2217,0.0109,0.0521,0.1593,0.0120,0.0604,0.0579,0.0186,0.0308,0.0155,0.1055,0.0578,0.2883,0.0991,0.0436,0.0528,0.0474,0.2666,0.5273,0.4646,0.0794,0.0173,0.4601,0.0725,0.1305,0.0171,0.0130,0.2525,0.0033,0.5425,0.0085,0.2150,0.2752 ViT-B-32,datacomp_s_s13m_b4k,0.1449,0.0392,0.2238,0.3176,0.1329,0.1121,0.2217,0.0109,0.0521,0.1593,0.0120,0.0604,0.0579,0.0186,0.0308,0.0155,0.1055,0.0578,0.2883,0.0991,0.0436,0.0528,0.0474,0.2666,0.5273,0.4646,0.0794,0.0173,0.4601,0.0725,0.1305,0.0171,0.0130,0.2525,0.0033,0.5425,0.0085,0.2150,0.2752 +ViT-B-32,commonpool_s_image_s13m_b4k,0.1449,0.0392,0.2238,0.3176,0.1329,0.1121,0.2217,0.0109,0.0521,0.1593,0.0120,0.0604,0.0579,0.0186,0.0308,0.0155,0.1055,0.0578,0.2883,0.0991,0.0436,0.0528,0.0474,0.2666,0.5273,0.4646,0.0794,0.0173,0.4601,0.0725,0.1305,0.0171,0.0130,0.2525,0.0033,0.5425,0.0085,0.2150,0.2752 ViT-B-32,commonpool_s_basic_s13m_b4k,0.1423,0.0377,0.1806,0.2664,0.1154,0.1245,0.2335,0.0120,0.0553,0.0587,0.0103,0.0588,0.0638,0.0151,0.0319,0.0203,0.0985,0.0499,0.3390,0.1085,0.0440,0.0351,0.0488,0.3081,0.5096,0.4986,0.0795,0.0200,0.4659,0.0879,0.0810,0.0328,0.0168,0.3033,0.0003,0.5001,0.0000,0.2325,0.2643 ViT-B-32,commonpool_s_s13m_b4k,0.1420,0.0270,0.1564,0.4079,0.1296,0.1305,0.2233,0.0126,0.0574,0.1487,0.0081,0.0473,0.0654,0.0108,0.0234,0.0141,0.1000,0.0404,0.3460,0.0708,0.0360,0.0338,0.0443,0.2235,0.5268,0.5008,0.0698,0.0143,0.4266,0.0766,0.1121,0.0257,0.0132,0.3126,0.0002,0.5124,0.0000,0.2290,0.2167 ViT-B-32,commonpool_s_laion_s13m_b4k,0.1332,0.0305,0.1549,0.3364,0.1347,0.1309,0.1299,0.0098,0.0553,0.1578,0.0134,0.0501,0.0538,0.0125,0.0271,0.0147,0.1015,0.0443,0.2518,0.1387,0.0369,0.0244,0.0399,0.3030,0.4216,0.4992,0.0583,0.0155,0.4874,0.0659,0.1473,0.0223,0.0121,0.2410,0.0017,0.3703,0.0000,0.2079,0.2580 -coca_ViT-B-32,mscoco_finetuned_laion2b_s13b_b90k,0.1064,0.0091,0.0441,0.2002,0.0173,0.1315,0.2019,0.0047,0.0452,0.0844,0.0139,0.0177,0.0298,0.0034,0.0086,0.0091,0.0230,0.0158,0.2714,0.1442,0.0159,0.0131,0.0438,0.1247,0.5183,0.4992,0.0589,0.0058,0.2913,0.0211,0.1519,0.0104,0.0061,0.2375,0.0003,0.5140,0.0000,0.1729,0.0814 +coca_ViT-B-32,mscoco_finetuned_laion2b_s13b_b90k,0.1108,0.0079,0.0320,0.2564,0.0193,0.1245,0.2027,0.0044,0.0303,0.1157,0.0064,0.0159,0.0146,0.0028,0.0067,0.0121,0.0220,0.0199,0.3010,0.1506,0.0144,0.0054,0.0416,0.2023,0.5713,0.4992,0.0478,0.0056,0.2579,0.0204,0.1529,0.0092,0.0060,0.2329,0.0004,0.5681,0.0000,0.1729,0.0589 diff --git a/scripts/clipav1_vit_l16_i37_t8.sh b/scripts/clipav1_vit_l16_i37_t8.sh new file mode 100644 index 000000000..d3ff0901e --- /dev/null +++ b/scripts/clipav1_vit_l16_i37_t8.sh @@ -0,0 +1,6 @@ +# eval on a single gpu +CUDA_VISIBLE_DEVICES=2 TORCH_CUDNN_V8_API_ENABLED=1 TFDS_PREFETCH_SIZE=8192 python3 -m training.main \ + --model ViT-L-16-CL32-GAP \ + --pretrained "/path/to/clipa_vit_l16_i37_t8.pt" \ + --seed 0 \ + --imagenet-val '/path/to/ImageNet/val' \ No newline at end of file diff --git a/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh b/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh new file mode 100644 index 000000000..7f22386c3 --- /dev/null +++ b/scripts/clipav2_vit_h14_i84_224_336_cl32_gap_datacomp1b.sh @@ -0,0 +1,10 @@ +CUDA_VISIBLE_DEVICES=1 python3 -m training.main \ + --model ViT-H-14-CL32-GAP-BigVision \ + --pretrained "/path/to/vit_h14_i84_224_336_cl32_gap_datacomp1b.pt" \ + --force-image-size 336 \ + --square-resize-only \ + --interpolation 'bilinear' \ + --image-mean 0.485 0.456 0.406 \ + 
--image-std 0.229 0.224 0.225 \ + --seed 0 \ + --imagenet-val '/path/to/ImageNet/val' diff --git a/src/open_clip/__init__.py b/src/open_clip/__init__.py index fdb1199b8..23856a3f1 100644 --- a/src/open_clip/__init__.py +++ b/src/open_clip/__init__.py @@ -4,7 +4,8 @@ from .factory import list_models, add_model_config, get_model_config, load_checkpoint from .loss import ClipLoss, DistillClipLoss, CoCaLoss from .model import CLIP, CustomTextCLIP, CLIPTextCfg, CLIPVisionCfg, \ - convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype + convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype, get_input_dtype, \ + get_model_tokenize_cfg, get_model_preprocess_cfg, set_model_preprocess_cfg from .openai import load_openai_model, list_openai_models from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, \ get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained diff --git a/src/open_clip/big_vision.py b/src/open_clip/big_vision.py new file mode 100644 index 000000000..0d7eaf3fa --- /dev/null +++ b/src/open_clip/big_vision.py @@ -0,0 +1,136 @@ +import torch +import numpy as np + +from .model import CustomTextCLIP +from .transformer import TextTransformer, Transformer + + +@torch.no_grad() +def load_big_vision_weights(model: CustomTextCLIP, checkpoint_path: str): + """ Load weights from .npz checkpoints for official Google big_vision image-text models + + Currently the SigLIP source models are supported and a CustomTextCLIP destination model + w/ timm image encoder. + """ + from timm.layers import resample_patch_embed, resample_abs_pos_embed + + def _n2p(w, t=True): + if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: + w = w.flatten() + if t: + if w.ndim == 4: + w = w.transpose([3, 2, 0, 1]) + elif w.ndim == 3: + w = w.transpose([2, 0, 1]) + elif w.ndim == 2: + w = w.transpose([1, 0]) + return torch.from_numpy(w) + + w = np.load(checkpoint_path) + interpolation = 'bilinear' + antialias = False + + def _convert_timm_img(module, prefix): + embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) + if embed_conv_w.shape[-2:] != module.patch_embed.proj.weight.shape[-2:]: + embed_conv_w = resample_patch_embed( + embed_conv_w, + module.patch_embed.proj.weight.shape[-2:], + interpolation=interpolation, + antialias=antialias, + verbose=True, + ) + module.patch_embed.proj.weight.copy_(embed_conv_w) + module.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) + + if module.cls_token is not None: + module.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) + + pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False) + if pos_embed_w.shape != module.pos_embed.shape: + assert False, f'{pos_embed_w.shape}, {module.pos_embed.shape}' + num_prefix_tokens = 0 if getattr(module, 'no_embed_class', False) else getattr(module, 'num_prefix_tokens', 1) + pos_embed_w = resample_abs_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, + new_size=module.patch_embed.grid_size, + num_prefix_tokens=num_prefix_tokens, + interpolation=interpolation, + antialias=antialias, + verbose=True, + ) + module.pos_embed.copy_(pos_embed_w) + + mha_sub, b_sub, ln1_sub = (0, 0, 1) + for i, block in enumerate(module.blocks.children()): + block_prefix = f'{prefix}Transformer/encoderblock_{i}/' + mha_prefix = block_prefix + f'MultiHeadDotProductAttention_{mha_sub}/' + block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) 
+ block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + block.attn.qkv.weight.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')])) + block.attn.qkv.bias.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')])) + block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + for r in range(2): + getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/kernel'])) + getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_{b_sub}/Dense_{r}/bias'])) + block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/scale'])) + block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_{ln1_sub}/bias'])) + + module.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale'])) + module.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias'])) + + if module.attn_pool is not None: + block_prefix = f'{prefix}MAPHead_0/' + mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/' + module.attn_pool.latent.copy_(_n2p(w[f'{block_prefix}probe'], t=False)) + module.attn_pool.q.weight.copy_(_n2p(w[f'{mha_prefix}query/kernel'], t=False).flatten(1).T) + module.attn_pool.q.bias.copy_(_n2p(w[f'{mha_prefix}query/bias'], t=False).reshape(-1)) + module.attn_pool.kv.weight.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('key', 'value')])) + module.attn_pool.kv.bias.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('key', 'value')])) + module.attn_pool.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + module.attn_pool.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + module.attn_pool.norm.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) + module.attn_pool.norm.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + for r in range(2): + getattr(module.attn_pool.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/kernel'])) + getattr(module.attn_pool.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_{r}/bias'])) + + def _convert_openclip_transformer(module: Transformer, prefix): + for i, block in enumerate(module.resblocks.children()): + block_prefix = f'{prefix}encoderblock_{i}/' + mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/' + block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) + block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + block.attn.in_proj_weight.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')])) + block.attn.in_proj_bias.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')])) + block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale'])) + block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias'])) + block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel'])) + block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias'])) + block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel'])) + block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias'])) + + def 
_convert_openclip_txt(module: TextTransformer, prefix): + module.token_embedding.weight.copy_(_n2p(w[f'{prefix}Embed_0/embedding'], t=False)) + pos_embed_w = _n2p(w[f'{prefix}pos_embedding'], t=False).squeeze(0) + module.positional_embedding.copy_(pos_embed_w) + _convert_openclip_transformer(module.transformer, prefix=prefix + 'Encoder_0/') + module.ln_final.weight.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/scale'])) + module.ln_final.bias.copy_(_n2p(w[f'{prefix}Encoder_0/encoder_norm/bias'])) + module.text_projection.weight.copy_(_n2p(w[f'{prefix}head/kernel'])) + module.text_projection.bias.copy_(_n2p(w[f'{prefix}head/bias'])) + + _convert_timm_img(model.visual.trunk, 'params/img/') + _convert_openclip_txt(model.text, 'params/txt/') + model.logit_bias.copy_(_n2p(w['params/b'])[0]) + model.logit_scale.copy_(_n2p(w['params/t'])[0]) + + diff --git a/src/open_clip/coca_model.py b/src/open_clip/coca_model.py index ad81fb665..485de1bd5 100644 --- a/src/open_clip/coca_model.py +++ b/src/open_clip/coca_model.py @@ -123,35 +123,46 @@ def __init__( self.pad_id = pad_id @torch.jit.ignore - def set_grad_checkpointing(self, enable=True): + def set_grad_checkpointing(self, enable: bool = True): self.visual.set_grad_checkpointing(enable) self.text.set_grad_checkpointing(enable) self.text_decoder.set_grad_checkpointing(enable) - def _encode_image(self, images, normalize=True): + def _encode_image(self, images, normalize: bool = True): image_latent, tokens_embs = self.visual(images) image_latent = F.normalize(image_latent, dim=-1) if normalize else image_latent return image_latent, tokens_embs - def _encode_text(self, text, normalize=True, embed_cls=True): + def _encode_text(self, text, normalize: bool = True, embed_cls: bool = True): text = text[:, :-1] if embed_cls else text # make space for CLS token text_latent, token_emb = self.text(text) text_latent = F.normalize(text_latent, dim=-1) if normalize else text_latent return text_latent, token_emb - def encode_image(self, images, normalize=True): + def encode_image(self, images, normalize: bool = True): image_latent, _ = self._encode_image(images, normalize=normalize) return image_latent - def encode_text(self, text, normalize=True, embed_cls=True): + def encode_text(self, text, normalize: bool = True, embed_cls: bool = True): text_latent, _ = self._encode_text(text, normalize=normalize, embed_cls=embed_cls) return text_latent - def forward(self, image, text, embed_cls=True, image_latent=None, image_embs=None): - text_latent, token_embs = self._encode_text(text, embed_cls=embed_cls) + def forward( + self, + image, + text: Optional[torch.Tensor] = None, + embed_cls: bool = True, + image_latent: Optional[torch.Tensor] = None, + image_embs: Optional[torch.Tensor] = None, + ): if image_latent is None or image_embs is None: image_latent, image_embs = self._encode_image(image) + if text is None: + return {"image_features": image_latent, "image_embs": image_embs} + + text_latent, token_embs = self._encode_text(text, embed_cls=embed_cls) + # TODO: add assertion to avoid bugs? 
labels = text[:, -token_embs.shape[1]:] diff --git a/src/open_clip/constants.py b/src/open_clip/constants.py index a670bb3fa..599c48c03 100644 --- a/src/open_clip/constants.py +++ b/src/open_clip/constants.py @@ -1,2 +1,6 @@ OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) +INCEPTION_MEAN = (0.5, 0.5, 0.5) +INCEPTION_STD = (0.5, 0.5, 0.5) diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py index 7268522e4..ef94b51f8 100644 --- a/src/open_clip/factory.py +++ b/src/open_clip/factory.py @@ -1,25 +1,24 @@ import json import logging import os -import pathlib import re from copy import deepcopy +from dataclasses import asdict from pathlib import Path from typing import Any, Dict, Optional, Tuple, Union -from functools import partial import torch from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD from .model import CLIP, CustomTextCLIP, convert_weights_to_lp, convert_to_custom_text_state_dict,\ - resize_pos_embed, get_cast_dtype, resize_text_pos_embed + resize_pos_embed, get_cast_dtype, resize_text_pos_embed, set_model_preprocess_cfg from .coca_model import CoCa from .loss import ClipLoss, DistillClipLoss, CoCaLoss, SigLipLoss from .openai import load_openai_model from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\ list_pretrained_tags_by_model, download_pretrained_from_hf -from .transform import image_transform, AugmentationCfg -from .tokenizer import HFTokenizer, tokenize, syntax_mask_tokenize, random_mask_tokenize, block_mask_tokenize +from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs +from .tokenizer import HFTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH HF_HUB_PREFIX = 'hf-hub:' @@ -75,24 +74,54 @@ def get_model_config(model_name): return None -def get_tokenizer(model_name): +def _get_hf_config(model_id, cache_dir=None): + config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir) + with open(config_path, 'r', encoding='utf-8') as f: + config = json.load(f) + return config + + +def get_tokenizer( + model_name: str = '', + context_length: Optional[int] = None, + **kwargs, +): if model_name.startswith(HF_HUB_PREFIX): - tokenizer = HFTokenizer(model_name[len(HF_HUB_PREFIX):]) + model_name = model_name[len(HF_HUB_PREFIX):] + try: + config = _get_hf_config(model_name)['model_cfg'] + except Exception: + tokenizer = HFTokenizer( + model_name, + context_length=context_length or DEFAULT_CONTEXT_LENGTH, + **kwargs, + ) + return tokenizer else: config = get_model_config(model_name) - if 'hf_tokenizer_name' in config['text_cfg']: - tokenizer = HFTokenizer(config['text_cfg']['hf_tokenizer_name']) - elif 'text_mask' in config['text_cfg'] and config['text_cfg']['text_mask'] == 'syntax': - tokenizer = syntax_mask_tokenize - elif 'text_mask' in config['text_cfg'] and config['text_cfg']['text_mask'] == 'random': - tokenizer = random_mask_tokenize - elif 'text_mask' in config['text_cfg'] and config['text_cfg']['text_mask'] == 'block': - tokenizer = block_mask_tokenize - else: - tokenizer = tokenize - if 'context_length' in config['text_cfg'].keys(): - context_length = config['text_cfg']['context_length'] - tokenizer = partial(tokenizer, context_length=context_length) + assert config is not None, f"No valid model config found for {model_name}." 
+ + text_config = config.get('text_cfg', {}) + if 'tokenizer_kwargs' in text_config: + tokenizer_kwargs = dict(text_config['tokenizer_kwargs'], **kwargs) + else: + tokenizer_kwargs = kwargs + + if context_length is None: + context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH) + + if 'hf_tokenizer_name' in text_config: + tokenizer = HFTokenizer( + text_config['hf_tokenizer_name'], + context_length=context_length, + **tokenizer_kwargs, + ) + else: + tokenizer = SimpleTokenizer( + context_length=context_length, + **tokenizer_kwargs, + ) + return tokenizer @@ -112,6 +141,11 @@ def load_state_dict(checkpoint_path: str, map_location='cpu'): def load_checkpoint(model, checkpoint_path, strict=True): + if Path(checkpoint_path).suffix in ('.npz', '.npy'): + from .big_vision import load_big_vision_weights + load_big_vision_weights(model, checkpoint_path) + return {} + state_dict = load_state_dict(checkpoint_path) # detect old format and make compatible with new format if 'positional_embedding' in state_dict and not hasattr(model, 'positional_embedding'): @@ -136,6 +170,7 @@ def create_model( force_custom_text: bool = False, force_patch_dropout: Optional[float] = None, force_image_size: Optional[Union[int, Tuple[int, int]]] = None, + force_preprocess_cfg: Optional[Dict[str, Any]] = None, pretrained_image: bool = False, pretrained_hf: bool = True, cache_dir: Optional[str] = None, @@ -143,20 +178,19 @@ def create_model( require_pretrained: bool = False, **model_kwargs, ): + force_preprocess_cfg = force_preprocess_cfg or {} + preprocess_cfg = asdict(PreprocessCfg()) has_hf_hub_prefix = model_name.startswith(HF_HUB_PREFIX) if has_hf_hub_prefix: model_id = model_name[len(HF_HUB_PREFIX):] checkpoint_path = download_pretrained_from_hf(model_id, cache_dir=cache_dir) - config_path = download_pretrained_from_hf(model_id, filename='open_clip_config.json', cache_dir=cache_dir) - - with open(config_path, 'r', encoding='utf-8') as f: - config = json.load(f) - pretrained_cfg = config['preprocess_cfg'] + config = _get_hf_config(model_id, cache_dir) + preprocess_cfg = merge_preprocess_dict(preprocess_cfg, config['preprocess_cfg']) model_cfg = config['model_cfg'] + pretrained_hf = False # override, no need to load original HF text weights else: model_name = model_name.replace('/', '-') # for callers using old naming with / in ViT names checkpoint_path = None - pretrained_cfg = {} model_cfg = None if isinstance(device, str): @@ -201,11 +235,12 @@ def create_model( # cast_dtype set for fp16 and bf16 (manual mixed-precision), not set for 'amp' or 'pure' modes cast_dtype = get_cast_dtype(precision) is_hf_model = 'hf_model_name' in model_cfg.get('text_cfg', {}) + if is_hf_model: + # load pretrained weights for HF text model IFF no CLIP weights being loaded + model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf and not pretrained custom_text = model_cfg.pop('custom_text', False) or force_custom_text or is_hf_model if custom_text: - if is_hf_model: - model_cfg['text_cfg']['hf_model_pretrained'] = pretrained_hf if "multimodal_cfg" in model_cfg: model = CoCa(**model_cfg, **model_kwargs, cast_dtype=cast_dtype) else: @@ -222,6 +257,7 @@ def create_model( # Why? The convert_weights_to_lp fn only works with native models. 
model.to(device=device, dtype=dtype) from .transformer import LayerNormFp32 + def _convert_ln(m): if isinstance(m, LayerNormFp32): m.weight.data = m.weight.data.to(torch.float32) @@ -242,6 +278,7 @@ def _convert_ln(m): pretrained_cfg = get_pretrained_cfg(model_name, pretrained) if pretrained_cfg: checkpoint_path = download_pretrained(pretrained_cfg, cache_dir=cache_dir) + preprocess_cfg = merge_preprocess_dict(preprocess_cfg, pretrained_cfg) elif os.path.exists(pretrained): checkpoint_path = pretrained @@ -256,7 +293,7 @@ def _convert_ln(m): raise RuntimeError(error_str) pretrained_loaded = True elif has_hf_hub_prefix: - logging.info(f'Loading pretrained {model_name} weights ({pretrained}).') + logging.info(f'Loading pretrained {model_name} weights ({checkpoint_path}).') load_checkpoint(model, checkpoint_path) pretrained_loaded = True @@ -265,16 +302,18 @@ def _convert_ln(m): raise RuntimeError( f'Pretrained weights were required for (model: {model_name}, pretrained: {pretrained}) but not loaded.') - # set image / mean metadata from pretrained_cfg if available, or use default - model.visual.image_mean = pretrained_cfg.get('mean', None) or OPENAI_DATASET_MEAN - model.visual.image_std = pretrained_cfg.get('std', None) or OPENAI_DATASET_STD - if output_dict and hasattr(model, "output_dict"): model.output_dict = True if jit: model = torch.jit.script(model) + # set image preprocessing configuration in model attributes for convenience + if getattr(model.visual, 'image_size', None) is not None: + # use image_size set on model creation (via config or force_image_size arg) + force_preprocess_cfg['size'] = model.visual.image_size + set_model_preprocess_cfg(model, merge_preprocess_dict(preprocess_cfg, force_preprocess_cfg)) + return model @@ -325,15 +364,20 @@ def create_model_and_transforms( force_custom_text: bool = False, force_patch_dropout: Optional[float] = None, force_image_size: Optional[Union[int, Tuple[int, int]]] = None, - pretrained_image: bool = False, - pretrained_hf: bool = True, image_mean: Optional[Tuple[float, ...]] = None, image_std: Optional[Tuple[float, ...]] = None, + image_interpolation: Optional[str] = None, + image_resize_mode: Optional[str] = None, # only effective for inference aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None, + pretrained_image: bool = False, + pretrained_hf: bool = True, cache_dir: Optional[str] = None, output_dict: Optional[bool] = None, **model_kwargs, ): + force_preprocess_cfg = merge_preprocess_kwargs( + {}, mean=image_mean, std=image_std, interpolation=image_interpolation, resize_mode=image_resize_mode) + model = create_model( model_name, pretrained, @@ -344,6 +388,7 @@ def create_model_and_transforms( force_custom_text=force_custom_text, force_patch_dropout=force_patch_dropout, force_image_size=force_image_size, + force_preprocess_cfg=force_preprocess_cfg, pretrained_image=pretrained_image, pretrained_hf=pretrained_hf, cache_dir=cache_dir, @@ -351,20 +396,16 @@ def create_model_and_transforms( **model_kwargs, ) - image_mean = image_mean or getattr(model.visual, 'image_mean', None) - image_std = image_std or getattr(model.visual, 'image_std', None) - preprocess_train = image_transform( - model.visual.image_size, + pp_cfg = PreprocessCfg(**model.visual.preprocess_cfg) + + preprocess_train = image_transform_v2( + pp_cfg, is_train=True, - mean=image_mean, - std=image_std, aug_cfg=aug_cfg, ) - preprocess_val = image_transform( - model.visual.image_size, + preprocess_val = image_transform_v2( + pp_cfg, is_train=False, - mean=image_mean, - 
std=image_std, ) return model, preprocess_train, preprocess_val @@ -379,12 +420,17 @@ def create_model_from_pretrained( force_quick_gelu: bool = False, force_custom_text: bool = False, force_image_size: Optional[Union[int, Tuple[int, int]]] = None, - return_transform: bool = True, image_mean: Optional[Tuple[float, ...]] = None, image_std: Optional[Tuple[float, ...]] = None, + image_interpolation: Optional[str] = None, + image_resize_mode: Optional[str] = None, # only effective for inference + return_transform: bool = True, cache_dir: Optional[str] = None, **model_kwargs, ): + force_preprocess_cfg = merge_preprocess_kwargs( + {}, mean=image_mean, std=image_std, interpolation=image_interpolation, resize_mode=image_resize_mode) + model = create_model( model_name, pretrained, @@ -394,6 +440,7 @@ def create_model_from_pretrained( force_quick_gelu=force_quick_gelu, force_custom_text=force_custom_text, force_image_size=force_image_size, + force_preprocess_cfg=force_preprocess_cfg, cache_dir=cache_dir, require_pretrained=True, **model_kwargs, @@ -402,13 +449,9 @@ def create_model_from_pretrained( if not return_transform: return model - image_mean = image_mean or getattr(model.visual, 'image_mean', None) - image_std = image_std or getattr(model.visual, 'image_std', None) - preprocess = image_transform( - model.visual.image_size, + preprocess = image_transform_v2( + PreprocessCfg(**model.visual.preprocess_cfg), is_train=False, - mean=image_mean, - std=image_std, ) return model, preprocess diff --git a/src/open_clip/hf_model.py b/src/open_clip/hf_model.py index 08dbdbcde..281a06cc5 100644 --- a/src/open_clip/hf_model.py +++ b/src/open_clip/hf_model.py @@ -103,7 +103,7 @@ def __init__( output_dim: int, config: PretrainedConfig = None, pooler_type: str = None, - proj: str = None, + proj_type: str = None, pretrained: bool = True, output_tokens: bool = False, ): @@ -139,11 +139,11 @@ def __init__( self.pooler = _POOLERS[pooler_type]() d_model = getattr(self.config, arch_dict[self.config.model_type]["config_names"]["width"]) - if (d_model == output_dim) and (proj is None): # do we always need a proj? + if (d_model == output_dim) and (proj_type is None): # do we always need a proj? self.proj = nn.Identity() - elif proj == 'linear': + elif proj_type == 'linear': self.proj = nn.Linear(d_model, output_dim, bias=False) - elif proj == 'mlp': + elif proj_type == 'mlp': hidden_size = (d_model + output_dim) // 2 self.proj = nn.Sequential( nn.Linear(d_model, hidden_size, bias=False), diff --git a/src/open_clip/model.py b/src/open_clip/model.py index 0ccf01bca..0310ee560 100644 --- a/src/open_clip/model.py +++ b/src/open_clip/model.py @@ -2,21 +2,24 @@ Adapted from https://github.com/openai/CLIP. Originally MIT License, Copyright (c) 2021 OpenAI. 
""" -from dataclasses import dataclass +import copy import logging import math -from typing import Optional, Tuple, Union +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple, Union import numpy as np import torch import torch.nn.functional as F from torch import nn from torch.utils.checkpoint import checkpoint +from functools import partial from .hf_model import HFTextEncoder from .modified_resnet import ModifiedResNet from .timm_model import TimmModel -from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer +from .transformer import LayerNormFp32, LayerNorm, QuickGELU, Attention, VisionTransformer, TextTransformer,\ + text_global_pool from .utils import to_2tuple @@ -31,14 +34,18 @@ class CLIPVisionCfg: ls_init_value: Optional[float] = None # layer scale initial value patch_dropout: float = 0. # what fraction of patches to dropout during training (0 would mean disabled and no patches dropped) - 0.5 to 0.75 recommended in the paper for optimal results - input_patchnorm: bool = False # whether to use dual patchnorm - would only apply the input layernorm on each patch, as post-layernorm already exist in original clip vit design - global_average_pool: bool = False # whether to global average pool the last embedding layer, instead of using CLS token (https://arxiv.org/abs/2205.01580) - attentional_pool: bool = False # whether to use attentional pooler in the last embedding layer - n_queries: int = 256 # n_queries for attentional pooler + attentional_pool: bool = False # whether to use attentional pooler in the last embedding layer (overrides pool_type) + attn_pooler_queries: int = 256 # n_queries for attentional pooler attn_pooler_heads: int = 8 # n heads for attentional_pooling + no_ln_pre: bool = False # disable pre transformer LayerNorm + pos_embed_type: str = 'learnable' + final_ln_after_pool: bool = False # apply final LayerNorm after pooling + pool_type: str = 'tok' output_tokens: bool = False + act_kwargs: Optional[dict] = None + norm_kwargs: Optional[dict] = None - timm_model_name: str = None # a valid model name overrides layers, width, patch_size + timm_model_name: Optional[str] = None # a valid model name overrides layers, width, patch_size timm_model_pretrained: bool = False # use (imagenet) pretrained weights for named model timm_pool: str = 'avg' # feature pooling for timm model ('abs_attn', 'rot_attn', 'avg', '') timm_proj: str = 'linear' # linear projection for timm model output ('linear', 'mlp', '') @@ -51,19 +58,29 @@ class CLIPVisionCfg: class CLIPTextCfg: context_length: int = 77 vocab_size: int = 49408 + hf_tokenizer_name: Optional[str] = None + tokenizer_kwargs: Optional[dict] = None + width: int = 512 heads: int = 8 layers: int = 12 + mlp_ratio: float = 4.0 ls_init_value: Optional[float] = None # layer scale initial value - hf_model_name: str = None - hf_tokenizer_name: str = None - hf_model_pretrained: bool = True - proj: str = 'mlp' - pooler_type: str = 'mean_pooler' embed_cls: bool = False pad_id: int = 0 + no_causal_mask: bool = False # disable causal masking + final_ln_after_pool: bool = False # apply final LayerNorm after pooling + pool_type: str = 'argmax' + proj_bias: bool = False output_tokens: bool = False - text_mask: str = 'first' # default first truncate in bpe_tokenizer + act_kwargs: dict = None + norm_kwargs: dict = None + + # HuggingFace specific text tower config + hf_model_name: Optional[str] = None + hf_model_pretrained: bool = True + hf_proj_type: str = 'mlp' + 
hf_pooler_type: str = 'mean_pooler' # attentional pooling for HF models def get_cast_dtype(precision: str): @@ -123,6 +140,11 @@ def _build_vision_tower( else: vision_heads = vision_cfg.width // vision_cfg.head_width norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm + if vision_cfg.norm_kwargs: + norm_layer = partial(norm_layer, **vision_cfg.norm_kwargs) + if vision_cfg.act_kwargs is not None: + act_layer = partial(act_layer, **vision_cfg.act_kwargs) + visual = VisionTransformer( image_size=vision_cfg.image_size, patch_size=vision_cfg.patch_size, @@ -132,11 +154,13 @@ def _build_vision_tower( mlp_ratio=vision_cfg.mlp_ratio, ls_init_value=vision_cfg.ls_init_value, patch_dropout=vision_cfg.patch_dropout, - input_patchnorm=vision_cfg.input_patchnorm, - global_average_pool=vision_cfg.global_average_pool, attentional_pool=vision_cfg.attentional_pool, - n_queries=vision_cfg.n_queries, + attn_pooler_queries=vision_cfg.attn_pooler_queries, attn_pooler_heads=vision_cfg.attn_pooler_heads, + pos_embed_type=vision_cfg.pos_embed_type, + no_ln_pre=vision_cfg.no_ln_pre, + final_ln_after_pool=vision_cfg.final_ln_after_pool, + pool_type=vision_cfg.pool_type, output_tokens=vision_cfg.output_tokens, output_dim=embed_dim, act_layer=act_layer, @@ -159,14 +183,18 @@ def _build_text_tower( text = HFTextEncoder( text_cfg.hf_model_name, output_dim=embed_dim, - proj=text_cfg.proj, - pooler_type=text_cfg.pooler_type, + proj_type=text_cfg.hf_proj_type, + pooler_type=text_cfg.hf_pooler_type, pretrained=text_cfg.hf_model_pretrained, output_tokens=text_cfg.output_tokens, ) else: act_layer = QuickGELU if quick_gelu else nn.GELU norm_layer = LayerNormFp32 if cast_dtype in (torch.float16, torch.bfloat16) else LayerNorm + if text_cfg.norm_kwargs: + norm_layer = partial(norm_layer, **text_cfg.norm_kwargs) + if text_cfg.act_kwargs is not None: + act_layer = partial(act_layer, **text_cfg.act_kwargs) text = TextTransformer( context_length=text_cfg.context_length, @@ -174,11 +202,15 @@ def _build_text_tower( width=text_cfg.width, heads=text_cfg.heads, layers=text_cfg.layers, + mlp_ratio=text_cfg.mlp_ratio, ls_init_value=text_cfg.ls_init_value, output_dim=embed_dim, embed_cls=text_cfg.embed_cls, - output_tokens=text_cfg.output_tokens, + no_causal_mask=text_cfg.no_causal_mask, pad_id=text_cfg.pad_id, + pool_type=text_cfg.pool_type, + proj_bias=text_cfg.proj_bias, + output_tokens=text_cfg.output_tokens, act_layer=act_layer, norm_layer=norm_layer, ) @@ -201,6 +233,7 @@ def __init__( ): super().__init__() self.output_dict = output_dict + self.visual = _build_vision_tower(embed_dim, vision_cfg, quick_gelu, cast_dtype) text = _build_text_tower(embed_dim, text_cfg, quick_gelu, cast_dtype) @@ -211,6 +244,7 @@ def __init__( self.positional_embedding = text.positional_embedding self.ln_final = text.ln_final self.text_projection = text.text_projection + self.text_pool_type = text.pool_type self.register_buffer('attn_mask', text.attn_mask, persistent=False) self.logit_scale = nn.Parameter(torch.ones([]) * init_logit_scale) @@ -242,8 +276,13 @@ def encode_text(self, text, normalize: bool = False): x = self.transformer(x, attn_mask=self.attn_mask) x = x.permute(1, 0, 2) # LND -> NLD x = self.ln_final(x) # [batch_size, n_ctx, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) - x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + x, _ = text_global_pool(x, text, self.text_pool_type) + if self.text_projection is not None: 
+ if isinstance(self.text_projection, nn.Linear): + x = self.text_projection(x) + else: + x = x @ self.text_projection + return F.normalize(x, dim=-1) if normalize else x def forward( @@ -528,4 +567,40 @@ def resize_text_pos_embed(state_dict, model, interpolation: str = 'linear', anti old_pos_embed = old_pos_embed.permute(0, 2, 1)[0] new_pos_embed = old_pos_embed - state_dict['positional_embedding'] = new_pos_embed \ No newline at end of file + state_dict['positional_embedding'] = new_pos_embed + + +def get_model_preprocess_cfg(model): + module = getattr(model, 'visual', model) + preprocess_cfg = getattr(module, 'preprocess_cfg', {}) + if not preprocess_cfg: + # use separate legacy attributes if preprocess_cfg dict not found + size = getattr(module, 'image_size') + if size is not None: + preprocess_cfg['size'] = size + mean = getattr(module, 'image_mean', None) + if mean is not None: + preprocess_cfg['mean'] = getattr(module, 'mean') + std = getattr(module, 'image_std', None) + if std is not None: + preprocess_cfg['std'] = getattr(module, 'std') + return preprocess_cfg + + +def set_model_preprocess_cfg(model, preprocess_cfg: Dict[str, Any]): + module = getattr(model, 'visual', model) + module.image_mean = preprocess_cfg['mean'] # legacy attribute, keeping for bwd compat + module.image_std = preprocess_cfg['std'] # legacy attribute, keeping for bwd compat + module.preprocess_cfg = copy.deepcopy(preprocess_cfg) # new attr, package all pp cfg as dict + + +def get_model_tokenize_cfg(model): + module = getattr(model, 'text', model) + cfg = {} + context_length = getattr(module, 'context_length', None) + if context_length is not None: + cfg['context_length'] = context_length + vocab_size = getattr(module, 'vocab_size', None) + if vocab_size is not None: + cfg['vocab_size'] = vocab_size + return cfg diff --git a/src/open_clip/model_configs/ViT-B-16-CL16.json b/src/open_clip/model_configs/ViT-B-16-CL16.json deleted file mode 100644 index 829f8c40a..000000000 --- a/src/open_clip/model_configs/ViT-B-16-CL16.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "embed_dim": 512, - "vision_cfg": { - "image_size": 224, - "layers": 12, - "width": 768, - "patch_size": 16 - }, - "text_cfg": { - "context_length": 16, - "vocab_size": 49408, - "width": 512, - "heads": 8, - "layers": 12 - } -} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json b/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json new file mode 100644 index 000000000..d7ad3acba --- /dev/null +++ b/src/open_clip/model_configs/ViT-B-16-SigLIP-256.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 768, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 256, + "timm_model_name": "vit_base_patch16_siglip_256", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 768, + "heads": 12, + "layers": 12, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json b/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json new file mode 100644 index 000000000..df9a25cdc --- /dev/null +++ b/src/open_clip/model_configs/ViT-B-16-SigLIP-384.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 768, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 
384, + "timm_model_name": "vit_base_patch16_siglip_384", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 768, + "heads": 12, + "layers": 12, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json b/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json new file mode 100644 index 000000000..88b018528 --- /dev/null +++ b/src/open_clip/model_configs/ViT-B-16-SigLIP-512.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 768, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 512, + "timm_model_name": "vit_base_patch16_siglip_512", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 768, + "heads": 12, + "layers": 12, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json b/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json new file mode 100644 index 000000000..7a28797a7 --- /dev/null +++ b/src/open_clip/model_configs/ViT-B-16-SigLIP-i18n-256.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 768, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 256, + "timm_model_name": "vit_base_patch16_siglip_256", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 250000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP-i18n-256", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 768, + "heads": 12, + "layers": 12, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP.json b/src/open_clip/model_configs/ViT-B-16-SigLIP.json new file mode 100644 index 000000000..a9f2b654a --- /dev/null +++ b/src/open_clip/model_configs/ViT-B-16-SigLIP.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 768, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 224, + "timm_model_name": "vit_base_patch16_siglip_224", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 768, + "heads": 12, + "layers": 12, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-H-14-CL32-GAP.json b/src/open_clip/model_configs/ViT-H-14-CL32-GAP.json deleted file mode 100644 index 26f91cf50..000000000 --- a/src/open_clip/model_configs/ViT-H-14-CL32-GAP.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "embed_dim": 1024, - "vision_cfg": { - "image_size": 224, - "layers": 32, - "width": 1280, - "head_width": 80, - "patch_size": 14, - "global_average_pool": true - }, - "text_cfg": { - "context_length": 32, - 
"vocab_size": 49408, - "width": 1024, - "heads": 16, - "layers": 24 - } -} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-H-14-CL8-SyntaxMask-GAP.json b/src/open_clip/model_configs/ViT-H-14-CL8-SyntaxMask-GAP.json deleted file mode 100644 index 7e28b6173..000000000 --- a/src/open_clip/model_configs/ViT-H-14-CL8-SyntaxMask-GAP.json +++ /dev/null @@ -1,19 +0,0 @@ -{ - "embed_dim": 1024, - "vision_cfg": { - "image_size": 224, - "layers": 32, - "width": 1280, - "head_width": 80, - "patch_size": 14, - "global_average_pool": true - }, - "text_cfg": { - "context_length": 8, - "vocab_size": 49408, - "width": 1024, - "heads": 16, - "layers": 24, - "text_mask": "syntax" - } -} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json b/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json new file mode 100644 index 000000000..01fabb29d --- /dev/null +++ b/src/open_clip/model_configs/ViT-H-14-CLIPA-336.json @@ -0,0 +1,26 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 336, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 14, + "no_ln_pre": true, + "pool_type": "avg", + "final_ln_after_pool": true + }, + "text_cfg": { + "context_length": 32, + "vocab_size": 32000, + "hf_tokenizer_name": "bert-base-uncased", + "tokenizer_kwargs": { + "strip_sep_token": true + }, + "width": 1024, + "heads": 16, + "layers": 24, + "pool_type": "last", + "no_causal_mask": true + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-H-14-CLIPA.json b/src/open_clip/model_configs/ViT-H-14-CLIPA.json new file mode 100644 index 000000000..7df033884 --- /dev/null +++ b/src/open_clip/model_configs/ViT-H-14-CLIPA.json @@ -0,0 +1,26 @@ +{ + "embed_dim": 1024, + "vision_cfg": { + "image_size": 224, + "layers": 32, + "width": 1280, + "head_width": 80, + "patch_size": 14, + "no_ln_pre": true, + "pool_type": "avg", + "final_ln_after_pool": true + }, + "text_cfg": { + "context_length": 32, + "vocab_size": 32000, + "hf_tokenizer_name": "bert-base-uncased", + "tokenizer_kwargs": { + "strip_sep_token": true + }, + "width": 1024, + "heads": 16, + "layers": 24, + "pool_type": "last", + "no_causal_mask": true + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json b/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json new file mode 100644 index 000000000..60a4df589 --- /dev/null +++ b/src/open_clip/model_configs/ViT-L-14-CLIPA-336.json @@ -0,0 +1,25 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 336, + "layers": 24, + "width": 1024, + "patch_size": 14, + "no_ln_pre": true, + "pool_type": "avg", + "final_ln_after_pool": true + }, + "text_cfg": { + "context_length": 32, + "vocab_size": 32000, + "hf_tokenizer_name": "bert-base-uncased", + "tokenizer_kwargs": { + "strip_sep_token": true + }, + "width": 768, + "heads": 12, + "layers": 12, + "pool_type": "last", + "no_causal_mask": true + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-14-CLIPA.json b/src/open_clip/model_configs/ViT-L-14-CLIPA.json new file mode 100644 index 000000000..b4dde7b54 --- /dev/null +++ b/src/open_clip/model_configs/ViT-L-14-CLIPA.json @@ -0,0 +1,25 @@ +{ + "embed_dim": 768, + "vision_cfg": { + "image_size": 224, + "layers": 24, + "width": 1024, + "patch_size": 14, + "no_ln_pre": true, + "pool_type": "avg", + "final_ln_after_pool": true + }, + "text_cfg": { + "context_length": 32, + "vocab_size": 32000, + "hf_tokenizer_name": "bert-base-uncased", + "tokenizer_kwargs": { + 
"strip_sep_token": true + }, + "width": 768, + "heads": 12, + "layers": 12, + "pool_type": "last", + "no_causal_mask": true + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-16-CL16-GAP.json b/src/open_clip/model_configs/ViT-L-16-CL16-GAP.json deleted file mode 100644 index a4262daf6..000000000 --- a/src/open_clip/model_configs/ViT-L-16-CL16-GAP.json +++ /dev/null @@ -1,17 +0,0 @@ -{ - "embed_dim": 768, - "vision_cfg": { - "image_size": 224, - "layers": 24, - "width": 1024, - "patch_size": 16, - "global_average_pool": true - }, - "text_cfg": { - "context_length": 16, - "vocab_size": 49408, - "width": 768, - "heads": 12, - "layers": 12 - } -} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-16-CL8-Syntax-GAP.json b/src/open_clip/model_configs/ViT-L-16-CL8-Syntax-GAP.json deleted file mode 100644 index 3569fdbe8..000000000 --- a/src/open_clip/model_configs/ViT-L-16-CL8-Syntax-GAP.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "embed_dim": 768, - "vision_cfg": { - "image_size": 224, - "layers": 24, - "width": 1024, - "patch_size": 16, - "global_average_pool": true - }, - "text_cfg": { - "context_length": 8, - "vocab_size": 49408, - "width": 768, - "heads": 12, - "layers": 12, - "text_mask": "syntax" - } -} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json b/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json new file mode 100644 index 000000000..5ba8f7abb --- /dev/null +++ b/src/open_clip/model_configs/ViT-L-16-SigLIP-256.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 1024, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 256, + "timm_model_name": "vit_large_patch16_siglip_256", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 1024, + "heads": 16, + "layers": 24, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json b/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json new file mode 100644 index 000000000..fd2cc2e34 --- /dev/null +++ b/src/open_clip/model_configs/ViT-L-16-SigLIP-384.json @@ -0,0 +1,29 @@ +{ + "embed_dim": 1024, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 384, + "timm_model_name": "vit_large_patch16_siglip_384", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 1024, + "heads": 16, + "layers": 24, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json b/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json new file mode 100644 index 000000000..4c527f581 --- /dev/null +++ b/src/open_clip/model_configs/ViT-SO400M-14-SigLIP-384.json @@ -0,0 +1,30 @@ +{ + "embed_dim": 1152, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 384, + "timm_model_name": "vit_so400m_patch14_siglip_384", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + 
"text_cfg": { + "context_length": 64, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 1152, + "heads": 16, + "layers": 27, + "mlp_ratio": 3.7362, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json b/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json new file mode 100644 index 000000000..564eb78a4 --- /dev/null +++ b/src/open_clip/model_configs/ViT-SO400M-14-SigLIP.json @@ -0,0 +1,30 @@ +{ + "embed_dim": 1152, + "init_logit_bias": -10, + "custom_text": true, + "vision_cfg": { + "image_size": 224, + "timm_model_name": "vit_so400m_patch14_siglip_224", + "timm_model_pretrained": false, + "timm_pool": "map", + "timm_proj": "none" + }, + "text_cfg": { + "context_length": 16, + "vocab_size": 32000, + "hf_tokenizer_name": "timm/ViT-B-16-SigLIP", + "tokenizer_kwargs": { + "clean": "canonicalize" + }, + "width": 1152, + "heads": 16, + "layers": 27, + "mlp_ratio": 3.7362, + "no_causal_mask": true, + "proj_bias": true, + "pool_type": "last", + "norm_kwargs":{ + "eps": 1e-6 + } + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json b/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json new file mode 100644 index 000000000..75ba7675c --- /dev/null +++ b/src/open_clip/model_configs/ViT-bigG-14-CLIPA-336.json @@ -0,0 +1,27 @@ +{ + "embed_dim": 1280, + "vision_cfg": { + "image_size": 336, + "layers": 48, + "width": 1664, + "head_width": 104, + "mlp_ratio": 4.9231, + "patch_size": 14, + "no_ln_pre": true, + "pool_type": "avg", + "final_ln_after_pool": true + }, + "text_cfg": { + "context_length": 32, + "vocab_size": 32000, + "hf_tokenizer_name": "bert-base-uncased", + "tokenizer_kwargs": { + "strip_sep_token": true + }, + "width": 1280, + "heads": 20, + "layers": 32, + "pool_type": "last", + "no_causal_mask": true + } +} \ No newline at end of file diff --git a/src/open_clip/model_configs/coca_roberta-ViT-B-32.json b/src/open_clip/model_configs/coca_roberta-ViT-B-32.json index fb46354b9..aa9d3f562 100644 --- a/src/open_clip/model_configs/coca_roberta-ViT-B-32.json +++ b/src/open_clip/model_configs/coca_roberta-ViT-B-32.json @@ -10,7 +10,7 @@ "text_cfg": { "hf_model_name": "roberta-base", "hf_tokenizer_name": "roberta-base", - "proj": "linear", + "hf_proj_type": "linear", "width": 768, "output_tokens": true }, diff --git a/src/open_clip/model_configs/mt5-base-ViT-B-32.json b/src/open_clip/model_configs/mt5-base-ViT-B-32.json index 58cad89cf..e22366897 100644 --- a/src/open_clip/model_configs/mt5-base-ViT-B-32.json +++ b/src/open_clip/model_configs/mt5-base-ViT-B-32.json @@ -9,7 +9,6 @@ "text_cfg": { "hf_model_name": "google/mt5-base", "hf_tokenizer_name": "google/mt5-base", - "proj": "mlp", - "pooler_type": "mean_pooler" + "hf_pooler_type": "mean_pooler" } } diff --git a/src/open_clip/model_configs/mt5-xl-ViT-H-14.json b/src/open_clip/model_configs/mt5-xl-ViT-H-14.json index b43281077..f58717cdd 100644 --- a/src/open_clip/model_configs/mt5-xl-ViT-H-14.json +++ b/src/open_clip/model_configs/mt5-xl-ViT-H-14.json @@ -10,7 +10,6 @@ "text_cfg": { "hf_model_name": "google/mt5-xl", "hf_tokenizer_name": "google/mt5-xl", - "proj": "mlp", - "pooler_type": "mean_pooler" + "hf_pooler_type": "mean_pooler" } } diff --git a/src/open_clip/model_configs/nllb-clip-base.json b/src/open_clip/model_configs/nllb-clip-base.json index 
8b85d0df5..57265b33f 100644 --- a/src/open_clip/model_configs/nllb-clip-base.json +++ b/src/open_clip/model_configs/nllb-clip-base.json @@ -9,7 +9,7 @@ "text_cfg": { "hf_model_name": "facebook/nllb-200-distilled-600M", "hf_tokenizer_name": "facebook/nllb-200-distilled-600M", - "proj": "linear", - "pooler_type": "cls_pooler" + "hf_proj_type": "linear", + "hf_pooler_type": "cls_pooler" } } \ No newline at end of file diff --git a/src/open_clip/model_configs/nllb-clip-large.json b/src/open_clip/model_configs/nllb-clip-large.json index 4e5bc14a8..72d04a733 100644 --- a/src/open_clip/model_configs/nllb-clip-large.json +++ b/src/open_clip/model_configs/nllb-clip-large.json @@ -10,7 +10,7 @@ "text_cfg": { "hf_model_name": "facebook/nllb-200-distilled-1.3B", "hf_tokenizer_name": "facebook/nllb-200-distilled-1.3B", - "proj": "linear", - "pooler_type": "cls_pooler" + "hf_proj_type": "linear", + "hf_pooler_type": "cls_pooler" } } \ No newline at end of file diff --git a/src/open_clip/model_configs/roberta-ViT-B-32.json b/src/open_clip/model_configs/roberta-ViT-B-32.json index ed687d472..c0c7a5599 100644 --- a/src/open_clip/model_configs/roberta-ViT-B-32.json +++ b/src/open_clip/model_configs/roberta-ViT-B-32.json @@ -10,7 +10,6 @@ "text_cfg": { "hf_model_name": "roberta-base", "hf_tokenizer_name": "roberta-base", - "proj": "mlp", - "pooler_type": "mean_pooler" + "hf_pooler_type": "mean_pooler" } } diff --git a/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json b/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json index 751bccc2c..375fa9e12 100644 --- a/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json +++ b/src/open_clip/model_configs/xlm-roberta-base-ViT-B-32.json @@ -9,7 +9,6 @@ "text_cfg": { "hf_model_name": "xlm-roberta-base", "hf_tokenizer_name": "xlm-roberta-base", - "proj": "mlp", - "pooler_type": "mean_pooler" + "hf_pooler_type": "mean_pooler" } } diff --git a/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json b/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json index 31f271faa..c56b4e898 100644 --- a/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json +++ b/src/open_clip/model_configs/xlm-roberta-large-ViT-H-14.json @@ -10,7 +10,6 @@ "text_cfg": { "hf_model_name": "xlm-roberta-large", "hf_tokenizer_name": "xlm-roberta-large", - "proj": "mlp", - "pooler_type": "mean_pooler" + "hf_pooler_type": "mean_pooler" } } diff --git a/src/open_clip/pos_embed.py b/src/open_clip/pos_embed.py new file mode 100644 index 000000000..5c8082b34 --- /dev/null +++ b/src/open_clip/pos_embed.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
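To summarize the key rename applied across the HF text-tower configs above, a small sketch of the post-change layout (values copied from the roberta/nllb configs in this patch; old key names noted in comments):

    text_cfg = {
        "hf_model_name": "roberta-base",
        "hf_tokenizer_name": "roberta-base",
        "hf_proj_type": "mlp",            # previously "proj"
        "hf_pooler_type": "mean_pooler",  # previously "pooler_type"
    }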
+# -------------------------------------------------------- +# Position embedding utils +# -------------------------------------------------------- + +import numpy as np + +import torch + +# -------------------------------------------------------- +# 2D sine-cosine position embedding +# References: +# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=float) + omega /= embed_dim / 2. + omega = 1. 
/ 10000**omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product + + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model): + if 'pos_embed' in checkpoint_model: + pos_embed_checkpoint = checkpoint_model['pos_embed'] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.patch_embed.num_patches + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size)) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model['pos_embed'] = new_pos_embed diff --git a/src/open_clip/pretrained.py b/src/open_clip/pretrained.py index 59961f986..2454f5797 100644 --- a/src/open_clip/pretrained.py +++ b/src/open_clip/pretrained.py @@ -7,6 +7,8 @@ from tqdm import tqdm +from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD, INCEPTION_MEAN, INCEPTION_STD, \ + IMAGENET_MEAN, IMAGENET_STD from .version import __version__ try: @@ -18,13 +20,43 @@ _has_hf_hub = False -def _pcfg(url='', hf_hub='', mean=None, std=None): - return dict( - url=url, - hf_hub=hf_hub, - mean=mean, - std=std, - ) +def _pcfg(url='', hf_hub='', **kwargs): + # OpenAI / OpenCLIP defaults + return { + 'url': url, + 'hf_hub': hf_hub, + 'mean': OPENAI_DATASET_MEAN, + 'std': OPENAI_DATASET_STD, + 'interpolation': 'bicubic', + 'resize_mode': 'shortest', + **kwargs, + } + + +def _slpcfg(url='', hf_hub='', **kwargs): + # SiGLIP defaults + return { + 'url': url, + 'hf_hub': hf_hub, + 'mean': INCEPTION_MEAN, + 'std': INCEPTION_STD, + 'interpolation': 'bicubic', + 'resize_mode': 'squash', + **kwargs, + } + + +def _apcfg(url='', hf_hub='', **kwargs): + # CLIPA defaults + return { + 'url': url, + 'hf_hub': hf_hub, + 'mean': IMAGENET_MEAN, + 'std': IMAGENET_STD, + 'interpolation': 'bilinear', + 'resize_mode': 'squash', + **kwargs, + } _RN50 = dict( @@ -164,7 +196,7 @@ def _pcfg(url='', hf_hub='', mean=None, std=None): "https://github.com/mlfoundations/open_clip/releases/download/v0.2-weights/vit_l_14-laion400m_e32-3d133497.pt"), laion2b_s32b_b82k=_pcfg( hf_hub='laion/CLIP-ViT-L-14-laion2B-s32B-b82K/', - mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)), + mean=INCEPTION_MEAN, std=INCEPTION_STD), # DataComp-XL models datacomp_xl_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-DataComp.XL-s13B-b90K/'), 
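A quick sanity check of the sin-cos table utilities added in pos_embed.py above; this is a sketch, with shapes following directly from the docstrings rather than anything asserted by the patch itself.

    import numpy as np
    from open_clip.pos_embed import get_2d_sincos_pos_embed

    # 14x14 grid (e.g. a 224px image with 16px patches), class-token row prepended
    pe = get_2d_sincos_pos_embed(embed_dim=768, grid_size=14, cls_token=True)
    assert pe.shape == (1 + 14 * 14, 768)
    assert np.allclose(pe[0], 0)  # the class-token row is all zeros by construction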
commonpool_xl_clip_s13b_b90k=_pcfg(hf_hub='laion/CLIP-ViT-L-14-CommonPool.XL.clip-s13B-b90K/'), @@ -263,6 +295,7 @@ def _pcfg(url='', hf_hub='', mean=None, std=None): "RN50x4": _RN50x4, "RN50x16": _RN50x16, "RN50x64": _RN50x64, + "ViT-B-32": _VITB32, "ViT-B-32-256": _VITB32_256, "ViT-B-32-quickgelu": _VITB32_quickgelu, @@ -276,17 +309,21 @@ def _pcfg(url='', hf_hub='', mean=None, std=None): "ViT-H-14-quickgelu": _VITH14_quickgelu, "ViT-g-14": _VITg14, "ViT-bigG-14": _VITbigG14, + "roberta-ViT-B-32": _robertaViTB32, "xlm-roberta-base-ViT-B-32": _xlmRobertaBaseViTB32, "xlm-roberta-large-ViT-H-14": _xlmRobertaLargeFrozenViTH14, + "convnext_base": _convnext_base, "convnext_base_w": _convnext_base_w, "convnext_base_w_320": _convnext_base_w_320, "convnext_large_d": _convnext_large_d, "convnext_large_d_320": _convnext_large_d_320, "convnext_xxlarge": _convnext_xxlarge, + "coca_ViT-B-32": _coca_VITB32, "coca_ViT-L-14": _coca_VITL14, + "EVA01-g-14": dict( # from QuanSun/EVA-CLIP/EVA01_CLIP_g_14_psz14_s11B.pt laion400m_s11b_b41k=_pcfg(hf_hub='timm/eva_giant_patch14_clip_224.laion400m_s11b_b41k/'), @@ -315,6 +352,56 @@ def _pcfg(url='', hf_hub='', mean=None, std=None): # from QuanSun/EVA-CLIP/EVA02_CLIP_E_psz14_plus_s9B.pt laion2b_s9b_b144k=_pcfg(hf_hub='timm/eva02_enormous_patch14_plus_clip_224.laion2b_s9b_b144k/'), ), + + "ViT-B-16-SigLIP": dict( + webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP/'), + ), + "ViT-B-16-SigLIP-256": dict( + webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-256/'), + ), + "ViT-B-16-SigLIP-i18n-256": dict( + webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-i18n-256/'), + ), + "ViT-B-16-SigLIP-384": dict( + webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-384/'), + ), + "ViT-B-16-SigLIP-512": dict( + webli=_slpcfg(hf_hub='timm/ViT-B-16-SigLIP-512/'), + ), + "ViT-L-16-SigLIP-256": dict( + webli=_slpcfg(hf_hub='timm/ViT-L-16-SigLIP-256/'), + ), + "ViT-L-16-SigLIP-384": dict( + webli=_slpcfg(hf_hub='timm/ViT-L-16-SigLIP-384/'), + ), + "ViT-SO400M-14-SigLIP": dict( + webli=_slpcfg(hf_hub='timm/ViT-SO400M-14-SigLIP/'), + ), + "ViT-SO400M-14-SigLIP-384": dict( + webli=_slpcfg(hf_hub='timm/ViT-SO400M-14-SigLIP-384/'), + ), + + # FIXME update CLIPA pretrained to final home, rwightman/ is temporary for testing + "ViT-L-14-CLIPA": dict( + datacomp1b=_apcfg(hf_hub='rwightman/ViT-L-14-CLIPA-datacomp1B/'), + ), + "ViT-L-14-CLIPA-336": dict( + datacomp1b=_apcfg(hf_hub='rwightman/ViT-L-14-CLIPA-336-datacomp1B/'), + ), + "ViT-H-14-CLIPA": dict( + datacomp1b=_apcfg(hf_hub='rwightman/ViT-H-14-CLIPA-datacomp1B/'), + ), + "ViT-H-14-CLIPA-336": dict( + laion2b=_apcfg(hf_hub='rwightman/ViT-H-14-CLIPA-336-laion2B/'), + datacomp1b=_apcfg(hf_hub='rwightman/ViT-H-14-CLIPA-336-datacomp1B/'), + ), + # "ViT-bigG-14-CLIPA": dict( + # datacomp1b=_apcfg(hf_hub='rwightman/ViT-bigG-14-CLIPA-datacomp1B/'), + # ), + "ViT-bigG-14-CLIPA-336": dict( + datacomp1b=_apcfg(hf_hub='rwightman/ViT-bigG-14-CLIPA-336-datacomp1B/'), + ), + "nllb-clip-base": dict( v1=_pcfg(hf_hub='visheratin/nllb-clip-base-oc/'), ), diff --git a/src/open_clip/push_to_hf_hub.py b/src/open_clip/push_to_hf_hub.py index 6e6271da1..dcb8a78b5 100644 --- a/src/open_clip/push_to_hf_hub.py +++ b/src/open_clip/push_to_hf_hub.py @@ -36,6 +36,7 @@ HF_SAFE_WEIGHTS_NAME = "open_clip_model.safetensors" # safetensors version HF_CONFIG_NAME = 'open_clip_config.json' + def save_config_for_hf( model, config_path: str, @@ -45,6 +46,11 @@ def save_config_for_hf( 'mean': model.visual.image_mean, 'std': model.visual.image_std, } + other_pp = getattr(model.visual, 'preprocess_cfg', 
{}) + if 'interpolation' in other_pp: + preprocess_cfg['interpolation'] = other_pp['interpolation'] + if 'resize_mode' in other_pp: + preprocess_cfg['resize_mode'] = other_pp['resize_mode'] hf_config = { 'model_cfg': model_config, 'preprocess_cfg': preprocess_cfg, @@ -59,7 +65,7 @@ def save_for_hf( tokenizer: HFTokenizer, model_config: dict, save_directory: str, - safe_serialization: Union[bool, str] = False, + safe_serialization: Union[bool, str] = 'both', skip_weights : bool = False, ): config_filename = HF_CONFIG_NAME @@ -95,6 +101,7 @@ def push_to_hf_hub( safe_serialization: Union[bool, str] = False, ): if not isinstance(tokenizer, HFTokenizer): + # FIXME this makes it awkward to push models with new tokenizers, come up with better soln. # default CLIP tokenizers use https://huggingface.co/openai/clip-vit-large-patch14 tokenizer = HFTokenizer('openai/clip-vit-large-patch14') @@ -157,12 +164,15 @@ def push_pretrained_to_hf_hub( precision: str = 'fp32', image_mean: Optional[Tuple[float, ...]] = None, image_std: Optional[Tuple[float, ...]] = None, + image_interpolation: Optional[str] = None, + image_resize_mode: Optional[str] = None, # only effective for inference commit_message: str = 'Add model', token: Optional[str] = None, revision: Optional[str] = None, private: bool = False, create_pr: bool = False, model_card: Optional[dict] = None, + hf_tokenizer_self: bool = False, ): model, preprocess_eval = create_model_from_pretrained( model_name, @@ -170,12 +180,16 @@ def push_pretrained_to_hf_hub( precision=precision, image_mean=image_mean, image_std=image_std, + image_interpolation=image_interpolation, + image_resize_mode=image_resize_mode, ) - model_config = get_model_config(model_name) assert model_config tokenizer = get_tokenizer(model_name) + if hf_tokenizer_self: + # make hf tokenizer config in the uploaded model point to self instead of original location + model_config['text']['hf_tokenizer_name'] = repo_id push_to_hf_hub( model=model, @@ -193,10 +207,15 @@ def push_pretrained_to_hf_hub( def generate_readme(model_card: dict, model_name: str): + tags = model_card.pop('tags', ('clip',)) + pipeline_tag = model_card.pop('pipeline_tag', 'zero-shot-image-classification') readme_text = "---\n" - readme_text += "tags:\n- clip\n" + if tags: + readme_text += "tags:\n" + for t in tags: + readme_text += f"- {t}\n" readme_text += "library_name: open_clip\n" - readme_text += "pipeline_tag: zero-shot-image-classification\n" + readme_text += f"pipeline_tag: {pipeline_tag}\n" readme_text += f"license: {model_card.get('license', 'mit')}\n" if 'details' in model_card and 'Dataset' in model_card['details']: readme_text += 'datasets:\n' @@ -262,6 +281,22 @@ def generate_readme(model_card: dict, model_name: str): parser.add_argument( '--image-std', type=float, nargs='+', default=None, metavar='STD', help='Override default image std deviation of of dataset') + parser.add_argument( + '--image-interpolation', + default=None, type=str, choices=['bicubic', 'bilinear', 'random'], + help="image resize interpolation" + ) + parser.add_argument( + '--image-resize-mode', + default=None, type=str, choices=['shortest', 'longest', 'squash'], + help="image resize mode during inference" + ) + parser.add_argument( + "--hf-tokenizer-self", + default=False, + action="store_true", + help="make hf_tokenizer_name point in uploaded config point to itself" + ) args = parser.parse_args() print(f'Saving model {args.model} with pretrained weights {args.pretrained} to Hugging Face Hub at {args.repo_id}') @@ -275,6 +310,8 @@ def 
generate_readme(model_card: dict, model_name: str): precision=args.precision, image_mean=args.image_mean, # override image mean/std if trained w/ non defaults image_std=args.image_std, + image_interpolation=args.image_interpolation, + image_resize_mode=args.image_resize_mode, ) print(f'{args.model} saved.') diff --git a/src/open_clip/timm_model.py b/src/open_clip/timm_model.py index 3d3f595d6..5ddb9a76b 100644 --- a/src/open_clip/timm_model.py +++ b/src/open_clip/timm_model.py @@ -55,11 +55,16 @@ def __init__( timm_kwargs['patch_drop_rate'] = patch_drop custom_pool = pool in ('abs_attn', 'rot_attn') - if not proj and not custom_pool: + if proj: + assert proj in ("linear", "mlp", "none") + extra_proj = proj in ("linear", "mlp") + if not extra_proj and not custom_pool: # use network classifier head as projection if no proj specified and no custom pooling used + # if projection is explicitly set to "none" will be pass through from network trunk + proj_dim = 0 if proj == 'none' else embed_dim self.trunk = timm.create_model( model_name, - num_classes=embed_dim, + num_classes=proj_dim, global_pool=pool, pretrained=pretrained, **timm_kwargs, @@ -99,8 +104,6 @@ def __init__( head_layers['proj'] = nn.Linear(prev_chs, embed_dim, bias=proj_bias) elif proj == 'mlp': head_layers['mlp'] = Mlp(prev_chs, 2 * embed_dim, embed_dim, drop=(drop, 0), bias=(True, proj_bias)) - else: - assert not proj, f'Unknown projection type {proj}.' self.head = nn.Sequential(head_layers) diff --git a/src/open_clip/tokenizer.py b/src/open_clip/tokenizer.py index 3e651aed5..985c0e030 100644 --- a/src/open_clip/tokenizer.py +++ b/src/open_clip/tokenizer.py @@ -5,25 +5,21 @@ import gzip import html import os -from functools import lru_cache -from typing import Union, List +import random +import string +from functools import lru_cache, partial +from typing import Callable, Optional, List, Union import ftfy +import numpy as np import regex as re import torch -import numpy as np # https://stackoverflow.com/q/62691279 -import os os.environ["TOKENIZERS_PARALLELISM"] = "false" +_nltk_init = False -try: - import nltk - # run them for the first time - nltk.download('punkt') - nltk.download('averaged_perceptron_tagger') -except: - nltk = None +DEFAULT_CONTEXT_LENGTH = 77 # default context length for OpenAI CLIP @lru_cache() @@ -78,8 +74,64 @@ def whitespace_clean(text): return text +def _clean_canonicalize(x): + # basic, remove whitespace, remove punctuation, lower case + return canonicalize_text(basic_clean(x)) + + +def _clean_lower(x): + # basic, remove whitespace, lower case + return whitespace_clean(basic_clean(x)).lower() + + +def _clean_whitespace(x): + # basic, remove whitespace + return whitespace_clean(basic_clean(x)) + + +def get_clean_fn(type: str): + if type == 'canonicalize': + return _clean_canonicalize + elif type == 'lower': + return _clean_lower + elif type == 'whitespace': + return _clean_whitespace + else: + assert False, f"Invalid clean function ({type})." + + +def canonicalize_text(text, *, keep_punctuation_exact_string=None): + """Returns canonicalized `text` (lowercase and punctuation removed). + + From: https://github.com/google-research/big_vision/blob/53f18caf27a9419231bbf08d3388b07671616d3d/big_vision/evaluators/proj/image_text/prompt_engineering.py#L94 + + Args: + text: string to be canonicalized. + keep_punctuation_exact_string: If provided, then this exact string kept. + For example providing '{}' will keep any occurrences of '{}' (but will + still remove '{' and '}' that appear separately). 
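Two illustrative inputs/outputs for canonicalize_text, traced from the implementation that follows (not authoritative tests):

    from open_clip.tokenizer import canonicalize_text

    canonicalize_text("Hello,   World!")                                       # -> "hello world"
    canonicalize_text("A photo of a {}.", keep_punctuation_exact_string="{}")  # -> "a photo of a {}"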
+ """ + text = text.replace("_", " ") + if keep_punctuation_exact_string: + text = keep_punctuation_exact_string.join( + part.translate(str.maketrans("", "", string.punctuation)) + for part in text.split(keep_punctuation_exact_string)) + else: + text = text.translate(str.maketrans("", "", string.punctuation)) + text = text.lower() + text = re.sub(r"\s+", " ", text) + return text.strip() + + class SimpleTokenizer(object): - def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): + def __init__( + self, + bpe_path: str = default_bpe(), + additional_special_tokens: Optional[List[str]] = None, + context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH, + clean: str = 'lower', + reduction_mask: str = '' + ): self.byte_encoder = bytes_to_unicode() self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') @@ -89,20 +141,26 @@ def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): vocab = vocab + [v+'' for v in vocab] for merge in merges: vocab.append(''.join(merge)) - if not special_tokens: - special_tokens = ['', ''] - else: - special_tokens = ['', ''] + special_tokens + special_tokens = ['', ''] + if additional_special_tokens: + special_tokens += additional_special_tokens vocab.extend(special_tokens) self.encoder = dict(zip(vocab, range(len(vocab)))) self.decoder = {v: k for k, v in self.encoder.items()} self.bpe_ranks = dict(zip(merges, range(len(merges)))) self.cache = {t:t for t in special_tokens} special = "|".join(special_tokens) - self.pat = re.compile(special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) - + self.pat = re.compile( + special + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE, + ) self.vocab_size = len(self.encoder) self.all_special_ids = [self.encoder[t] for t in special_tokens] + self.sot_token_id = self.all_special_ids[0] + self.eot_token_id = self.all_special_ids[1] + self.context_length = context_length + self.clean_fn = get_clean_fn(clean) + self.reduction_fn = get_reduction_mask_fn(reduction_mask) if reduction_mask else None def bpe(self, token): if token in self.cache: @@ -147,7 +205,7 @@ def bpe(self, token): def encode(self, text): bpe_tokens = [] - text = whitespace_clean(basic_clean(text)).lower() + text = self.clean_fn(text) for token in re.findall(self.pat, text): token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) @@ -158,157 +216,128 @@ def decode(self, tokens): text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') return text + def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.LongTensor: + """ Returns the tokenized representation of given input string(s) -_tokenizer = SimpleTokenizer() + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + context_length : int + The context length to use; all CLIP models use 77 as the context length -def decode(output_ids: torch.Tensor): - output_ids = output_ids.cpu().numpy() - return _tokenizer.decode(output_ids) + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] -def tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: - """ - 
Returns the tokenized representation of given input string(s) - - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all CLIP models use 77 as the context length - - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] + context_length = context_length or self.context_length + assert context_length, 'Please set a valid context length' - sot_token = _tokenizer.encoder[""] - eot_token = _tokenizer.encoder[""] - all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] - result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + if self.reduction_fn is not None: + # use reduction strategy for tokenize if set, otherwise default to truncation below + return self.reduction_fn( + texts, + context_length=context_length, + sot_token_id=self.sot_token_id, + eot_token_id=self.eot_token_id, + encode_fn=self.encode, + ) - for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length: - tokens = tokens[:context_length] # Truncate - tokens[-1] = eot_token - result[i, :len(tokens)] = torch.tensor(tokens) + all_tokens = [[self.sot_token_id] + self.encode(text) + [self.eot_token_id] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) - return result + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + tokens = tokens[:context_length] # Truncate + tokens[-1] = self.eot_token_id + result[i, :len(tokens)] = torch.tensor(tokens) + return result -class HFTokenizer: - """HuggingFace tokenizer wrapper""" - def __init__(self, tokenizer_name: str): - from transformers import AutoTokenizer - self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) +_tokenizer = SimpleTokenizer() - def save_pretrained(self, dest): - self.tokenizer.save_pretrained(dest) - def __call__(self, texts: Union[str, List[str]], context_length: int = 77) -> torch.Tensor: - # same cleaning as for default tokenizer, except lowercasing - # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance - if isinstance(texts, str): - texts = [texts] - texts = [whitespace_clean(basic_clean(text)) for text in texts] - input_ids = self.tokenizer( - texts, - return_tensors='pt', - max_length=context_length, - padding='max_length', - truncation=True, - ).input_ids - return input_ids +def decode(output_ids: torch.Tensor): + output_ids = output_ids.cpu().numpy() + return _tokenizer.decode(output_ids) -def random_mask_tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all CLIP models use 77 as the context length - - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] +def tokenize(texts: Union[str, List[str]], context_length: int = DEFAULT_CONTEXT_LENGTH) -> torch.LongTensor: + return _tokenizer(texts, context_length=context_length) + - sot_token = _tokenizer.encoder[""] - eot_token = _tokenizer.encoder[""] - all_tokens = [_tokenizer.encode(text) for text in texts] 
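A small usage sketch of the class-based tokenizer defined above; 'lower' and 77 are simply the defaults shown in __init__, and the strings are illustrative.

    from open_clip.tokenizer import SimpleTokenizer

    tokenizer = SimpleTokenizer(context_length=77, clean='lower')
    tokens = tokenizer(["a photo of a cat", "a photo of a dog"])
    assert tokens.shape == (2, 77)  # sot + bpe tokens + eot, zero padded to context_length

    # Passing reduction_mask selects one of the masking strategies defined next
    # ('simple', 'random', 'shuffle', 'syntax') instead of plain truncation.
    masked_tokenizer = SimpleTokenizer(context_length=16, reduction_mask='random')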
+def random_mask_tokenize( + texts: Union[str, List[str]], + context_length: int, + sot_token_id: int, + eot_token_id: int, + encode_fn: Callable, + shuffle: bool = False, +): + all_tokens = [encode_fn(text) for text in texts] result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length - 2: # 2 for sot and eot token - indices = np.random.permutation(len(tokens)).tolist() - indices = indices[:context_length - 2] + tokens = torch.tensor(tokens) + num_tokens = len(tokens) + if num_tokens > context_length - 2: # 2 for sot and eot token + num_keep = context_length - 2 + indices = torch.randperm(len(tokens)) + indices = indices[:num_keep] + if not shuffle: + indices = indices.msort() tokens = tokens[indices] - tokens = [sot_token,] + tokens + [eot_token,] - result[i, :len(tokens)] = torch.tensor(tokens) + num_tokens = num_keep + result[i, 0] = sot_token_id + result[i, 1:num_tokens + 1] = tokens + result[i, num_tokens + 1] = eot_token_id return result -def block_mask_tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s) - - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all CLIP models use 77 as the context length - - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] - """ - if isinstance(texts, str): - texts = [texts] - - sot_token = _tokenizer.encoder[""] - eot_token = _tokenizer.encoder[""] - all_tokens = [_tokenizer.encode(text) for text in texts] +def simple_mask_tokenize( + texts: Union[str, List[str]], + context_length: int, + sot_token_id: int, + eot_token_id: int, + encode_fn: Callable, +): + all_tokens = [encode_fn(text) for text in texts] result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) for i, tokens in enumerate(all_tokens): - if len(tokens) > context_length - 2: # 2 for sot and eot token - start_index = np.random.randint(len(tokens) - context_length + 3) - tokens = tokens[start_index : start_index + context_length - 2] - tokens = [sot_token,] + tokens + [eot_token,] + num_tokens = len(tokens) + if num_tokens > context_length - 2: # 2 for sot and eot token + num_keep = context_length - 2 + start_index = random.randint(0, num_tokens - num_keep) # high is incl + tokens = tokens[start_index: start_index + num_keep] + tokens = [sot_token_id] + tokens + [eot_token_id] result[i, :len(tokens)] = torch.tensor(tokens) return result -def syntax_mask_tokenize(texts: Union[str, List[str]], context_length: int = 77) -> torch.LongTensor: - """ - Returns the tokenized representation of given input string(s). +def syntax_mask_tokenize( + texts: Union[str, List[str]], + context_length: int, + sot_token_id: int, + eot_token_id: int, + encode_fn: Callable, +) -> torch.LongTensor: + """ Returns the tokenized representation of given input string(s). Apply syntax masking before tokenize. 
- - Parameters - ---------- - texts : Union[str, List[str]] - An input string or a list of input strings to tokenize - context_length : int - The context length to use; all CLIP models use 77 as the context length - - Returns - ------- - A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] """ - assert nltk is not None - if isinstance(texts, str): - texts = [texts] + import nltk + global _nltk_init + if not _nltk_init: + # run them for the first time + nltk.download('punkt') + nltk.download('averaged_perceptron_tagger') + _nltk_init = True def get_order(x): if x.startswith('NN'): @@ -319,6 +348,7 @@ def get_order(x): return 3 else: return 4 + # syntax masking new_texts = [] for text in texts: @@ -328,8 +358,7 @@ def get_order(x): order_list = [get_order(tag) for _, tag in pos_tags] sorted_ids = np.argsort(np.array(order_list)) sampled_ids = sorted(sorted_ids[:context_length - 2]) # need 2 slots for sot and eot tokens - # sample the tokens and convert to tf.tensor - sampled_tokens = np.take(np.array(list_tokens), sampled_ids, axis=0) + sampled_tokens = np.take(np.array(list_tokens), sampled_ids, axis=0) # sample the tokens new_text = '' for token in sampled_tokens: @@ -338,16 +367,130 @@ def get_order(x): new_texts.append(new_text) texts = new_texts - sot_token = _tokenizer.encoder[""] - eot_token = _tokenizer.encoder[""] - all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + all_tokens = [[sot_token_id] + encode_fn(text) + [eot_token_id] for text in texts] result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) for i, tokens in enumerate(all_tokens): # still need first truncate because some words produces two tokens if len(tokens) > context_length: tokens = tokens[:context_length] # Truncate - tokens[-1] = eot_token + tokens[-1] = eot_token_id result[i, :len(tokens)] = torch.tensor(tokens) - return result \ No newline at end of file + return result + + +def get_reduction_mask_fn(type: str): + """ Choose strategy for dropping (masking) tokens to achieve target context length""" + assert type in ('simple', 'random', 'shuffle', 'syntax') + if type == 'simple': + return simple_mask_tokenize # randomly select block [start:end] + elif type == 'random': + return random_mask_tokenize # randomly drop tokens (keep order) + elif type == 'shuffle': + return partial(random_mask_tokenize, shuffle=True) # randomly drop tokens (shuffle order) + elif type == 'syntax': + return syntax_mask_tokenize # randomly drop prioritized by syntax + + +class HFTokenizer: + """HuggingFace tokenizer wrapper""" + + def __init__( + self, + tokenizer_name: str, + context_length: Optional[int] = DEFAULT_CONTEXT_LENGTH, + clean: str = 'whitespace', + strip_sep_token: bool = False, + ): + from transformers import AutoTokenizer + self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + self.context_length = context_length + self.clean_fn = get_clean_fn(clean) + self.strip_sep_token = strip_sep_token + + def save_pretrained(self, dest): + self.tokenizer.save_pretrained(dest) + + def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.Tensor: + # same cleaning as for default tokenizer, except lowercasing + # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance + if isinstance(texts, str): + texts = [texts] + + context_length = context_length or self.context_length + assert context_length, 'Please set a valid context length in class 
init or call.' + + texts = [self.clean_fn(text) for text in texts] + input_ids = self.tokenizer( + texts, + return_tensors='pt', + max_length=context_length, + padding='max_length', + truncation=True, + ).input_ids + + if self.strip_sep_token: + input_ids = torch.where( + input_ids == self.tokenizer.sep_token_id, + torch.zeros_like(input_ids), + input_ids, + ) + + return input_ids + + +class SigLipTokenizer: + """HuggingFace tokenizer wrapper for SigLIP T5 compatible sentencepiece vocabs + """ + VOCAB_FILES = { + # english, vocab_size=32_000 + "c4-en": "http://storage.googleapis.com/t5-data/vocabs/cc_en.32000/sentencepiece.model", + # used in multilingual models (mT5, PaLI), vocab_size=250_000 + "mc4": "http://storage.googleapis.com/t5-data/vocabs/mc4.250000.100extra/sentencepiece.model", + } + + def __init__( + self, + tokenizer_name: str, + context_length: Optional[int] = 64, + ): + from transformers import T5TokenizerFast + + if tokenizer_name in self.VOCAB_FILES: + # FIXME temporary hack? + import fsspec + import tempfile + vocab_file = self.VOCAB_FILES[tokenizer_name] + with tempfile.NamedTemporaryFile('wb') as dst: + with fsspec.open(vocab_file, 'rb') as src: + dst.write(src.read()) + self.tokenizer = T5TokenizerFast(dst.name, legacy=False) + else: + self.tokenizer = T5TokenizerFast(tokenizer_name, legacy=False) + + self.tokenizer.pad_token_id = 1 + self.tokenizer.eos_token_id = 1 + self.context_length = context_length + + def save_pretrained(self, dest): + self.tokenizer.save_pretrained(dest) + + def __call__(self, texts: Union[str, List[str]], context_length: Optional[int] = None) -> torch.Tensor: + # same cleaning as for default tokenizer, except lowercasing + # adding lower (for case-sensitive tokenizers) will make it more robust but less sensitive to nuance + if isinstance(texts, str): + texts = [texts] + + context_length = context_length or self.context_length + assert context_length, 'Please set a valid context length in class init or call.' + + texts = [canonicalize_text(basic_clean(text)) for text in texts] + output = self.tokenizer( + texts, + return_tensors='pt', + max_length=context_length, + padding='max_length', + truncation=True, + ) + return output.input_ids diff --git a/src/open_clip/transform.py b/src/open_clip/transform.py index 59f13bb59..45a8e5428 100644 --- a/src/open_clip/transform.py +++ b/src/open_clip/transform.py @@ -1,16 +1,61 @@ +import numbers +import random import warnings from dataclasses import dataclass, asdict -from typing import Any, Dict, Optional, Sequence, Tuple, Union +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -import random import torch -import torch.nn as nn import torchvision.transforms.functional as F - from torchvision.transforms import Normalize, Compose, RandomResizedCrop, InterpolationMode, ToTensor, Resize, \ CenterCrop, ColorJitter, Grayscale from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD +from .utils import to_2tuple + + +@dataclass +class PreprocessCfg: + size: Union[int, Tuple[int, int]] = 224 + mode: str = 'RGB' + mean: Tuple[float, ...] = OPENAI_DATASET_MEAN + std: Tuple[float, ...] 
= OPENAI_DATASET_STD + interpolation: str = 'bicubic' + resize_mode: str = 'shortest' + fill_color: int = 0 + + def __post_init__(self): + assert self.mode in ('RGB') + + @property + def num_channels(self): + return 3 + + @property + def input_size(self): + return (self.num_channels(),) + to_2tuple(self.size) + +_PREPROCESS_KEYS = set(asdict(PreprocessCfg()).keys()) + + +def merge_preprocess_dict( + base: Union[PreprocessCfg, Dict], + overlay: Dict, +): + """ Merge overlay key-value pairs on top of base preprocess cfg or dict. + Input dicts are filtered based on PreprocessCfg fields. + """ + if isinstance(base, PreprocessCfg): + base_clean = asdict(base) + else: + base_clean = {k: v for k, v in base.items() if k in _PREPROCESS_KEYS} + if overlay: + overlay_clean = {k: v for k, v in overlay.items() if k in _PREPROCESS_KEYS and v is not None} + base_clean.update(overlay_clean) + return base_clean + + +def merge_preprocess_kwargs(base: PreprocessCfg, **kwargs): + return merge_preprocess_dict(base, kwargs) @dataclass @@ -18,41 +63,177 @@ class AugmentationCfg: scale: Tuple[float, float] = (0.9, 1.0) ratio: Optional[Tuple[float, float]] = None color_jitter: Optional[Union[float, Tuple[float, float, float], Tuple[float, float, float, float]]] = None - interpolation: Optional[str] = None re_prob: Optional[float] = None re_count: Optional[int] = None use_timm: bool = False + # params for simclr_jitter_gray color_jitter_prob: float = None gray_scale_prob: float = None -class ResizeMaxSize(nn.Module): +def _setup_size(size, error_msg): + if isinstance(size, numbers.Number): + return int(size), int(size) - def __init__(self, max_size, interpolation=InterpolationMode.BICUBIC, fn='max', fill=0): - super().__init__() - if not isinstance(max_size, int): - raise TypeError(f"Size should be int. Got {type(max_size)}") - self.max_size = max_size + if isinstance(size, Sequence) and len(size) == 1: + return size[0], size[0] + + if len(size) != 2: + raise ValueError(error_msg) + + return size + + +class ResizeKeepRatio: + """ Resize and Keep Ratio + + Copy & paste from `timm` + """ + + def __init__( + self, + size, + longest=0., + interpolation=InterpolationMode.BICUBIC, + random_scale_prob=0., + random_scale_range=(0.85, 1.05), + random_aspect_prob=0., + random_aspect_range=(0.9, 1.11) + ): + if isinstance(size, (list, tuple)): + self.size = tuple(size) + else: + self.size = (size, size) self.interpolation = interpolation - self.fn = min if fn == 'min' else min - self.fill = fill + self.longest = float(longest) # [0, 1] where 0 == shortest edge, 1 == longest + self.random_scale_prob = random_scale_prob + self.random_scale_range = random_scale_range + self.random_aspect_prob = random_aspect_prob + self.random_aspect_range = random_aspect_range - def forward(self, img): - if isinstance(img, torch.Tensor): - height, width = img.shape[:2] + @staticmethod + def get_params( + img, + target_size, + longest, + random_scale_prob=0., + random_scale_range=(0.85, 1.05), + random_aspect_prob=0., + random_aspect_range=(0.9, 1.11) + ): + """Get parameters + """ + source_size = img.size[::-1] # h, w + h, w = source_size + target_h, target_w = target_size + ratio_h = h / target_h + ratio_w = w / target_w + ratio = max(ratio_h, ratio_w) * longest + min(ratio_h, ratio_w) * (1. 
- longest) + if random_scale_prob > 0 and random.random() < random_scale_prob: + ratio_factor = random.uniform(random_scale_range[0], random_scale_range[1]) + ratio_factor = (ratio_factor, ratio_factor) else: - width, height = img.size - scale = self.max_size / float(max(height, width)) - new_size = tuple(round(dim * scale) for dim in (height, width)) - if scale != 1.0: - img = F.resize(img, new_size, self.interpolation) - if not width == height: - pad_h = self.max_size - new_size[0] - pad_w = self.max_size - new_size[1] - img = F.pad(img, padding=[pad_w//2, pad_h//2, pad_w - pad_w//2, pad_h - pad_h//2], fill=self.fill) + ratio_factor = (1., 1.) + if random_aspect_prob > 0 and random.random() < random_aspect_prob: + aspect_factor = random.uniform(random_aspect_range[0], random_aspect_range[1]) + ratio_factor = (ratio_factor[0] / aspect_factor, ratio_factor[1] * aspect_factor) + size = [round(x * f / ratio) for x, f in zip(source_size, ratio_factor)] + return size + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Resized, padded to at least target size, possibly cropped to exactly target size + """ + size = self.get_params( + img, self.size, self.longest, + self.random_scale_prob, self.random_scale_range, + self.random_aspect_prob, self.random_aspect_range + ) + img = F.resize(img, size, self.interpolation) return img + def __repr__(self): + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += f', interpolation={self.interpolation})' + format_string += f', longest={self.longest:.3f})' + return format_string + + +def center_crop_or_pad(img: torch.Tensor, output_size: List[int], fill=0) -> torch.Tensor: + """Center crops and/or pads the given image. + If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + img (PIL Image or Tensor): Image to be cropped. + output_size (sequence or int): (height, width) of the crop box. If int or sequence with single int, + it is used for both directions. + fill (int, Tuple[int]): Padding color + + Returns: + PIL Image or Tensor: Cropped image. + """ + if isinstance(output_size, numbers.Number): + output_size = (int(output_size), int(output_size)) + elif isinstance(output_size, (tuple, list)) and len(output_size) == 1: + output_size = (output_size[0], output_size[0]) + + _, image_height, image_width = F.get_dimensions(img) + crop_height, crop_width = output_size + + if crop_width > image_width or crop_height > image_height: + padding_ltrb = [ + (crop_width - image_width) // 2 if crop_width > image_width else 0, + (crop_height - image_height) // 2 if crop_height > image_height else 0, + (crop_width - image_width + 1) // 2 if crop_width > image_width else 0, + (crop_height - image_height + 1) // 2 if crop_height > image_height else 0, + ] + img = F.pad(img, padding_ltrb, fill=fill) + _, image_height, image_width = F.get_dimensions(img) + if crop_width == image_width and crop_height == image_height: + return img + + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + return F.crop(img, crop_top, crop_left, crop_height, crop_width) + + +class CenterCropOrPad(torch.nn.Module): + """Crops the given image at the center. 
+ If the image is torch Tensor, it is expected + to have [..., H, W] shape, where ... means an arbitrary number of leading dimensions. + If image size is smaller than output size along any edge, image is padded with 0 and then center cropped. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. If provided a sequence of length 1, it will be interpreted as (size[0], size[0]). + """ + + def __init__(self, size, fill=0): + super().__init__() + self.size = _setup_size(size, error_msg="Please provide only two dimensions (h, w) for size.") + self.fill = fill + + def forward(self, img): + """ + Args: + img (PIL Image or Tensor): Image to be cropped. + + Returns: + PIL Image or Tensor: Cropped image. + """ + return center_crop_or_pad(img, self.size, fill=self.fill) + + def __repr__(self) -> str: + return f"{self.__class__.__name__}(size={self.size})" + def _convert_to_rgb(image): return image.convert('RGB') @@ -89,12 +270,14 @@ def __call__(self, img): else: return img + def image_transform( - image_size: int, + image_size: Union[int, Tuple[int, int]], is_train: bool, mean: Optional[Tuple[float, ...]] = None, std: Optional[Tuple[float, ...]] = None, - resize_longest_max: bool = False, + resize_mode: Optional[str] = None, + interpolation: Optional[str] = None, fill_color: int = 0, aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None, ): @@ -106,15 +289,21 @@ def image_transform( if not isinstance(std, (list, tuple)): std = (std,) * 3 - if isinstance(image_size, (list, tuple)) and image_size[0] == image_size[1]: - # for square size, pass size as int so that Resize() uses aspect preserving shortest edge - image_size = image_size[0] + interpolation = interpolation or 'bicubic' + assert interpolation in ['bicubic', 'bilinear', 'random'] + # NOTE random is ignored for interpolation_mode, so defaults to BICUBIC for inference if set + interpolation_mode = InterpolationMode.BILINEAR if interpolation == 'bilinear' else InterpolationMode.BICUBIC + + resize_mode = resize_mode or 'shortest' + assert resize_mode in ('shortest', 'longest', 'squash') if isinstance(aug_cfg, dict): aug_cfg = AugmentationCfg(**aug_cfg) else: aug_cfg = aug_cfg or AugmentationCfg() + normalize = Normalize(mean=mean, std=std) + if is_train: aug_cfg_dict = {k: v for k, v in asdict(aug_cfg).items() if v is not None} use_timm = aug_cfg_dict.pop('use_timm', False) @@ -125,13 +314,11 @@ def image_transform( input_size = (3,) + image_size[-2:] else: input_size = (3, image_size, image_size) - # by default, timm aug randomly alternates bicubic & bilinear for better robustness at inference time - aug_cfg_dict.setdefault('interpolation', 'random') - aug_cfg_dict.setdefault('color_jitter', None) # disable by default - # drop extra item - aug_cfg_dict.pop('color_jitter_prob', False) - aug_cfg_dict.pop('gray_scale_prob', False) + aug_cfg_dict.setdefault('color_jitter', None) # disable by default + # drop extra non-timm items + aug_cfg_dict.pop('color_jitter_prob', None) + aug_cfg_dict.pop('gray_scale_prob', None) train_transform = create_transform( input_size=input_size, @@ -140,6 +327,7 @@ def image_transform( mean=mean, std=std, re_mode='pixel', + interpolation=interpolation, **aug_cfg_dict, ) else: @@ -169,18 +357,50 @@ def image_transform( warnings.warn(f'Unused augmentation cfg items, specify `use_timm` to use ({list(aug_cfg_dict.keys())}).') return train_transform else: - if resize_longest_max: + if resize_mode == 'longest': 
transforms = [ - ResizeMaxSize(image_size, fill=fill_color) + ResizeKeepRatio(image_size, interpolation=interpolation_mode, longest=1), + CenterCropOrPad(image_size, fill=fill_color) ] - else: + elif resize_mode == 'squash': + if isinstance(image_size, int): + image_size = (image_size, image_size) transforms = [ - Resize(image_size, interpolation=InterpolationMode.BICUBIC), - CenterCrop(image_size), + Resize(image_size, interpolation=interpolation_mode), ] + else: + assert resize_mode == 'shortest' + if not isinstance(image_size, (tuple, list)): + image_size = (image_size, image_size) + if image_size[0] == image_size[1]: + # simple case, use torchvision built-in Resize w/ shortest edge mode (scalar size arg) + transforms = [ + Resize(image_size[0], interpolation=interpolation_mode) + ] + else: + # resize shortest edge to matching target dim for non-square target + transforms = [ResizeKeepRatio(image_size)] + transforms += [CenterCrop(image_size)] + transforms.extend([ _convert_to_rgb, ToTensor(), normalize, ]) return Compose(transforms) + + +def image_transform_v2( + cfg: PreprocessCfg, + is_train: bool, + aug_cfg: Optional[Union[Dict[str, Any], AugmentationCfg]] = None, +): + return image_transform( + image_size=cfg.size, + is_train=is_train, + mean=cfg.mean, + std=cfg.std, + interpolation=cfg.interpolation, + resize_mode=cfg.resize_mode, + fill_color=cfg.fill_color, + ) \ No newline at end of file diff --git a/src/open_clip/transformer.py b/src/open_clip/transformer.py index ce5e0d3f7..6d4e604d8 100644 --- a/src/open_clip/transformer.py +++ b/src/open_clip/transformer.py @@ -1,6 +1,7 @@ from collections import OrderedDict import math from typing import Callable, Optional, Sequence, Tuple +from functools import partial import torch from torch import nn @@ -8,6 +9,7 @@ from torch.utils.checkpoint import checkpoint from .utils import to_2tuple +from .pos_embed import get_2d_sincos_pos_embed class LayerNormFp32(nn.LayerNorm): @@ -179,12 +181,9 @@ def forward(self, x: torch.Tensor): x = self.ln_k(x).permute(1, 0, 2) # NLD -> LND N = x.shape[1] q = self.ln_q(self.query) - out = self.attn(self._repeat(q, N), x, x, need_weights=False)[0] + out = self.attn(q.unsqueeze(1).expand(-1, N, -1), x, x, need_weights=False)[0] return out.permute(1, 0, 2) # LND -> NLD - def _repeat(self, query, N: int): - return query.unsqueeze(1).repeat(1, N, 1) - class ResidualAttentionBlock(nn.Module): def __init__( @@ -273,8 +272,8 @@ def __init__( mlp_width = int(d_model * mlp_ratio) self.mlp = nn.Sequential(OrderedDict([ ("c_fc", nn.Linear(d_model, mlp_width)), - ('ln', norm_layer(mlp_width) if scale_fc else nn.Identity()), ("gelu", act_layer()), + ('ln', norm_layer(mlp_width) if scale_fc else nn.Identity()), ("c_proj", nn.Linear(mlp_width, d_model)) ])) self.ls_2 = LayerScale(d_model, ls_init_value) if ls_init_value is not None else nn.Identity() @@ -285,6 +284,10 @@ def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): return x +def _expand_token(token, batch_size: int): + return token.view(1, 1, -1).expand(batch_size, -1, -1) + + class Transformer(nn.Module): def __init__( self, @@ -334,44 +337,51 @@ def __init__( heads: int, mlp_ratio: float, ls_init_value: float = None, - global_average_pool: bool = False, attentional_pool: bool = False, - n_queries: int = 256, + attn_pooler_queries: int = 256, attn_pooler_heads: int = 8, output_dim: int = 512, patch_dropout: float = 0., - input_patchnorm: bool = False, + no_ln_pre: bool = False, + pos_embed_type: str = 'learnable', + pool_type: str = 
'tok', + final_ln_after_pool: bool = False, act_layer: Callable = nn.GELU, norm_layer: Callable = LayerNorm, - output_tokens: bool = False + output_tokens: bool = False, ): super().__init__() + assert pool_type in ('tok', 'avg', 'none') self.output_tokens = output_tokens image_height, image_width = self.image_size = to_2tuple(image_size) patch_height, patch_width = self.patch_size = to_2tuple(patch_size) self.grid_size = (image_height // patch_height, image_width // patch_width) + self.final_ln_after_pool = final_ln_after_pool # currently ignored w/ attn pool enabled self.output_dim = output_dim - # whether to layernorm each patch, as done in dual patchnorm paper - https://arxiv.org/abs/2302.01327v1 - self.input_patchnorm = input_patchnorm - - if input_patchnorm: - patch_input_dim = patch_height * patch_width * 3 - self.patchnorm_pre_ln = LayerNorm(patch_input_dim) - self.conv1 = nn.Linear(patch_input_dim, width) - else: - self.patchnorm_pre_ln = nn.Identity() - self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) + self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) # class embeddings and positional embeddings scale = width ** -0.5 self.class_embedding = nn.Parameter(scale * torch.randn(width)) - self.positional_embedding = nn.Parameter(scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width)) + if pos_embed_type == 'learnable': + self.positional_embedding = nn.Parameter( + scale * torch.randn(self.grid_size[0] * self.grid_size[1] + 1, width)) + elif pos_embed_type == 'sin_cos_2d': + # fixed sin-cos embedding + assert self.grid_size[0] == self.grid_size[1],\ + 'currently sin cos 2d pos embedding only supports square input' + self.positional_embedding = nn.Parameter( + torch.zeros(self.grid_size[0] * self.grid_size[1] + 1, width), requires_grad=False) + pos_embed_type = get_2d_sincos_pos_embed(width, self.grid_size[0], cls_token=True) + self.positional_embedding.data.copy_(torch.from_numpy(pos_embed_type).float()) + else: + raise ValueError # setting a patch_dropout of 0. would mean it is disabled and this function would be the identity fn self.patch_dropout = PatchDropout(patch_dropout) if patch_dropout > 0. 
else nn.Identity() - self.ln_pre = norm_layer(width) + self.ln_pre = nn.Identity() if no_ln_pre else norm_layer(width) self.transformer = Transformer( width, layers, @@ -382,15 +392,43 @@ def __init__( norm_layer=norm_layer, ) - self.global_average_pool = global_average_pool if attentional_pool: - self.attn_pool = AttentionalPooler(output_dim, width, n_head=attn_pooler_heads, n_queries=n_queries) - self.ln_post = norm_layer(output_dim) - self.proj = nn.Parameter(scale * torch.randn(output_dim, output_dim)) + if isinstance(attentional_pool, str): + self.attn_pool_type = attentional_pool + self.pool_type = 'none' + if attentional_pool in ('parallel', 'cascade'): + self.attn_pool = AttentionalPooler( + output_dim, + width, + n_head=attn_pooler_heads, + n_queries=attn_pooler_queries, + ) + self.attn_pool_contrastive = AttentionalPooler( + output_dim, + width, + n_head=attn_pooler_heads, + n_queries=1, + ) + else: + assert False + else: + self.attn_pool_type = '' + self.pool_type = pool_type + self.attn_pool = AttentionalPooler( + output_dim, + width, + n_head=attn_pooler_heads, + n_queries=attn_pooler_queries, + ) + self.attn_pool_contrastive = None + pool_dim = output_dim else: self.attn_pool = None - self.ln_post = norm_layer(width) - self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + pool_dim = width + self.pool_type = pool_type + + self.ln_post = norm_layer(pool_dim) + self.proj = nn.Parameter(scale * torch.randn(pool_dim, output_dim)) self.init_parameters() @@ -452,33 +490,25 @@ def set_grad_checkpointing(self, enable=True): self.transformer.grad_checkpointing = enable def _global_pool(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: - if self.global_average_pool: - return x.mean(dim=1), x + if self.pool_type == 'avg': + pooled, tokens = x[:, 1:].mean(dim=1), x[:, 1:] + elif self.pool_type == 'tok': + pooled, tokens = x[:, 0], x[:, 1:] else: - return x[:, 0], x[:, 1:] + pooled = tokens = x - def forward(self, x: torch.Tensor): + return pooled, tokens - # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1 - if self.input_patchnorm: - # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)') - x = x.reshape(x.shape[0], x.shape[1], self.grid_size[0], self.patch_size[0], self.grid_size[1], self.patch_size[1]) - x = x.permute(0, 2, 4, 1, 3, 5) - x = x.reshape(x.shape[0], self.grid_size[0] * self.grid_size[1], -1) - x = self.patchnorm_pre_ln(x) - x = self.conv1(x) - else: - x = self.conv1(x) # shape = [*, width, grid, grid] - x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] - x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] # class embeddings and positional embeddings - x = torch.cat( - [self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), - x], dim=1) # shape = [*, grid ** 2 + 1, width] + x = torch.cat([_expand_token(self.class_embedding, x.shape[0]).to(x.dtype), x], dim=1) + # shape = [*, grid ** 2 + 1, width] x = x + self.positional_embedding.to(x.dtype) - # a patch_dropout of 0. 
would mean it is disabled and this function would do nothing but return what was passed in x = self.patch_dropout(x) x = self.ln_pre(x) @@ -487,12 +517,26 @@ def forward(self, x: torch.Tensor): x = x.permute(1, 0, 2) # LND -> NLD if self.attn_pool is not None: - x = self.attn_pool(x) - x = self.ln_post(x) + if self.attn_pool_contrastive is not None: + # This is untested, WIP pooling that should match paper + x = self.ln_post(x) # TBD LN first or separate one after each pool? + tokens = self.attn_pool(x) + if self.attn_pool_type == 'parallel': + pooled = self.attn_pool_contrastive(x) + else: + assert self.attn_pool_type == 'cascade' + pooled = self.attn_pool_contrastive(tokens) + else: + # this is the original OpenCLIP CoCa setup, does not match paper + x = self.attn_pool(x) + x = self.ln_post(x) + pooled, tokens = self._global_pool(x) + elif self.final_ln_after_pool: pooled, tokens = self._global_pool(x) + pooled = self.ln_post(pooled) else: + x = self.ln_post(x) pooled, tokens = self._global_pool(x) - pooled = self.ln_post(pooled) if self.proj is not None: pooled = pooled @ self.proj @@ -503,6 +547,21 @@ def forward(self, x: torch.Tensor): return pooled +def text_global_pool(x, text: Optional[torch.Tensor] = None, pool_type: str = 'argmax'): + if pool_type == 'first': + pooled, tokens = x[:, 0], x[:, 1:] + elif pool_type == 'last': + pooled, tokens = x[:, -1], x[:, :-1] + elif pool_type == 'argmax': + # take features from the eot embedding (eot_token is the highest number in each sequence) + assert text is not None + pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x + else: + pooled = tokens = x + + return pooled, tokens + + class TextTransformer(nn.Module): output_tokens: torch.jit.Final[bool] @@ -513,15 +572,20 @@ def __init__( width: int = 512, heads: int = 8, layers: int = 12, + mlp_ratio: float = 4.0, ls_init_value: float = None, output_dim: int = 512, - act_layer: Callable = nn.GELU, - norm_layer: Callable = LayerNorm, embed_cls: bool = False, + no_causal_mask: bool = False, pad_id: int = 0, + pool_type: str = 'argmax', + proj_bias: bool = False, + act_layer: Callable = nn.GELU, + norm_layer: Callable = LayerNorm, output_tokens: bool = False, ): super().__init__() + assert pool_type in ('first', 'last', 'argmax', 'none') self.output_tokens = output_tokens self.num_pos = self.context_length = context_length self.vocab_size = vocab_size @@ -529,28 +593,35 @@ def __init__( self.output_dim = output_dim self.heads = heads self.pad_id = pad_id + self.pool_type = pool_type - self.text_projection = nn.Parameter(torch.empty(width, output_dim)) - + self.token_embedding = nn.Embedding(vocab_size, width) if embed_cls: self.cls_emb = nn.Parameter(torch.empty(width)) self.num_pos += 1 else: self.cls_emb = None - - self.token_embedding = nn.Embedding(vocab_size, width) self.positional_embedding = nn.Parameter(torch.empty(self.num_pos, width)) self.transformer = Transformer( width=width, layers=layers, heads=heads, + mlp_ratio=mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer, ) self.ln_final = norm_layer(width) - self.register_buffer('attn_mask', self.build_attention_mask(), persistent=False) + if no_causal_mask: + self.attn_mask = None + else: + self.register_buffer('attn_mask', self.build_causal_mask(), persistent=False) + + if proj_bias: + self.text_projection = nn.Linear(width, output_dim) + else: + self.text_projection = nn.Parameter(torch.empty(width, output_dim)) self.init_parameters() @@ -570,13 +641,18 @@ def init_parameters(self): 
nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) if self.text_projection is not None: - nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + if isinstance(self.text_projection, nn.Linear): + nn.init.normal_(self.text_projection.weight, std=self.transformer.width ** -0.5) + if self.text_projection.bias is not None: + nn.init.zeros_(self.text_projection.bias) + else: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) @torch.jit.ignore def set_grad_checkpointing(self, enable=True): self.transformer.grad_checkpointing = enable - def build_attention_mask(self): + def build_causal_mask(self): # lazily create causal attention mask, with full attention between the tokens # pytorch uses additive attention mask; fill with -inf mask = torch.empty(self.num_pos, self.num_pos) @@ -593,9 +669,6 @@ def build_cls_mask(self, text, cast_dtype: torch.dtype): additive_mask = torch.repeat_interleave(additive_mask, self.heads, 0) return additive_mask - def _repeat(self, t, N: int): - return t.reshape(1, 1, -1).repeat(N, 1, 1) - def forward(self, text): cast_dtype = self.transformer.get_cast_dtype() seq_len = text.shape[1] @@ -604,9 +677,10 @@ def forward(self, text): attn_mask = self.attn_mask if self.cls_emb is not None: seq_len += 1 - x = torch.cat([x, self._repeat(self.cls_emb, x.shape[0])], dim=1) + x = torch.cat([x, _expand_token(self.cls_emb, x.shape[0])], dim=1) cls_mask = self.build_cls_mask(text, cast_dtype) - attn_mask = attn_mask[None, :seq_len, :seq_len] + cls_mask[:, :seq_len, :seq_len] + if attn_mask is not None: + attn_mask = attn_mask[None, :seq_len, :seq_len] + cls_mask[:, :seq_len, :seq_len] x = x + self.positional_embedding[:seq_len].to(cast_dtype) x = x.permute(1, 0, 2) # NLD -> LND @@ -614,16 +688,19 @@ def forward(self, text): x = x.permute(1, 0, 2) # LND -> NLD # x.shape = [batch_size, n_ctx, transformer.width] - # take features from the eot embedding (eot_token is the highest number in each sequence) if self.cls_emb is not None: - pooled, tokens = x[:, -1], x[:, :-1] - pooled = self.ln_final(pooled) + # presence of appended cls embed (CoCa) overrides pool_type, always take last token + pooled, tokens = text_global_pool(x, pool_type='last') + pooled = self.ln_final(pooled) # final LN applied after pooling in this case else: x = self.ln_final(x) - pooled, tokens = x[torch.arange(x.shape[0]), text.argmax(dim=-1)], x + pooled, tokens = text_global_pool(x, text, pool_type=self.pool_type) if self.text_projection is not None: - pooled = pooled @ self.text_projection + if isinstance(self.text_projection, nn.Linear): + pooled = self.text_projection(pooled) + else: + pooled = pooled @ self.text_projection if self.output_tokens: return pooled, tokens diff --git a/src/training/main.py b/src/training/main.py index 08d2412e2..94496999f 100644 --- a/src/training/main.py +++ b/src/training/main.py @@ -6,6 +6,7 @@ import sys import random from datetime import datetime +from functools import partial import numpy as np import torch @@ -229,10 +230,12 @@ def main(args): force_custom_text=args.force_custom_text, force_patch_dropout=args.force_patch_dropout, force_image_size=args.force_image_size, - pretrained_image=args.pretrained_image, image_mean=args.image_mean, image_std=args.image_std, + image_interpolation=args.image_interpolation, + image_resize_mode=args.image_resize_mode, # only effective for inference aug_cfg=args.aug_cfg, + pretrained_image=args.pretrained_image, output_dict=True, **model_kwargs, ) @@ -350,7 +353,13 @@ def main(args): 
logging.info(f"=> loaded checkpoint '{args.resume}' (epoch {start_epoch})") # initialize datasets - data = get_data(args, (preprocess_train, preprocess_val), epoch=start_epoch, tokenizer=get_tokenizer(args.model)) + tokenizer = get_tokenizer(args.model) + data = get_data( + args, + (preprocess_train, preprocess_val), + epoch=start_epoch, + tokenizer=tokenizer, + ) assert len(data), 'At least one train or eval dataset must be specified.' # create scheduler if train @@ -415,7 +424,7 @@ def main(args): from open_clip.utils import convert_int8_model_to_inference_mode convert_int8_model_to_inference_mode(model) # Evaluate. - evaluate(model, data, start_epoch, args, writer) + evaluate(model, data, start_epoch, args, tb_writer=writer, tokenizer=tokenizer) return loss = create_loss(args) @@ -428,7 +437,7 @@ def main(args): completed_epoch = epoch + 1 if any(v in data for v in ('val', 'imagenet-val', 'imagenet-v2')): - evaluate(model, data, completed_epoch, args, writer) + evaluate(model, data, completed_epoch, args, tb_writer=writer, tokenizer=tokenizer) # Saving checkpoints. if args.save_logs: diff --git a/src/training/params.py b/src/training/params.py index 345382e57..3ea5a8f3b 100644 --- a/src/training/params.py +++ b/src/training/params.py @@ -234,6 +234,16 @@ def parse_args(args): parser.add_argument( '--image-std', type=float, nargs='+', default=None, metavar='STD', help='Override default image std deviation of of dataset') + parser.add_argument( + '--image-interpolation', + default=None, type=str, choices=['bicubic', 'bilinear', 'random'], + help="Override default image resize interpolation" + ) + parser.add_argument( + '--image-resize-mode', + default=None, type=str, choices=['shortest', 'longest', 'squash'], + help="Override default image resize (& crop) mode during inference" + ) parser.add_argument('--aug-cfg', nargs='*', default={}, action=ParseKwargs) parser.add_argument( "--grad-checkpointing", @@ -442,6 +452,7 @@ def parse_args(args): action="store_true", help='Use SigLip (sigmoid) loss.' ) + args = parser.parse_args(args) # If some params are not passed, we use the default values based on model name. 
diff --git a/src/training/profile.py b/src/training/profiler.py similarity index 100% rename from src/training/profile.py rename to src/training/profiler.py diff --git a/src/training/train.py b/src/training/train.py index 902fbe36a..a48a34593 100644 --- a/src/training/train.py +++ b/src/training/train.py @@ -248,14 +248,14 @@ def train_one_epoch(model, data, loss, epoch, optimizer, scaler, scheduler, dist # end for -def evaluate(model, data, epoch, args, tb_writer=None): +def evaluate(model, data, epoch, args, tb_writer=None, tokenizer=None): metrics = {} if not is_master(args): return metrics device = torch.device(args.device) model.eval() - zero_shot_metrics = zero_shot_eval(model, data, epoch, args) + zero_shot_metrics = zero_shot_eval(model, data, epoch, args, tokenizer=tokenizer) metrics.update(zero_shot_metrics) autocast = get_autocast(args.precision) diff --git a/src/training/zero_shot.py b/src/training/zero_shot.py index 8265b424b..06ce7ac09 100644 --- a/src/training/zero_shot.py +++ b/src/training/zero_shot.py @@ -1,7 +1,6 @@ import logging import torch -import torch.nn.functional as F from tqdm import tqdm from open_clip import get_input_dtype, get_tokenizer, build_zero_shot_classifier, \ @@ -42,7 +41,7 @@ def run(model, classifier, dataloader, args): return top1, top5 -def zero_shot_eval(model, data, epoch, args): +def zero_shot_eval(model, data, epoch, args, tokenizer=None): if 'imagenet-val' not in data and 'imagenet-v2' not in data: return {} if args.zeroshot_frequency == 0: @@ -53,11 +52,12 @@ def zero_shot_eval(model, data, epoch, args): model = model.module logging.info('Starting zero-shot imagenet.') + if tokenizer is None: + tokenizer = get_tokenizer(args.model) logging.info('Building zero-shot classifier') autocast = get_autocast(args.precision) with autocast(): - tokenizer = get_tokenizer(args.model) classifier = build_zero_shot_classifier( model, tokenizer=tokenizer, diff --git a/tests/test_hf_model.py b/tests/test_hf_model.py index f9191f1f4..1deb00da8 100644 --- a/tests/test_hf_model.py +++ b/tests/test_hf_model.py @@ -21,7 +21,7 @@ def test_poolers(): def test_pretrained_text_encoder(model_id): bs, sl, d = 2, 10, 64 cfg = AutoConfig.from_pretrained(model_id) - model = HFTextEncoder(model_id, d, proj='linear') + model = HFTextEncoder(model_id, d, proj_type='linear') x = torch.randint(0, cfg.vocab_size, (bs, sl)) with torch.no_grad(): emb = model(x)
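

[Editor's note] A short usage sketch of the reworked tokenizer handling: context_length is now an attribute set on the tokenizer by the factory, so callers no longer hard-code 77 at the call site. Model names come from configs in this repo/patch; the 77- and 64-token context lengths assume the per-model defaults (CLIP BPE vs. the SigLIP sentencepiece setup added here) and are illustrative:

    import open_clip

    clip_tokenizer = open_clip.get_tokenizer('ViT-B-32')           # BPE SimpleTokenizer, 77-token context
    siglip_tokenizer = open_clip.get_tokenizer('ViT-B-16-SigLIP')  # SigLIP tokenizer, 64-token context per its config

    texts = ["a photo of a cat", "a photo of a dog"]
    print(clip_tokenizer(texts).shape)    # expected: torch.Size([2, 77])
    print(siglip_tokenizer(texts).shape)  # expected: torch.Size([2, 64])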
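

[Editor's note] The _expand_token helper introduced in the transformer.py hunk replaces the class_embedding + torch.zeros broadcast in VisionTransformer.forward and the per-batch repeat in TextTransformer; expand() returns a broadcast view rather than materializing a copy of the token per batch element, and the concatenated result is unchanged. A small self-contained check (shapes are arbitrary):

    import torch

    def _expand_token(token, batch_size: int):
        # broadcast view: no per-sample copy of the class token is allocated
        return token.view(1, 1, -1).expand(batch_size, -1, -1)

    width, batch, num_patches = 8, 4, 49
    cls_emb = torch.randn(width)
    old = cls_emb.reshape(1, 1, -1).repeat(batch, 1, 1)  # previous repeat-style copy
    new = _expand_token(cls_emb, batch)
    assert torch.equal(old, new)

    x = torch.randn(batch, num_patches, width)           # [*, grid ** 2, width]
    x = torch.cat([new.to(x.dtype), x], dim=1)           # [*, grid ** 2 + 1, width]
    assert x.shape == (batch, num_patches + 1, width)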