Add more unit tests for image processors

huggingface · Nov 9, 2023 · 26b7802 · 26b7802
1 parent dbf970f
commit 26b7802
Showing 1 changed file with 202 additions and 26 deletions.
diff --git a/tests/processors.test.js b/tests/processors.test.js
@@ -1,64 +1,240 @@
 
 import { env, AutoProcessor, RawImage } from '../src/transformers.js';
-import { m } from './init.js';
+import { m, MAX_TEST_EXECUTION_TIME } from './init.js';
 import { compare } from './test_utils.js';
 
 // Initialise the testing environment
 env.allowLocalModels = false;
 env.useFSCache = false;
 
-const sum = (array) => {
-    let sum = 0;
-    for (let i = 0; i < array.length; ++i) {
-        sum += array[i];
-    }
-    return sum;
+const avg = (array) => {
+    return Number(array.reduce((a, b) => a + b, array instanceof BigInt64Array ? 0n : 0)) / array.length;
 }
 
 describe('Processors', () => {
 
     describe('Image processors', () => {
 
-        const models = [
-            'caidas/swin2SR-classical-sr-x2-64',
-            'facebook/sam-vit-base',
-        ];
+        const IMAGE_CACHE = new Map();
+        const load_image = async (url) => {
+            const cached = IMAGE_CACHE.get(url);
+            if (cached) {
+                return cached;
+            }
+            const image = await RawImage.fromURL(url);
+            IMAGE_CACHE.set(url, image);
+            return image;
+        }
+
+        const MODELS = {
+            swin2sr: 'caidas/swin2SR-classical-sr-x2-64',
+            sam: 'facebook/sam-vit-base',
+            'donut-swin': 'naver-clova-ix/donut-base-finetuned-cord-v2',
+            resnet: 'microsoft/resnet-50',
+            vit: 'google/vit-base-patch16-224',
+            mobilevit: 'apple/mobilevit-small',
+            mobilevit_2: 'Xenova/quickdraw-mobilevit-small',
+            deit: 'facebook/deit-tiny-distilled-patch16-224',
+            beit: 'microsoft/beit-base-patch16-224-pt22k-ft22k',
+            detr: 'facebook/detr-resnet-50',
+            yolos: 'hustvl/yolos-small-300',
+        }
+
         const TEST_IMAGES = {
             pattern_3x3: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/pattern_3x3.png',
             checkerboard_8x8: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/checkerboard_8x8.png',
+            receipt: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/receipt.png',
+            tiger: 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/tiger.jpg',
+
+            // grayscale image
+            skateboard: 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/ml-web-games/skateboard.png',
         }
 
-        // do_pad=true, pad_size=8
-        it(models[0], async () => {
-            const processor = await AutoProcessor.from_pretrained(m(models[0]))
+        // Swin2SRImageProcessor
+        //  - tests when padding is a number (do_pad=true, pad_size=8)
+        it(MODELS.swin2sr, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.swin2sr))
 
             { // Pad to multiple of 8 (3x3 -> 8x8)
-                const image = await RawImage.fromURL(TEST_IMAGES.pattern_3x3);
+                const image = await load_image(TEST_IMAGES.pattern_3x3);
                 const { pixel_values } = await processor(image);
 
                 compare(pixel_values.dims, [1, 3, 8, 8]);
-                compare(sum(pixel_values.data), 104.80000066757202);
+                compare(avg(pixel_values.data), 0.5458333368102709);
             }
 
             { // Do not pad if already a multiple of 8 (8x8 -> 8x8)
-                const image = await RawImage.fromURL(TEST_IMAGES.checkerboard_8x8);
+                const image = await load_image(TEST_IMAGES.checkerboard_8x8);
                 const { pixel_values } = await processor(image);
                 compare(pixel_values.dims, [1, 3, 8, 8]);
-                compare(sum(pixel_values.data), 96);
+                compare(avg(pixel_values.data), 0.5);
             }
-        });
+        }, MAX_TEST_EXECUTION_TIME);
 
-        // do_pad=true, "pad_size": {"height": 1024,"width": 1024 },
-        it(models[1], async () => {
-            const processor = await AutoProcessor.from_pretrained(m(models[1]))
+        // SamProcessor/SamImageProcessor
+        //  - tests normal padding (do_pad=true, pad_size={"height":1024,"width":1024})
+        //  - In addition to the image, pass in a list of points
+        it(MODELS.sam, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.sam))
 
             { // Basic test
-                const image = await RawImage.fromURL(TEST_IMAGES.pattern_3x3);
+                const image = await load_image(TEST_IMAGES.pattern_3x3);
                 const { pixel_values } = await processor(image, [[[0, 0]]]);
                 compare(pixel_values.dims, [1, 3, 1024, 1024]);
-                compare(sum(pixel_values.data), -1417375.5943619595);
+                compare(avg(pixel_values.data), -0.4505715670146813);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // DonutProcessor/DonutFeatureExtractor
+        //  - tests thumbnail resizing (do_thumbnail=true, size=[960, 1280])
+        it(MODELS['donut-swin'], async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS['donut-swin']))
+
+            {
+                const image = await load_image(TEST_IMAGES.receipt);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 1280, 960]);
+                compare(avg(pixel_values.data), 0.1229388610053704);
+
+                compare(original_sizes, [[864, 576]]);
+                compare(reshaped_input_sizes, [[1280, 853]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // ConvNextFeatureExtractor
+        it(MODELS.resnet, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.resnet))
+
+            {
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 336]);
+                compare(avg(pixel_values.data), -0.27736667280600913);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 336]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // ViTFeatureExtractor
+        it(MODELS.vit, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.vit))
+
+            {
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 224]);
+                compare(avg(pixel_values.data), -0.22706867939852762);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 224]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // MobileViTFeatureExtractor
+        it(MODELS.mobilevit, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.mobilevit))
+
+            {
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 256, 256]);
+                compare(avg(pixel_values.data), 0.4599160496887033);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[256, 256]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // MobileViTFeatureExtractor
+        //  - tests not converting to rgb (do_convert_rgb=false)
+        it(MODELS.mobilevit_2, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.mobilevit_2))
+
+            { // Tests grayscale image
+                const image = await load_image(TEST_IMAGES.skateboard);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 1, 28, 28]);
+                compare(avg(pixel_values.data), 0.08558923671585128);
+
+                compare(original_sizes, [[28, 28]]);
+                compare(reshaped_input_sizes, [[28, 28]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // DeiTFeatureExtractor
+        it(MODELS.deit, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.deit))
+
+            { // Tests grayscale image
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 224]);
+                compare(avg(pixel_values.data), -0.2760336682859463);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 224]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+        // BeitFeatureExtractor
+        it(MODELS.beit, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.beit))
+
+            { // Tests grayscale image
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 224, 224]);
+                compare(avg(pixel_values.data), -0.22706867939852762);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[224, 224]]);
             }
-        });
+        }, MAX_TEST_EXECUTION_TIME);
 
-    })
+
+        // DetrFeatureExtractor
+        it(MODELS.detr, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.detr))
+
+            { // Tests grayscale image
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes, pixel_mask } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 888, 1333]);
+                compare(avg(pixel_values.data), -0.27840224131001773);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[888, 1333]]);
+
+                compare(pixel_mask.dims, [1, 64, 64]);
+                compare(avg(pixel_mask.data), 1);
+
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+
+
+        // YolosFeatureExtractor
+        it(MODELS.yolos, async () => {
+            const processor = await AutoProcessor.from_pretrained(m(MODELS.yolos))
+
+            { // Tests grayscale image
+                const image = await load_image(TEST_IMAGES.tiger);
+                const { pixel_values, original_sizes, reshaped_input_sizes } = await processor(image);
+
+                compare(pixel_values.dims, [1, 3, 888, 1333]);
+                compare(avg(pixel_values.data), -0.27840224131001773);
+
+                compare(original_sizes, [[408, 612]]);
+                compare(reshaped_input_sizes, [[888, 1333]]);
+            }
+        }, MAX_TEST_EXECUTION_TIME);
+    });
 });