diff --git a/requirements.txt b/requirements.txt index 94837f6..e21bbff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ segment_anything -supervision \ No newline at end of file +supervision +ultralytics diff --git a/sam_utils/autosam.py b/sam_utils/autosam.py new file mode 100644 index 0000000..3d8b938 --- /dev/null +++ b/sam_utils/autosam.py @@ -0,0 +1,284 @@ +from typing import List, Tuple, Union, Optional +import os +import glob +import copy +from PIL import Image +import numpy as np +import torch +from segment_anything.modeling import Sam +from thirdparty.fastsam import FastSAM +from thirdparty.sam_hq.automatic import SamAutomaticMaskGeneratorHQ +from sam_utils.segment import Segmentation +from sam_utils.logger import logger +from sam_utils.util import blend_image_and_seg + + +class AutoSAM: + """Automatic segmentation.""" + + def __init__(self, sam: Segmentation) -> None: + """AutoSAM initialization. + + Args: + sam (Segmentation): global Segmentation instance. + """ + self.sam = sam + self.auto_sam: Union[SamAutomaticMaskGeneratorHQ, FastSAM] = None + self.fastsam_conf = None + self.fastsam_iou = None + + + def auto_generate(self, img: np.ndarray) -> List[dict]: + """Generate segmentation. + + Args: + img (np.ndarray): input image. + + Returns: + List[dict]: list of segmentation masks. + """ + return self.auto_sam.generate(img) if type(self.auto_sam) == SamAutomaticMaskGeneratorHQ else \ + self.auto_sam(img, device=self.sam.sam_device, retina_masks=True, imgsz=1024, conf=self.fastsam_conf, iou=self.fastsam_iou) + + + def strengthen_semantic_seg(self, class_ids: np.ndarray, img: np.ndarray) -> np.ndarray: + # TODO: class_ids use multiple dimensions, categorical mask single and batch + logger.info("AutoSAM strengthening semantic segmentation") + from sam_utils.util import install_pycocotools + install_pycocotools() + import pycocotools.mask as maskUtils + semantc_mask = copy.deepcopy(class_ids) + annotations = self.auto_generate(img) + annotations = sorted(annotations, key=lambda x: x['area'], reverse=True) + logger.info(f"AutoSAM generated {len(annotations)} masks") + for ann in annotations: + valid_mask = torch.tensor(maskUtils.decode(ann['segmentation'])).bool() + propose_classes_ids = torch.tensor(class_ids[valid_mask]) + num_class_proposals = len(torch.unique(propose_classes_ids)) + if num_class_proposals == 1: + semantc_mask[valid_mask] = propose_classes_ids[0].numpy() + continue + top_1_propose_class_ids = torch.bincount(propose_classes_ids.flatten()).topk(1).indices + semantc_mask[valid_mask] = top_1_propose_class_ids.numpy() + logger.info("AutoSAM strengthening process end") + return semantc_mask + + + def random_segmentation(self, img: Image.Image) -> Tuple[List[Image.Image], str]: + """Random segmentation for EditAnything + + Args: + img (Image.Image): input image. + + Raises: + ModuleNotFoundError: ControlNet not installed. + + Returns: + Tuple[List[Image.Image], str]: List of 3 displayed images and output message. 
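+            The three images are the input blended with the segmentation, the random color segmentation itself, and the Edit-Anything control input.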
+ """ + logger.info("AutoSAM generating random segmentation for EditAnything") + img_np = np.array(img.convert("RGB")) + annotations = self.auto_generate(img_np) + logger.info(f"AutoSAM generated {len(annotations)} masks") + H, W, _ = img_np.shape + color_map = np.zeros((H, W, 3), dtype=np.uint8) + detected_map_tmp = np.zeros((H, W), dtype=np.uint16) + for idx, annotation in enumerate(annotations): + current_seg = annotation['segmentation'] + color_map[current_seg] = np.random.randint(0, 255, (3)) + detected_map_tmp[current_seg] = idx + 1 + detected_map = np.zeros((detected_map_tmp.shape[0], detected_map_tmp.shape[1], 3)) + detected_map[:, :, 0] = detected_map_tmp % 256 + detected_map[:, :, 1] = detected_map_tmp // 256 + try: + from scripts.processor import HWC3 + except: + raise ModuleNotFoundError("ControlNet extension not found.") + detected_map = HWC3(detected_map.astype(np.uint8)) + logger.info("AutoSAM generation process end") + return [blend_image_and_seg(img_np, color_map), Image.fromarray(color_map), Image.fromarray(detected_map)], \ + "Random segmentation done. Left above (0) is blended image, right above (1) is random segmentation, left below (2) is Edit-Anything control input." + + + def layout_single_image(self, input_image: Image.Image, output_path: str) -> None: + """Single image layout generation. + + Args: + input_image (Image.Image): input image. + output_path (str): output path. + """ + img_np = np.array(input_image.convert("RGB")) + annotations = self.auto_generate(img_np) + logger.info(f"AutoSAM generated {len(annotations)} annotations") + annotations = sorted(annotations, key=lambda x: x['area']) + for idx, annotation in enumerate(annotations): + img_tmp = np.zeros((img_np.shape[0], img_np.shape[1], 3)) + img_tmp[annotation['segmentation']] = img_np[annotation['segmentation']] + img_np[annotation['segmentation']] = np.array([0, 0, 0]) + img_tmp = Image.fromarray(img_tmp.astype(np.uint8)) + img_tmp.save(os.path.join(output_path, f"{idx}.png")) + img_np = Image.fromarray(img_np.astype(np.uint8)) + img_np.save(os.path.join(output_path, "leftover.png")) + + + def layout(self, input_image_or_path: Union[str, Image.Image], output_path: str) -> str: + """Single or bath layout generation. + + Args: + input_image_or_path (Union[str, Image.Image]): input imag or path. + output_path (str): output path. + + Returns: + str: generation message. + """ + if isinstance(input_image_or_path, str): + logger.info("Image layer division batch processing") + all_files = glob.glob(os.path.join(input_image_or_path, "*")) + for image_index, input_image_file in enumerate(all_files): + logger.info(f"Processing {image_index}/{len(all_files)} {input_image_file}") + try: + input_image = Image.open(input_image_file) + output_directory = os.path.join(output_path, os.path.splitext(os.path.basename(input_image_file))[0]) + from pathlib import Path + Path(output_directory).mkdir(exist_ok=True) + except: + logger.warn(f"File {input_image_file} not image, skipped.") + continue + self.layout_single_image(input_image, output_directory) + else: + self.layout_single_image(input_image_or_path, output_path) + return "Done" + + + def semantic_segmentation(self, input_image: Image.Image, annotator_name: str, processor_res: int, + use_pixel_perfect: bool, resize_mode: int, target_W: int, target_H: int) -> Tuple[List[Image.Image], str]: + """Semantic segmentation enhanced by segment anything. + + Args: + input_image (Image.Image): input image. + annotator_name (str): annotator name. 
Should be one of "seg_ufade20k"|"seg_ofade20k"|"seg_ofcoco". + processor_res (int): processor resolution. Support 64-2048. + use_pixel_perfect (bool): whether to use pixel perfect written by lllyasviel. + resize_mode (int): resize mode for pixel perfect, should be 0|1|2. + target_W (int): target width for pixel perfect. + target_H (int): target height for pixel perfect. + + Raises: + ModuleNotFoundError: ControlNet not installed. + + Returns: + Tuple[List[Image.Image], str]: list of 4 displayed images and message. + """ + assert input_image is not None, "No input image." + if "seg" in annotator_name: + try: + from scripts.processor import uniformer, oneformer_coco, oneformer_ade20k + from scripts.external_code import pixel_perfect_resolution + oneformers = { + "ade20k": oneformer_ade20k, + "coco": oneformer_coco + } + except: + raise ModuleNotFoundError("ControlNet extension not found.") + input_image_np = np.array(input_image) + if use_pixel_perfect: + processor_res = pixel_perfect_resolution(input_image_np, resize_mode, target_W, target_H) + logger.info("Generating semantic segmentation without SAM") + if annotator_name == "seg_ufade20k": + original_semantic = uniformer(input_image_np, processor_res) + else: + dataset = annotator_name.split('_')[-1][2:] + original_semantic = oneformers[dataset](input_image_np, processor_res) + logger.info("Generating semantic segmentation with SAM") + sam_semantic = self.strengthen_semantic_seg(np.array(original_semantic), input_image_np) + output_gallery = [original_semantic, sam_semantic, blend_image_and_seg(input_image, original_semantic), blend_image_and_seg(input_image, sam_semantic)] + return output_gallery, "Done. Left is segmentation before SAM, right is segmentation after SAM." + else: + return self.random_segmentation(input_image) + + + def categorical_mask_image(self, annotator_name: str, processor_res: int, category_input: List[int], input_image: Image.Image, + use_pixel_perfect: bool, resize_mode: int, target_W: int, target_H: int) -> Tuple[np.ndarray, Image.Image]: + """Single image categorical mask. + + Args: + annotator_name (str): annotator name. Should be one of "seg_ufade20k"|"seg_ofade20k"|"seg_ofcoco". + processor_res (int): processor resolution. Support 64-2048. + category_input (List[int]): category input. + input_image (Image.Image): input image. + use_pixel_perfect (bool): whether to use pixel perfect written by lllyasviel. + resize_mode (int): resize mode for pixel perfect, should be 0|1|2. + target_W (int): target width for pixel perfect. + target_H (int): target height for pixel perfect. + + Raises: + ModuleNotFoundError: ControlNet not installed. + AssertionError: Illegal class id. + + Returns: + Tuple[np.ndarray, Image.Image]: mask in resized shape and resized input image. + """ + assert input_image is not None, "No input image." + try: + from scripts.processor import uniformer, oneformer_coco, oneformer_ade20k + from scripts.external_code import pixel_perfect_resolution + oneformers = { + "ade20k": oneformer_ade20k, + "coco": oneformer_coco + } + except: + raise ModuleNotFoundError("ControlNet extension not found.") + filter_classes = category_input + assert len(filter_classes) > 0, "No class selected." + try: + filter_classes = [int(i) for i in filter_classes] + except: + raise AssertionError("Illegal class id. 
You may have input some string.") + input_image_np = np.array(input_image) + if use_pixel_perfect: + processor_res = pixel_perfect_resolution(input_image_np, resize_mode, target_W, target_H) + crop_input_image_copy = copy.deepcopy(input_image) + logger.info(f"Generating categories with processor {annotator_name}") + if annotator_name == "seg_ufade20k": + original_semantic = uniformer(input_image_np, processor_res) + else: + dataset = annotator_name.split('_')[-1][2:] + original_semantic = oneformers[dataset](input_image_np, processor_res) + sam_semantic = self.strengthen_semantic_seg(np.array(original_semantic), input_image_np) + mask = np.zeros(sam_semantic.shape, dtype=np.bool_) + from sam_utils.config import SEMANTIC_CATEGORIES + for i in filter_classes: + mask[np.equal(sam_semantic, SEMANTIC_CATEGORIES[annotator_name][i])] = True + return mask, crop_input_image_copy + + + def register( + self, + sam_model_name: str, + points_per_side: Optional[int] = 32, + points_per_batch: int = 64, + pred_iou_thresh: float = 0.88, + stability_score_thresh: float = 0.95, + stability_score_offset: float = 1.0, + box_nms_thresh: float = 0.7, + crop_n_layers: int = 0, + crop_nms_thresh: float = 0.7, + crop_overlap_ratio: float = 512 / 1500, + crop_n_points_downscale_factor: int = 1, + min_mask_region_area: int = 0, + output_mode: str = "binary_mask", + fastsam_conf: float = 0.4, + fastsam_iou: float = 0.9) -> None: + """Register AutoSAM module.""" + self.sam.load_sam_model(sam_model_name) + assert type(self.sam.sam_model) in [FastSAM, Sam], f"{sam_model_name} does not support auto segmentation." + if type(self.sam.sam_model) == FastSAM: + self.fastsam_conf = fastsam_conf + self.fastsam_iou = fastsam_iou + self.auto_sam = self.sam.sam_model + else: + self.auto_sam = SamAutomaticMaskGeneratorHQ( + self.sam.sam_model, points_per_side, points_per_batch, pred_iou_thresh, + stability_score_thresh, stability_score_offset, box_nms_thresh, + crop_n_layers, crop_nms_thresh, crop_overlap_ratio, crop_n_points_downscale_factor, None, + min_mask_region_area, output_mode) diff --git a/sam_utils/config.py b/sam_utils/config.py new file mode 100644 index 0000000..20aad98 --- /dev/null +++ b/sam_utils/config.py @@ -0,0 +1,339 @@ +ADE20K_150_CATEGORIES = [ + {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, + {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, + {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, + {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, + {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, + {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, + {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, + {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, + {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, + {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, + {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, + {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, + {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, + {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, + {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, + {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, + {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, + {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, + {"color": 
[255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, + {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, + {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, + {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, + {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, + {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, + {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, + {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, + {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, + {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, + {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, + {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, + {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, + {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, + {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, + {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, + {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, + {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, + {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, + {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, + {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, + {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, + {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, + {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, + {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, + {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, + { + "color": [6, 51, 255], + "id": 44, + "isthing": 1, + "name": "chest of drawers, chest, bureau, dresser", + }, + {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, + {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, + {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, + {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, + {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, + {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, + {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, + {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, + {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, + {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, + {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, + { + "color": [255, 71, 0], + "id": 56, + "isthing": 1, + "name": "pool table, billiard table, snooker table", + }, + {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, + {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, + {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, + {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, + {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, + {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, + {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, + {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, + { + "color": [0, 255, 133], + "id": 65, 
+ "isthing": 1, + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + }, + {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, + {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, + {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, + {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, + {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, + {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, + {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, + {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, + {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, + {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, + {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, + {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, + {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, + {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, + {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, + {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, + {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, + {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, + {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, + {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, + {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, + {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, + {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, + {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, + {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, + {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, + {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, + {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, + {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, + { + "color": [0, 122, 255], + "id": 95, + "isthing": 1, + "name": "bannister, banister, balustrade, balusters, handrail", + }, + { + "color": [0, 255, 163], + "id": 96, + "isthing": 0, + "name": "escalator, moving staircase, moving stairway", + }, + { + "color": [255, 153, 0], + "id": 97, + "isthing": 1, + "name": "ottoman, pouf, pouffe, puff, hassock", + }, + {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, + {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, + { + "color": [143, 255, 0], + "id": 100, + "isthing": 0, + "name": "poster, posting, placard, notice, bill, card", + }, + {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, + {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, + {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, + {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, + { + "color": [133, 0, 255], + "id": 105, + "isthing": 0, + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + }, + {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, + { + "color": [184, 0, 255], + "id": 107, + "isthing": 1, + "name": "washer, automatic washer, washing machine", + }, + {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, + {"color": [0, 184, 255], "id": 
109, "isthing": 0, "name": "pool"}, + {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, + {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, + {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, + {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, + {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, + {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, + {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, + {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, + {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, + {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, + {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, + {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, + {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, + {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, + {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, + {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, + {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, + {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, + {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, + {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, + {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, + {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, + {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, + {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, + {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, + {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, + {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, + {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"}, + {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, + {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, + {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, + {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, + {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, + {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, + {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, + {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, + {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, + {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, + {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, + {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, +] + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + 
{"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, 
+ {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, + {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, + {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, + {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, + {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, + {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, + {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, + {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, + {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, + {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, + {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, + {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, + {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, + {"color": [154, 208, 0], "isthing": 0, "id": 133, "name": "mirror-stuff"}, + {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, + {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, + {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, + {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, + {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, + {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, + {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, + {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, + {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, + {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, + {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, + {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, + {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, + {"color": [169, 164, 
131], "isthing": 0, "id": 166, "name": "tent"}, + {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, + {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, + {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, + {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, + {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, + {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, + {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, + {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, + {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, + {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, + {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, + {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, + {"color": [134, 199, 156], "isthing": 0, "id": 188, "name": "cabinet-merged"}, + {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, + {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, + {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, + {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, + {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, + {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, + {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, + {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, + {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, + {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, + {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, + {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"}, +] + +SEMANTIC_CATEGORIES = { + "seg_ufade20k": ADE20K_150_CATEGORIES, + "seg_ofade20k": ADE20K_150_CATEGORIES, + "seg_ofcoco": COCO_CATEGORIES, +} diff --git a/sam_utils/detect.py b/sam_utils/detect.py new file mode 100644 index 0000000..d509b96 --- /dev/null +++ b/sam_utils/detect.py @@ -0,0 +1,221 @@ +from typing import List +import os +from PIL import Image +import torch + +from modules import shared +from modules.devices import device +from sam_utils.logger import logger + + +class Detection: + """Detection related process. + """ + + def __init__(self) -> None: + """Initialize detection related process. + """ + self.dino_model = None + self.dino_model_type = "" + from scripts.sam_state import sam_extension_dir + self.dino_model_dir = os.path.join(sam_extension_dir, "models/grounding-dino") + self.dino_model_info = { + "GroundingDINO_SwinT_OGC (694MB)": { + "checkpoint": "groundingdino_swint_ogc.pth", + "config": os.path.join(self.dino_model_dir, "GroundingDINO_SwinT_OGC.py"), + "url": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", + }, + "GroundingDINO_SwinB (938MB)": { + "checkpoint": "groundingdino_swinb_cogcoor.pth", + "config": os.path.join(self.dino_model_dir, "GroundingDINO_SwinB.cfg.py"), + "url": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth" + }, + } + + + def _load_dino_model(self, dino_checkpoint: str, use_pip_dino: bool) -> None: + """Load GroundignDINO model to device. 
+ + Args: + dino_checkpoint (str): GroundingDINO checkpoint name. + use_pip_dino (bool): If True, use pip installed GroundingDINO. If False, use local GroundingDINO. + """ + logger.info(f"Initializing GroundingDINO {dino_checkpoint}") + if self.dino_model is None or dino_checkpoint != self.dino_model_type: + self.clear() + if use_pip_dino: + from groundingdino.models import build_model + from groundingdino.util.slconfig import SLConfig + from groundingdino.util.utils import clean_state_dict + else: + from thirdparty.groundingdino.models import build_model + from thirdparty.groundingdino.util.slconfig import SLConfig + from thirdparty.groundingdino.util.utils import clean_state_dict + args = SLConfig.fromfile(self.dino_model_info[dino_checkpoint]["config"]) + dino = build_model(args) + checkpoint = torch.hub.load_state_dict_from_url( + self.dino_model_info[dino_checkpoint]["url"], self.dino_model_dir) + dino.load_state_dict(clean_state_dict( + checkpoint['model']), strict=False) + dino.eval() + self.dino_model = dino + self.dino_model_type = dino_checkpoint + self.dino_model.to(device=device) + + + def _load_dino_image(self, image_pil: Image.Image, use_pip_dino: bool) -> torch.Tensor: + """Transform image to make the image applicable to GroundingDINO. + + Args: + image_pil (Image.Image): Input image in PIL format. + use_pip_dino (bool): If True, use pip installed GroundingDINO. If False, use local GroundingDINO. + + Returns: + torch.Tensor: Transformed image in torch.Tensor format. + """ + if use_pip_dino: + import groundingdino.datasets.transforms as T + else: + from thirdparty.groundingdino.datasets import transforms as T + transform = T.Compose( + [ + T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + image, _ = transform(image_pil, None) # 3, h, w + return image + + + def _get_grounding_output(self, image: torch.Tensor, caption: str, box_threshold: float) -> torch.Tensor: + """Inference GroundingDINO model. + + Args: + image (torch.Tensor): transformed input image. + caption (str): string caption. + box_threshold (float): bbox threshold. + + Returns: + torch.Tensor: generated bounding boxes. + """ + caption = caption.lower() + caption = caption.strip() + if not caption.endswith("."): + caption = caption + "." + image = image.to(device) + with torch.no_grad(): + outputs = self.dino_model(image[None], captions=[caption]) + if shared.cmd_opts.lowvram: + self.unload_model() + logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256) + boxes = outputs["pred_boxes"][0] # (nq, 4) + + # filter output + logits_filt = logits.clone() + boxes_filt = boxes.clone() + filt_mask = logits_filt.max(dim=1)[0] > box_threshold + logits_filt = logits_filt[filt_mask] # num_filt, 256 + boxes_filt = boxes_filt[filt_mask] # num_filt, 4 + + return boxes_filt.cpu() + + + def dino_predict(self, input_image: Image.Image, dino_model_name: str, text_prompt: str, box_threshold: float) -> List[List[float]]: + """Exposed API for GroundingDINO inference. + + Args: + input_image (Image.Image): input image. + dino_model_name (str): GroundingDINO model name. + text_prompt (str): string prompt. + box_threshold (float): bbox threshold. + + Returns: + List[List[float]]: generated N * xyxy bounding boxes. 
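+            Boxes are returned in absolute pixel coordinates of the input image.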
+ """ + from sam_utils.util import install_goundingdino + install_success = install_goundingdino() + logger.info("Running GroundingDINO Inference") + dino_image = self._load_dino_image(input_image.convert("RGB"), install_success) + self._load_dino_model(dino_model_name, install_success) + + boxes_filt = self._get_grounding_output( + dino_image, text_prompt, box_threshold + ) + + H, W = input_image.size[1], input_image.size[0] + for i in range(boxes_filt.size(0)): + boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) + boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 + boxes_filt[i][2:] += boxes_filt[i][:2] + return boxes_filt.tolist() + + + def check_yolo_availability(self) -> List[str]: + """Check if YOLO models are available. Do not check if YOLO not enabled. + + Returns: + List[str]: available YOLO models. + """ + if shared.opts.data.get("sam_use_local_yolo", False): + from modules.paths import models_path + sd_yolo_model_dir = os.path.join(models_path, "ultralytics") + return [name for name in os.listdir(sd_yolo_model_dir) if (".pth" in name or ".pt" in name)] + else: + return [] + + + def yolo_predict(self, input_image: Image.Image, yolo_model_name: str, conf=0.4) -> List[List[float]]: + """Run detection inference with models based on YOLO. + + Args: + input_image (np.ndarray): input image, expect shape HW3. + conf (float, optional): object confidence threshold. Defaults to 0.4. + + Raises: + RuntimeError: not getting any bbox. Might be caused by high conf or non-detection/segmentation model. + + Returns: + np.ndarray: generated N * xyxy bounding boxes. + """ + from ultralytics import YOLO + assert shared.opts.data.get("sam_use_yolo_models", False), "YOLO models are not enabled. Please enable in settings/Segment Anything." + logger.info("Loading YOLO model.") + from modules.paths import models_path + sd_yolo_model_dir = os.path.join(models_path, "ultralytics") + self.dino_model = YOLO(os.path.join(sd_yolo_model_dir, yolo_model_name)).to(device) + self.dino_model_type = yolo_model_name + logger.info("Running YOLO inference.") + pred = self.dino_model(input_image, conf=conf) + bboxes = pred[0].boxes.xyxy.cpu().numpy() + if bboxes.size == 0: + error_msg = "You are not getting any bbox. There are 2 possible reasons. "\ + "1. You set up a high conf which means that you should lower the conf. "\ + "2. You are using a non-detection model which means that you should check your model type." + raise RuntimeError(error_msg) + else: + return bboxes.tolist() + + + def __call__(self, input_image: Image.Image, model_name: str, text_prompt: str, box_threshold: float, conf=0.4) -> List[List[float]]: + """Exposed API for detection inference.""" + if model_name in self.dino_model_info.keys(): + return self.dino_predict(input_image, model_name, text_prompt, box_threshold) + elif model_name in self.check_yolo_availability(): + return self.yolo_predict(input_image, model_name, conf) + else: + raise ValueError(f"Detection model {model_name} not found.") + + + def clear(self) -> None: + """Clear detection model from any memory. + """ + del self.dino_model + self.dino_model = None + + + def unload_model(self) -> None: + """Unload detection model from GPU to CPU. + """ + if self.dino_model is not None: + self.dino_model.cpu() diff --git a/sam_utils/logger.py b/sam_utils/logger.py new file mode 100644 index 0000000..d986248 --- /dev/null +++ b/sam_utils/logger.py @@ -0,0 +1,42 @@ +# This code is copied from [@huchenlei](https://github.com/huchenlei), in case some users do not use ControlNet. 
+import logging +import copy +import sys + +from modules import shared + + +class ColoredFormatter(logging.Formatter): + COLORS = { + "DEBUG": "\033[0;36m", # CYAN + "INFO": "\033[0;32m", # GREEN + "WARNING": "\033[0;33m", # YELLOW + "ERROR": "\033[0;31m", # RED + "CRITICAL": "\033[0;37;41m", # WHITE ON RED + "RESET": "\033[0m", # RESET COLOR + } + + def format(self, record): + colored_record = copy.copy(record) + levelname = colored_record.levelname + seq = self.COLORS.get(levelname, self.COLORS["RESET"]) + colored_record.levelname = f"{seq}{levelname}{self.COLORS['RESET']}" + return super().format(colored_record) + + +# Create a new logger +logger = logging.getLogger("Segment Anything") +logger.propagate = False + +# Add handler if we don't have one. +if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter( + ColoredFormatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") + ) + logger.addHandler(handler) + +# Configure logger +loglevel_string = getattr(shared.cmd_opts, "sam_loglevel", "INFO") +loglevel = getattr(logging, loglevel_string.upper(), None) +logger.setLevel(loglevel) diff --git a/sam_utils/segment.py b/sam_utils/segment.py new file mode 100644 index 0000000..f7a7a89 --- /dev/null +++ b/sam_utils/segment.py @@ -0,0 +1,447 @@ +from typing import List +import gc +import os +import torch +import numpy as np +from ultralytics import YOLO +from thirdparty.fastsam import FastSAM, FastSAMPrompt +from modules import shared +from modules.safe import unsafe_torch_load, load +from modules.devices import get_device_for, cpu, torch_gc +from modules.paths import models_path +from scripts.sam_state import sam_extension_dir +from sam_utils.logger import logger +from sam_utils.util import ModelInfo +from thirdparty.sam_hq.build_sam_hq import sam_model_registry +from thirdparty.sam_hq.predictor import SamPredictorHQ +from thirdparty.mam.m2m import SamM2M + + +class Segmentation: + """Segmentation related process.""" + + def __init__(self) -> None: + """Initialize segmentation related process.""" + self.sam_model_info = { + "sam_vit_h_4b8939.pth" : ModelInfo("SAM", "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", "Meta", "2.56GB"), + "sam_vit_l_0b3195.pth" : ModelInfo("SAM", "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth", "Meta", "1.25GB"), + "sam_vit_b_01ec64.pth" : ModelInfo("SAM", "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth", "Meta", "375MB"), + "sam_hq_vit_h.pth" : ModelInfo("SAM-HQ", "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_h.pth", "SysCV", "2.57GB"), + "sam_hq_vit_l.pth" : ModelInfo("SAM-HQ", "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_l.pth", "SysCV", "1.25GB"), + "sam_hq_vit_b.pth" : ModelInfo("SAM-HQ", "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_b.pth", "SysCV", "379MB"), + "mobile_sam.pt" : ModelInfo("SAM", "https://github.com/ChaoningZhang/MobileSAM/raw/master/weights/mobile_sam.pt", "KHU", "39MB"), + "FastSAM-x.pt" : ModelInfo("SAM-YOLO", "https://huggingface.co/conrevo/SAM4WebUI-Extension-Models/resolve/main/FastSAM-x.pt", "CASIA-IVA-Lab", "138MB"), + } + self.check_model_availability() + self.sam_model = None + self.sam_model_name = "" + self.sam_model_wrapper = None + self.sam_device = get_device_for("sam") + self.sam_m2m = SamM2M() + + + def check_model_availability(self) -> List[str]: + """retrieve all models in all the model directories + + Returns: + List[str]: Model information displayed on the UI + """ + 
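+        # Scan the WebUI models/sam folder, this extension's models/sam folder, an optional user-configured "sam_model_path", and models/ultralytics when "sam_use_yolo_models" is enabled; files not in the support list are registered as YOLO models.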
user_sam_model_dir = shared.opts.data.get("sam_model_path", "") + sd_sam_model_dir = os.path.join(models_path, "sam") + scripts_sam_model_dir = os.path.join(sam_extension_dir, "models/sam") + sd_yolo_model_dir = os.path.join(models_path, "ultralytics") + sam_model_dirs = [sd_sam_model_dir, scripts_sam_model_dir] + if user_sam_model_dir != "": + sam_model_dirs.append(user_sam_model_dir) + if shared.opts.data.get("sam_use_yolo_models", False): + sam_model_dirs.append(sd_yolo_model_dir) + for dir in sam_model_dirs: + if os.path.isdir(dir): + # for each model inside sam_model_info, update its information + sam_model_names = [name for name in os.listdir(dir) if (".pth" in name or ".pt" in name)] + for name in sam_model_names: + if name in self.sam_model_info.keys(): + self.sam_model_info[name].local_path(os.path.join(dir, name)) + elif shared.opts.data.get("sam_use_yolo_models", False): + logger.warn(f"Model {name} not found in support list, default to use YOLO as initializer") + self.sam_model_info[name] = ModelInfo("YOLO", os.path.join(dir, name), "?", "?", "downloaded") + return [val.get_info(key) for key, val in self.sam_model_info.items()] + + + def load_sam_model(self, sam_checkpoint_name: str) -> None: + """Load segmentation model. + + Args: + sam_checkpoint_name (str): The model filename. Do not change. + + Raises: + RuntimeError: Model file not found in either support list or local model directory. + RuntimeError: Cannot automatically download model from remote server. + """ + sam_checkpoint_name = sam_checkpoint_name.split(" ")[0] + if self.sam_model is None or self.sam_model_name != sam_checkpoint_name: + if sam_checkpoint_name not in self.sam_model_info.keys(): + error_msg = f"{sam_checkpoint_name} not found and cannot be auto downloaded" + raise RuntimeError(error_msg) + if "http" in self.sam_model_info[sam_checkpoint_name].url: + sam_url = self.sam_model_info[sam_checkpoint_name].url + user_dir = shared.opts.data.get("sam_model_path", "") + sd_dir = os.path.join(models_path, "sam") + scripts_dir = os.path.join(sam_extension_dir, "models/sam") + sam_model_dir = user_dir if user_dir != "" else (sd_dir if os.path.isdir(sd_dir) else scripts_dir) + logger.info(f"Downloading segmentation model {sam_checkpoint_name} from {sam_url} to {sam_model_dir}") + try: + torch.hub.download_url_to_file(sam_url, sam_model_dir) + except: + error_msg = f"Cannot connect to {sam_url}. 
Set terminal proxy or download via browser to {sam_model_dir}" + logger.error(f"{error_msg}") + raise Exception(error_msg) + device_name = "GPU" if "cuda" in str(self.sam_device).lower() else "CPU" + logger.info(f"Initializing {sam_checkpoint_name} to {device_name}") + model_type = self.sam_model_info[sam_checkpoint_name].model_type + model_path = self.sam_model_info[sam_checkpoint_name].url + torch.load = unsafe_torch_load + if "YOLO" not in model_type: + logger.info(f"Loading SAM model from {model_path}") + self.sam_model = sam_model_registry[sam_checkpoint_name](checkpoint=model_path) + self.sam_model_wrapper = SamPredictorHQ(self.sam_model, 'HQ' in model_type) + elif "SAM" in model_type: + logger.info(f"Loading FastSAM model from {model_path}") + self.sam_model = FastSAM(model_path) + self.sam_model_wrapper = self.sam_model + elif shared.opts.data.get("sam_use_yolo_models", False): + logger.info(f"Loading YOLO model from {model_path}") + self.sam_model = YOLO(model_path) + self.sam_model_wrapper = self.sam_model + else: + error_msg = f"Unsupported model type {model_type}" + raise RuntimeError(error_msg) + self.sam_model_name = sam_checkpoint_name + torch.load = load + self.sam_model.to(self.sam_device) + + + def change_device(self, use_cpu: bool) -> None: + """Change the device of the segmentation model. + + Args: + use_cpu (bool): Whether to use CPU for SAM inference. + """ + self.sam_device = cpu if use_cpu else get_device_for("sam") + + + def sam_predict(self, + input_image: np.ndarray, + positive_points: List[List[int]]=None, + negative_points: List[List[int]]=None, + positive_bbox: List[List[float]]=None, + negative_bbox: List[List[float]]=None, + merge_positive=True, + multimask_output=True, + point_with_box=False, + use_mam=False, + mam_guidance_mode: str="mask") -> np.ndarray: + """Run segmentation inference with models based on segment anything. + + Args: + input_image (np.ndarray): input image, expect shape HW3. + positive_points (List[List[int]], optional): positive point prompts, expect N * xy. Defaults to None. + negative_points (List[List[int]], optional): negative point prompts, expect N * xy. Defaults to None. + positive_bbox (List[List[float]], optional): positive bbox prompts, expect N * xyxy. Defaults to None. + negative_bbox (List[List[float]], optional): negative bbox prompts, expect N * xyxy. Defaults to None. + merge_positive (bool, optional): OR all positive masks. Defaults to True. Valid only if point_with_box is False. + multimask_output (bool, optional): output 3 masks or not. Defaults to True. + point_with_box (bool, optional): always send bboxes and points to the model at the same time. Defaults to False. + use_mam (bool, optional): use Matting-Anything. Defaults to False. + mam_guidance_mode (str, optional): guidance model for Matting-Anything. Expect "mask" or "bbox". Defaults to "mask". Valid only if use_mam is True. + + Returns: + np.ndarray: mask output, expect shape 11HW or 31HW. + """ + has_points = positive_points is not None or negative_points is not None + has_bbox = positive_bbox is not None or negative_bbox is not None + assert has_points or has_bbox, "No input provided. Please provide at least one point or one bbox." + assert type(self.sam_model_wrapper) == SamPredictorHQ, "Incorrect SAM model wrapper. Expect SamPredictorHQ here." + mask_shape = ((3, 1) if multimask_output else (1, 1)) + input_image.shape[:2] + self.sam_model_wrapper.set_image(input_image) + + # If use Matting-Anything, load mam model. 
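+        # If the Matting-Anything (MAM) module cannot be loaded, quietly fall back to plain SAM masks.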
+ if use_mam: + try: + self.sam_m2m.load_m2m() + except: + use_mam = False + + # Matting-Anything inference for each SAM inference. + def _mam_infer(mask: np.ndarray, low_res_mask: np.ndarray) -> np.ndarray: + low_res_mask_logits = low_res_mask > self.sam_model_wrapper.model.mask_threshold + if use_mam: + mask = self.sam_m2m.forward( + self.sam_model_wrapper.features, torch.tensor(input_image), low_res_mask_logits, mask, + self.sam_model_wrapper.original_size, self.sam_model_wrapper.input_size, mam_guidance_mode) + return mask + + + # When bboxes and points are always sent to SAM at the same time. + if has_points and has_bbox and point_with_box: + logger.info(f"SAM {self.sam_model_name} inference with " + f"{len(positive_points)} positive points, {len(negative_points)} negative points, " + f"{len(positive_bbox)} positive bboxes, {len(negative_bbox)} negative bboxes. " + f"For each bbox, all point prompts are effective. Masks for each bbox will be merged.") + point_coords = np.array(positive_points + negative_points) + point_labels = np.array([1] * len(positive_points) + [0] * len(negative_points)) + point_labels_neg = np.array([0] * len(positive_points) + [1] * len(negative_points)) + + def _box_infer(_bbox, _point_labels): + _masks = [] + for box in _bbox: + mask, _, low_res_mask = self.sam_model_wrapper.predict( + point_coords=point_coords, + point_labels=_point_labels, + box=np.array(box), + multimask_output=multimask_output) + mask = mask[:, None, ...] + low_res_mask = low_res_mask[:, None, ...] + _masks.append(_mam_infer(mask, low_res_mask)) + return np.logical_or.reduce(_masks) + + mask_bbox_positive = _box_infer(positive_bbox, point_labels) if positive_bbox is not None else np.ones(shape=mask_shape, dtype=np.bool_) + mask_bbox_negative = _box_infer(negative_bbox, point_labels_neg) if negative_bbox is not None else np.zeros(shape=mask_shape, dtype=np.bool_) + + return mask_bbox_positive & ~mask_bbox_negative + + # When bbox inference is separated from point inference. + if has_points: + logger.info(f"SAM {self.sam_model_name} inference with " + f"{len(positive_points)} positive points, {len(negative_points)} negative points") + point_coords = np.array(positive_points + negative_points) + point_labels = np.array([1] * len(positive_points) + [0] * len(negative_points)) + mask_points_positive, _, low_res_masks_points_positive = self.sam_model_wrapper.predict( + point_coords=point_coords, + point_labels=point_labels, + box=None, + multimask_output=multimask_output) + mask_points_positive = mask_points_positive[:, None, ...] + low_res_masks_points_positive = low_res_masks_points_positive[:, None, ...]
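+            # predict() returns masks of shape (C, H, W); insert a channel axis so they match the documented 11HW/31HW layout before optional MAM refinement.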
+ mask_points_positive = _mam_infer(mask_points_positive, low_res_masks_points_positive) + else: + mask_points_positive = np.ones(shape=mask_shape, dtype=np.bool_) + + def _box_infer(_bbox, _character): + logger.info(f"SAM {self.sam_model_name} inference with {len(positive_bbox)} {_character} bboxes") + transformed_boxes = self.sam_model_wrapper.transform.apply_boxes_torch(torch.tensor(_bbox), input_image.shape[:2]) + mask, _, low_res_mask = self.sam_model_wrapper.predict_torch( + point_coords=None, + point_labels=None, + boxes=transformed_boxes.to(self.sam_device), + multimask_output=multimask_output) + mask = mask.permute(1, 0, 2, 3).cpu().numpy() + low_res_mask = low_res_mask.permute(1, 0, 2, 3).cpu().numpy() + return _mam_infer(mask, low_res_mask) + mask_bbox_positive = _box_infer(positive_bbox, "positive") if positive_bbox is not None else np.ones(shape=mask_shape, dtype=np.bool_) + mask_bbox_negative = _box_infer(negative_bbox, "negative") if negative_bbox is not None else np.zeros(shape=mask_shape, dtype=np.bool_) + + if merge_positive: + return (mask_points_positive | mask_bbox_positive) & ~mask_bbox_negative + else: + return (mask_points_positive & mask_bbox_positive) & ~mask_bbox_negative + + + def fastsam_predict(self, + input_image: np.ndarray, + positive_points: List[List[int]]=None, + negative_points: List[List[int]]=None, + positive_bbox: List[List[float]]=None, + negative_bbox: List[List[float]]=None, + positive_text: str="", + negative_text: str="", + merge_positive=True, + merge_negative=True, + conf=0.4, iou=0.9,) -> np.ndarray: + """Run segmentation inference with models based on FastSAM. (This is a special kind of YOLO model) + + Args: + input_image (np.ndarray): input image, expect shape HW3. + positive_points (List[List[int]], optional): positive point prompts, expect N * xy Defaults to None. + negative_points (List[List[int]], optional): negative point prompts, expect N * xy Defaults to None. + positive_bbox (List[List[float]], optional): positive bbox prompts, expect N * xyxy. Defaults to None. + negative_bbox (List[List[float]], optional): negative bbox prompts, expect N * xyxy. Defaults to None. + positive_text (str, optional): positive text prompts. Defaults to "". + negative_text (str, optional): negative text prompts. Defaults to "". + merge_positive (bool, optional): OR all positive masks. Defaults to True. + merge_negative (bool, optional): OR all negative masks. Defaults to True. + conf (float, optional): object confidence threshold. Defaults to 0.4. + iou (float, optional): iou threshold for filtering the annotations. Defaults to 0.9. + + Returns: + np.ndarray: mask output, expect shape 11HW. FastSAM does not support multi-mask selection. + """ + assert type(self.sam_model_wrapper) == FastSAM, "Incorrect SAM model wrapper. Expect FastSAM here." + logger.info(f"Running FastSAM {self.sam_model_name} inference.") + annotation = self.sam_model_wrapper( + input_image, device=self.sam_device, retina_masks=True, imgsz=1024, conf=conf, iou=iou) + has_points = positive_points is not None or negative_points is not None + has_bbox = positive_bbox is not None or negative_bbox is not None + has_text = positive_text != "" or negative_text != "" + assert has_points or has_bbox or has_text, "No input provided. Please provide at least one point or one bbox or one text." 
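+        # Resolve each prompt type (bbox, text, point) to a mask with FastSAMPrompt, then combine: positives merged per merge_positive, negatives per merge_negative, and negatives subtracted at the end.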
+        logger.info("Post-processing FastSAM inference.")
+        prompt_process = FastSAMPrompt(input_image, annotation, device=self.sam_device)
+        mask_shape = (1, 1) + input_image.shape[:2]
+        mask_bbox_positive = prompt_process.box_prompt(bboxes=positive_bbox) if positive_bbox is not None else np.ones(shape=mask_shape, dtype=np.bool_)
+        mask_bbox_negative = prompt_process.box_prompt(bboxes=negative_bbox) if negative_bbox is not None else np.zeros(shape=mask_shape, dtype=np.bool_)
+
+        mask_text_positive = prompt_process.text_prompt(text=positive_text) if positive_text != "" else np.ones(shape=mask_shape, dtype=np.bool_)
+        mask_text_negative = prompt_process.text_prompt(text=negative_text) if negative_text != "" else np.zeros(shape=mask_shape, dtype=np.bool_)
+
+        positive_points = positive_points if positive_points is not None else []
+        negative_points = negative_points if negative_points is not None else []
+        point_coords = np.array(positive_points + negative_points)
+        point_labels = np.array([1] * len(positive_points) + [0] * len(negative_points))
+        mask_points_positive = prompt_process.point_prompt(points=point_coords, pointlabel=point_labels) if has_points else np.ones(shape=mask_shape, dtype=np.bool_)
+
+        if merge_positive:
+            mask_positive = mask_bbox_positive | mask_text_positive | mask_points_positive
+        else:
+            mask_positive = mask_bbox_positive & mask_text_positive & mask_points_positive
+        if merge_negative:
+            mask_negative = mask_bbox_negative | mask_text_negative
+        else:
+            mask_negative = mask_bbox_negative & mask_text_negative
+        return mask_positive & ~mask_negative
+
+
+    def yolo_predict(self, input_image: np.ndarray, conf=0.4) -> np.ndarray:
+        """Run segmentation inference with models based on YOLO.
+
+        Args:
+            input_image (np.ndarray): input image, expect shape HW3.
+            conf (float, optional): object confidence threshold. Defaults to 0.4.
+
+        Raises:
+            RuntimeError: not getting any bbox. Might be caused by high conf or non-detection/segmentation model.
+
+        Returns:
+            np.ndarray: mask output, expect shape 11HW. YOLO does not support multi-mask selection.
+        """
+        assert shared.opts.data.get("sam_use_yolo_models", False), "YOLO models are not enabled. Please enable them in Settings/Segment Anything."
+        assert type(self.sam_model_wrapper) == YOLO, "Incorrect SAM model wrapper. Expect YOLO here."
+        logger.info("Running YOLO inference.")
+        pred = self.sam_model_wrapper(input_image, conf=conf)
+        bboxes = pred[0].boxes.xyxy.cpu().numpy()
+        if bboxes.size == 0:
+            error_msg = "You are not getting any bbox. There are 2 possible reasons. "\
+                "1. Your conf threshold is too high, in which case you should lower it. "\
+                "2. You are not using a detection/segmentation model, in which case you should check your model type."
+            logger.error(error_msg)
+            raise RuntimeError(error_msg)
+
+        if pred[0].masks is None:
+            logger.warn("You are not using a segmentation model. Will use bbox to create masks.")
+            masks = []
+            mask_shape = (1, 1) + input_image.shape[:2]
+            for bbox in bboxes:
+                mask = np.zeros(mask_shape, dtype=bool)
+                x1, y1, x2, y2 = bbox.astype(int)
+                mask[:, :, y1:y2+1, x1:x2+1] = True
+                masks.append(mask)
+            return np.logical_or.reduce(masks, axis=0)
+        else:
+            # Keep the documented 11HW layout: merge all instance masks, then restore the two leading axes.
+            return np.logical_or.reduce(pred[0].masks.data.cpu().numpy(), axis=0)[None, None, ...]
+
+
+    def clear(self):
+        """Clear segmentation model from CPU & GPU."""
+        del self.sam_model
+        self.sam_model = None
+        self.sam_model_name = ""
+        self.sam_model_wrapper = None
+        self.sam_m2m.clear()
+
+
+    def unload_model(self):
+        """Move all segmentation models to CPU."""
+        if self.sam_model is not None:
+            self.sam_model.cpu()
+        self.sam_m2m.unload_model()
+
+
+    def __call__(self,
+                 sam_checkpoint_name: str,
+                 input_image: np.ndarray,
+                 positive_points: List[List[int]]=None,
+                 negative_points: List[List[int]]=None,
+                 positive_bbox: List[List[float]]=None,
+                 negative_bbox: List[List[float]]=None,
+                 positive_text: str="",
+                 negative_text: str="",
+                 merge_positive=True,
+                 merge_negative=True,
+                 multimask_output=True,
+                 point_with_box=False,
+                 use_mam=False,
+                 mam_guidance_mode: str="mask",
+                 conf=0.4, iou=0.9,) -> np.ndarray:
+                 # use_cpu: bool=False,) -> np.ndarray:
+        """Entry for segmentation inference. Load model, run inference, unload model if lowvram.
+
+        Args:
+            sam_checkpoint_name (str): The model filename. Do not change.
+            input_image (np.ndarray): input image, expect shape HW3.
+            positive_points (List[List[int]], optional): positive point prompts, expect N * xy. Defaults to None. Valid for SAM & FastSAM.
+            negative_points (List[List[int]], optional): negative point prompts, expect N * xy. Defaults to None. Valid for SAM & FastSAM.
+            positive_bbox (List[List[float]], optional): positive bbox prompts, expect N * xyxy. Defaults to None. Valid for SAM & FastSAM.
+            negative_bbox (List[List[float]], optional): negative bbox prompts, expect N * xyxy. Defaults to None. Valid for SAM & FastSAM.
+            positive_text (str, optional): positive text prompts. Defaults to "". Valid for FastSAM.
+            negative_text (str, optional): negative text prompts. Defaults to "". Valid for FastSAM.
+            merge_positive (bool, optional): OR all positive masks. Defaults to True. Valid for SAM (point_with_box is True) & FastSAM.
+            merge_negative (bool, optional): OR all negative masks. Defaults to True. Valid for FastSAM.
+            multimask_output (bool, optional): output 3 masks or not. Defaults to True. Valid for SAM.
+            point_with_box (bool, optional): always send bboxes and points to the model at the same time. Defaults to False. Valid for SAM.
+            use_mam (bool, optional): use Matting-Anything. Defaults to False. Valid for SAM.
+            mam_guidance_mode (str, optional): guidance mode for Matting-Anything. Expect "mask" or "bbox". Defaults to "mask". Valid for SAM when use_mam is True.
+            conf (float, optional): object confidence threshold. Defaults to 0.4. Valid for FastSAM & YOLO.
+            iou (float, optional): iou threshold for filtering the annotations. Defaults to 0.9. Valid for FastSAM.
+            use_cpu (bool, optional): use CPU for SAM inference. Defaults to False.
+ + Returns: + np.ndarray: _description_ + """ + # self.change_device(use_cpu) + self.load_sam_model(sam_checkpoint_name) + if type(self.sam_model_wrapper) == SamPredictorHQ: + masks = self.sam_predict( + input_image=input_image, + positive_points=positive_points, + negative_points=negative_points, + positive_bbox=positive_bbox, + negative_bbox=negative_bbox, + merge_positive=merge_positive, + multimask_output=multimask_output, + point_with_box=point_with_box, + use_mam=use_mam, + mam_guidance_mode=mam_guidance_mode) + elif type(self.sam_model_wrapper) == FastSAM: + masks = self.fastsam_predict( + input_image=input_image, + positive_points=positive_points, + negative_points=negative_points, + positive_bbox=positive_bbox, + negative_bbox=negative_bbox, + positive_text=positive_text, + negative_text=negative_text, + merge_positive=merge_positive, + merge_negative=merge_negative, + conf=conf, iou=iou) + else: + masks = self.yolo_predict( + input_image=input_image, + conf=conf) + if shared.cmd_opts.lowvram: + self.unload_model() + gc.collect() + torch_gc() + return masks + + +# how to use yolo for auto +# category name dropdown and dynamic ui +# yolo model for segmentation and detection +# zoom in, unify box+point +# make masks smaller diff --git a/sam_utils/util.py b/sam_utils/util.py new file mode 100644 index 0000000..629218d --- /dev/null +++ b/sam_utils/util.py @@ -0,0 +1,204 @@ +from typing import Tuple, List +import os +import numpy as np +import cv2 +import copy +from PIL import Image + + +class ModelInfo: + + def __init__(self, model_type: str, url: str, author: str, size: str, download_info: str="auto download"): + self.model_type = model_type + self.url = url + self.author = author + self.size = size + self.download_info = download_info + + + def get_info(self, model_name: str): + return f"{model_name} ({self.size}, {self.author}, {self.model_type}, {self.download_info})" + + + def local_path(self, path: str): + self.url = path + self.download_info = "downloaded" + + +def show_boxes(image_np: np.ndarray, boxes: np.ndarray, color=(255, 0, 0, 255), thickness=2, show_index=False) -> np.ndarray: + if boxes is None: + return image_np + image = copy.deepcopy(image_np) + for idx, box in enumerate(boxes): + x, y, w, h = box + cv2.rectangle(image, (x, y), (w, h), color, thickness) + if show_index: + font = cv2.FONT_HERSHEY_SIMPLEX + text = str(idx) + textsize = cv2.getTextSize(text, font, 1, 2)[0] + cv2.putText(image, text, (x, y+textsize[1]), font, 1, color, thickness) + return image + + +def show_masks(image_np: np.ndarray, masks: np.ndarray, alpha=0.5) -> np.ndarray: + image = copy.deepcopy(image_np) + np.random.seed(0) + for mask in masks: + color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) + image[mask] = image[mask] * (1 - alpha) + 255 * color.reshape(1, 1, -1) * alpha + return image.astype(np.uint8) + + +def dilate_mask(mask: np.ndarray, dilation_amt: int) -> Tuple[Image.Image, np.ndarray]: + from scipy.ndimage import binary_dilation + x, y = np.meshgrid(np.arange(dilation_amt), np.arange(dilation_amt)) + center = dilation_amt // 2 + dilation_kernel = ((x - center)**2 + (y - center)**2 <= center**2).astype(np.uint8) + dilated_binary_img = binary_dilation(mask, dilation_kernel) + dilated_mask = Image.fromarray(dilated_binary_img.astype(np.uint8) * 255) + return dilated_mask, dilated_binary_img + + +def update_mask(mask_gallery, chosen_mask: int, dilation_amt: float, input_image: Image.Image): + if isinstance(mask_gallery, list): + mask_image = 
Image.open(mask_gallery[chosen_mask + 3]['name']) + else: + mask_image = mask_gallery + binary_img = np.array(mask_image.convert('1')) + if dilation_amt: + mask_image, binary_img = dilate_mask(binary_img, dilation_amt) + blended_image = Image.fromarray(show_masks(np.array(input_image), binary_img.astype(np.bool_)[None, ...])) + matted_image = np.array(input_image) + matted_image[~binary_img] = np.array([0, 0, 0, 0]) + return [blended_image, mask_image, Image.fromarray(matted_image)] + + +def create_mask_output(image_np: np.ndarray, masks: np.ndarray, boxes_filt: np.ndarray) -> List[Image.Image]: + mask_images, masks_gallery, matted_images = [], [], [] + boxes_filt = boxes_filt.astype(int) if boxes_filt is not None else None + for mask in masks: + masks_gallery.append(Image.fromarray(np.any(mask, axis=0))) + blended_image = show_masks(show_boxes(image_np, boxes_filt), mask) + mask_images.append(Image.fromarray(blended_image)) + image_np_copy = copy.deepcopy(image_np) + image_np_copy[~np.any(mask, axis=0)] = np.array([0, 0, 0, 0]) + matted_images.append(Image.fromarray(image_np_copy)) + return mask_images + masks_gallery + matted_images + + +def create_mask_batch_output( + input_image_filename: str, dest_dir: str, + image_np: np.ndarray, masks: np.ndarray, boxes_filt: np.ndarray, dilation_amt: float, + save_image: bool, save_mask: bool, save_background: bool, save_image_with_mask: bool): + filename, ext = os.path.splitext(os.path.basename(input_image_filename)) + ext = ".png" # JPEG not compatible with RGBA + for idx, mask in enumerate(masks): + blended_image = show_masks(show_boxes(image_np, boxes_filt), mask) + merged_mask = np.any(mask, axis=0) + if save_background: + merged_mask = ~merged_mask + if dilation_amt: + _, merged_mask = dilate_mask(merged_mask, dilation_amt) + image_np_copy = copy.deepcopy(image_np) + image_np_copy[~merged_mask] = np.array([0, 0, 0, 0]) + if save_image: + output_image = Image.fromarray(image_np_copy) + output_image.save(os.path.join(dest_dir, f"{filename}_{idx}_output{ext}")) + if save_mask: + output_mask = Image.fromarray(merged_mask) + output_mask.save(os.path.join(dest_dir, f"{filename}_{idx}_mask{ext}")) + if save_image_with_mask: + output_blend = Image.fromarray(blended_image) + output_blend.save(os.path.join(dest_dir, f"{filename}_{idx}_blend{ext}")) + + +def blend_image_and_seg(image: np.ndarray, seg: np.ndarray, alpha=0.5) -> Image.Image: + image_blend = image * (1 - alpha) + np.array(seg) * alpha + return Image.fromarray(image_blend.astype(np.uint8)) + + +def install_pycocotools(): + # install pycocotools if needed + from sam_utils.logger import logger + try: + import pycocotools.mask as maskUtils + except: + logger.warn("pycocotools not found, will try installing C++ based pycocotools") + try: + from launch import run_pip + run_pip(f"install pycocotools", f"AutoSAM requirement: pycocotools") + import pycocotools.mask as maskUtils + except: + import traceback + traceback.print_exc() + import sys + if sys.platform == "win32": + logger.warn("Unable to install pycocotools, will try installing pycocotools-windows") + try: + run_pip("install pycocotools-windows", "AutoSAM requirement: pycocotools-windows") + import pycocotools.mask as maskUtils + except: + error_msg = "Unable to install pycocotools-windows" + logger.error(error_msg) + traceback.print_exc() + raise RuntimeError(error_msg) + else: + error_msg = "Unable to install pycocotools" + logger.error(error_msg) + traceback.print_exc() + raise RuntimeError(error_msg) + + +def install_goundingdino() -> 
bool: + """Automatically install GroundingDINO. + + Returns: + bool: False if use local GroundingDINO, True if use pip installed GroundingDINO. + """ + from sam_utils.logger import logger + from modules import shared + dino_install_issue_text = "Please permanently switch to local GroundingDINO on Settings/Segment Anything or submit an issue to https://github.com/IDEA-Research/Grounded-Segment-Anything/issues." + if shared.opts.data.get("sam_use_local_groundingdino", False): + logger.info("Using local groundingdino.") + return False + + def verify_dll(install_local=True): + try: + from groundingdino import _C + logger.info("GroundingDINO dynamic library have been successfully built.") + return True + except Exception: + import traceback + traceback.print_exc() + def run_pip_uninstall(command, desc=None): + from launch import python, run + default_command_live = (os.environ.get('WEBUI_LAUNCH_LIVE_OUTPUT') == "1") + return run(f'"{python}" -m pip uninstall -y {command}', desc=f"Uninstalling {desc}", errdesc=f"Couldn't uninstall {desc}", live=default_command_live) + if install_local: + logger.warn(f"Failed to build dymanic library. Will uninstall GroundingDINO from pip and fall back to local GroundingDINO this time. {dino_install_issue_text}") + run_pip_uninstall( + f"groundingdino", + f"sd-webui-segment-anything requirement: groundingdino") + else: + logger.warn(f"Failed to build dymanic library. Will uninstall GroundingDINO from pip and re-try installing from GitHub source code. {dino_install_issue_text}") + run_pip_uninstall( + f"uninstall groundingdino", + f"sd-webui-segment-anything requirement: groundingdino") + return False + + import launch + if launch.is_installed("groundingdino"): + logger.info("Found GroundingDINO in pip. Verifying if dynamic library build success.") + if verify_dll(install_local=False): + return True + try: + launch.run_pip( + f"install git+https://github.com/IDEA-Research/GroundingDINO", + f"sd-webui-segment-anything requirement: groundingdino") + logger.info("GroundingDINO install success. Verifying if dynamic library build success.") + return verify_dll() + except Exception: + import traceback + traceback.print_exc() + logger.warn(f"GroundingDINO install failed. Will fall back to local groundingdino this time. 
{dino_install_issue_text}") + return False diff --git a/scripts/auto.py b/scripts/auto.py deleted file mode 100644 index 088ab13..0000000 --- a/scripts/auto.py +++ /dev/null @@ -1,334 +0,0 @@ -import os -import gc -import glob -import copy -from PIL import Image -from collections import OrderedDict -import numpy as np -import torch -import cv2 -from sam_hq.automatic import SamAutomaticMaskGeneratorHQ -from modules import scripts, shared -from modules.paths import extensions_dir -from modules.devices import torch_gc - - -global_sam: SamAutomaticMaskGeneratorHQ = None -sem_seg_cache = OrderedDict() -sam_annotator_dir = os.path.join(scripts.basedir(), "annotator") -original_uniformer_inference_segmentor = None - - -def pad64(x): - return int(np.ceil(float(x) / 64.0) * 64 - x) - - -def safer_memory(x): - # Fix many MAC/AMD problems - return np.ascontiguousarray(x.copy()).copy() - - -def resize_image_with_pad(input_image, resolution): - from annotator.util import HWC3 - img = HWC3(input_image) - H_raw, W_raw, _ = img.shape - k = float(resolution) / float(min(H_raw, W_raw)) - interpolation = cv2.INTER_CUBIC if k > 1 else cv2.INTER_AREA - H_target = int(np.round(float(H_raw) * k)) - W_target = int(np.round(float(W_raw) * k)) - img = cv2.resize(img, (W_target, H_target), interpolation=interpolation) - H_pad, W_pad = pad64(H_target), pad64(W_target) - img_padded = np.pad(img, [[0, H_pad], [0, W_pad], [0, 0]], mode='edge') - - def remove_pad(x): - return safer_memory(x[:H_target, :W_target]) - - return safer_memory(img_padded), remove_pad - - -def blend_image_and_seg(image, seg, alpha=0.5): - image_blend = image * (1 - alpha) + np.array(seg) * alpha - return Image.fromarray(image_blend.astype(np.uint8)) - - -def create_symbolic_link(): - cnet_annotator_dir = os.path.join(extensions_dir, "sd-webui-controlnet/annotator") - if os.path.isdir(cnet_annotator_dir): - if not os.path.isdir(sam_annotator_dir): - os.symlink(cnet_annotator_dir, sam_annotator_dir, target_is_directory=True) - return True - return False - - -def clear_sem_sam_cache(): - sem_seg_cache.clear() - gc.collect() - torch_gc() - - -def sem_sam_garbage_collect(): - if shared.cmd_opts.lowvram: - for model_key, model in sem_seg_cache: - if model_key == "uniformer": - from annotator.uniformer import unload_uniformer_model - unload_uniformer_model() - else: - model.unload_model() - gc.collect() - torch_gc() - - -def strengthen_sem_seg(class_ids, img): - print("Auto SAM strengthening semantic segmentation") - import pycocotools.mask as maskUtils - semantc_mask = copy.deepcopy(class_ids) - annotations = global_sam.generate(img) - annotations = sorted(annotations, key=lambda x: x['area'], reverse=True) - print(f"Auto SAM generated {len(annotations)} masks") - for ann in annotations: - valid_mask = torch.tensor(maskUtils.decode(ann['segmentation'])).bool() - propose_classes_ids = torch.tensor(class_ids[valid_mask]) - num_class_proposals = len(torch.unique(propose_classes_ids)) - if num_class_proposals == 1: - semantc_mask[valid_mask] = propose_classes_ids[0].numpy() - continue - top_1_propose_class_ids = torch.bincount(propose_classes_ids.flatten()).topk(1).indices - semantc_mask[valid_mask] = top_1_propose_class_ids.numpy() - print("Auto SAM strengthen process end") - return semantc_mask - - -def random_segmentation(img): - print("Auto SAM generating random segmentation for Edit-Anything") - img_np = np.array(img.convert("RGB")) - annotations = global_sam.generate(img_np) - # annotations = sorted(annotations, key=lambda x: x['area'], 
reverse=True) - print(f"Auto SAM generated {len(annotations)} masks") - H, W, _ = img_np.shape - color_map = np.zeros((H, W, 3), dtype=np.uint8) - detected_map_tmp = np.zeros((H, W), dtype=np.uint16) - for idx, annotation in enumerate(annotations): - current_seg = annotation['segmentation'] - color_map[current_seg] = np.random.randint(0, 255, (3)) - detected_map_tmp[current_seg] = idx + 1 - detected_map = np.zeros((detected_map_tmp.shape[0], detected_map_tmp.shape[1], 3)) - detected_map[:, :, 0] = detected_map_tmp % 256 - detected_map[:, :, 1] = detected_map_tmp // 256 - from annotator.util import HWC3 - detected_map = HWC3(detected_map.astype(np.uint8)) - print("Auto SAM generation process end") - return [blend_image_and_seg(img_np, color_map), Image.fromarray(color_map), Image.fromarray(detected_map)], \ - "Random segmentation done. Left above (0) is blended image, right above (1) is random segmentation, left below (2) is Edit-Anything control input." - - -def image_layer_image(layout_input_image, layout_output_path): - img_np = np.array(layout_input_image.convert("RGB")) - annotations = global_sam.generate(img_np) - print(f"AutoSAM generated {len(annotations)} annotations") - annotations = sorted(annotations, key=lambda x: x['area']) - for idx, annotation in enumerate(annotations): - img_tmp = np.zeros((img_np.shape[0], img_np.shape[1], 3)) - img_tmp[annotation['segmentation']] = img_np[annotation['segmentation']] - img_np[annotation['segmentation']] = np.array([0, 0, 0]) - img_tmp = Image.fromarray(img_tmp.astype(np.uint8)) - img_tmp.save(os.path.join(layout_output_path, f"{idx}.png")) - img_np = Image.fromarray(img_np.astype(np.uint8)) - img_np.save(os.path.join(layout_output_path, "leftover.png")) - - -def image_layer_internal(layout_input_image_or_path, layout_output_path): - if isinstance(layout_input_image_or_path, str): - print("Image layer division batch processing") - all_files = glob.glob(os.path.join(layout_input_image_or_path, "*")) - for image_index, input_image_file in enumerate(all_files): - print(f"Processing {image_index}/{len(all_files)} {input_image_file}") - try: - input_image = Image.open(input_image_file) - output_directory = os.path.join(layout_output_path, os.path.splitext(os.path.basename(input_image_file))[0]) - from pathlib import Path - Path(output_directory).mkdir(exist_ok=True) - except: - print(f"File {input_image_file} not image, skipped.") - continue - image_layer_image(input_image, output_directory) - else: - image_layer_image(layout_input_image_or_path, layout_output_path) - return "Done" - - -def inject_uniformer_inference_segmentor(model, img): - original_result = original_uniformer_inference_segmentor(model, img) - original_result[0] = strengthen_sem_seg(original_result[0], img) - return original_result - - -def inject_uniformer_show_result_pyplot(model, img, result, palette=None, fig_size=(15, 10), opacity=0.5, title='', block=True): - return result[0] - - -def inject_oneformer_semantic_run(img, predictor, metadata): - predictions = predictor(img[:, :, ::-1], "semantic") # Predictor of OneFormer must use BGR image !!! 
- original_result = predictions["sem_seg"].argmax(dim=0).cpu().numpy() - from annotator.oneformer.oneformer.demo.visualizer import Visualizer, ColorMode - visualizer_map = Visualizer(img, is_img=False, metadata=metadata, instance_mode=ColorMode.IMAGE) - out_map = visualizer_map.draw_sem_seg(torch.tensor(strengthen_sem_seg(original_result, img), device='cpu'), alpha=1, is_text=False).get_image() - return out_map - - -def inject_oneformer_semantic_run_categorical_mask(img, predictor, metadata): - predictions = predictor(img[:, :, ::-1], "semantic") # Predictor of OneFormer must use BGR image !!! - original_result = predictions["sem_seg"].argmax(dim=0).cpu().numpy() - return strengthen_sem_seg(original_result, img) - - -def inject_oneformer_detector_call(self, img): - if self.model is None: - self.load_model() - self.model.model.to(self.device) - return inject_oneformer_semantic_run(img, self.model, self.metadata) - - -def inject_oneformer_detector_call_categorical_mask(self, img): - if self.model is None: - self.load_model() - self.model.model.to(self.device) - return inject_oneformer_semantic_run_categorical_mask(img, self.model, self.metadata) - - -def _uniformer(img): - if "uniformer" not in sem_seg_cache: - from annotator.uniformer import apply_uniformer - sem_seg_cache["uniformer"] = apply_uniformer - result = sem_seg_cache["uniformer"](img) - return result - - -def _oneformer(img, dataset="coco"): - oneformer_key = f"oneformer_{dataset}" - if oneformer_key not in sem_seg_cache: - from annotator.oneformer import OneformerDetector - sem_seg_cache[oneformer_key] = OneformerDetector(OneformerDetector.configs[dataset]) - result = sem_seg_cache[oneformer_key](img) - return result - - -def semantic_segmentation(input_image, annotator_name, processor_res, - use_pixel_perfect, resize_mode, target_W, target_H): - if input_image is None: - return [], "No input image." - if "seg" in annotator_name: - if not os.path.isdir(os.path.join(scripts.basedir(), "annotator")) and not create_symbolic_link(): - return [], "ControlNet extension not found." - global original_uniformer_inference_segmentor - input_image_np = np.array(input_image) - processor_res = pixel_perfect_lllyasviel(input_image_np, processor_res, use_pixel_perfect, resize_mode, target_W, target_H) - input_image, remove_pad = resize_image_with_pad(input_image_np, processor_res) - print("Generating semantic segmentation without SAM") - if annotator_name == "seg_ufade20k": - original_semseg = _uniformer(input_image) - print("Generating semantic segmentation with SAM") - import annotator.uniformer as uniformer - original_uniformer_inference_segmentor = uniformer.inference_segmentor - uniformer.inference_segmentor = inject_uniformer_inference_segmentor - sam_semseg = _uniformer(input_image) - uniformer.inference_segmentor = original_uniformer_inference_segmentor - original_semseg = remove_pad(original_semseg) - sam_semseg = remove_pad(sam_semseg) - input_image = remove_pad(input_image) - output_gallery = [original_semseg, sam_semseg, blend_image_and_seg(input_image, original_semseg), blend_image_and_seg(input_image, sam_semseg)] - return output_gallery, "Uniformer semantic segmentation of ade20k done. Left is segmentation before SAM, right is segmentation after SAM." 
- else: - dataset = annotator_name.split('_')[-1][2:] - original_semseg = _oneformer(input_image, dataset=dataset) - print("Generating semantic segmentation with SAM") - from annotator.oneformer import OneformerDetector - original_oneformer_call = OneformerDetector.__call__ - OneformerDetector.__call__ = inject_oneformer_detector_call - sam_semseg = _oneformer(input_image, dataset=dataset) - OneformerDetector.__call__ = original_oneformer_call - original_semseg = remove_pad(original_semseg) - sam_semseg = remove_pad(sam_semseg) - input_image = remove_pad(input_image) - output_gallery = [original_semseg, sam_semseg, blend_image_and_seg(input_image, original_semseg), blend_image_and_seg(input_image, sam_semseg)] - return output_gallery, f"Oneformer semantic segmentation of {dataset} done. Left is segmentation before SAM, right is segmentation after SAM." - else: - return random_segmentation(input_image) - - -def categorical_mask_image(crop_processor, crop_processor_res, crop_category_input, crop_input_image, - crop_pixel_perfect, crop_resize_mode, target_W, target_H): - if crop_input_image is None: - return "No input image." - if not os.path.isdir(os.path.join(scripts.basedir(), "annotator")) and not create_symbolic_link(): - return "ControlNet extension not found." - filter_classes = crop_category_input.split('+') - if len(filter_classes) == 0: - return "No class selected." - try: - filter_classes = [int(i) for i in filter_classes] - except: - return "Illegal class id. You may have input some string." - crop_input_image_np = np.array(crop_input_image) - crop_processor_res = pixel_perfect_lllyasviel(crop_input_image_np, crop_processor_res, crop_pixel_perfect, crop_resize_mode, target_W, target_H) - crop_input_image, remove_pad = resize_image_with_pad(crop_input_image_np, crop_processor_res) - crop_input_image_copy = copy.deepcopy(crop_input_image) - global original_uniformer_inference_segmentor - print(f"Generating categories with processor {crop_processor}") - if crop_processor == "seg_ufade20k": - import annotator.uniformer as uniformer - original_uniformer_inference_segmentor = uniformer.inference_segmentor - uniformer.inference_segmentor = inject_uniformer_inference_segmentor - tmp_ouis = uniformer.show_result_pyplot - uniformer.show_result_pyplot = inject_uniformer_show_result_pyplot - sam_semseg = _uniformer(crop_input_image) - uniformer.inference_segmentor = original_uniformer_inference_segmentor - uniformer.show_result_pyplot = tmp_ouis - else: - dataset = crop_processor.split('_')[-1][2:] - from annotator.oneformer import OneformerDetector - original_oneformer_call = OneformerDetector.__call__ - OneformerDetector.__call__ = inject_oneformer_detector_call_categorical_mask - sam_semseg = _oneformer(crop_input_image, dataset=dataset) - OneformerDetector.__call__ = original_oneformer_call - sam_semseg = remove_pad(sam_semseg) - mask = np.zeros(sam_semseg.shape, dtype=np.bool_) - for i in filter_classes: - mask[sam_semseg == i] = True - return mask, remove_pad(crop_input_image_copy) - - -def register_auto_sam(sam, - auto_sam_points_per_side, auto_sam_points_per_batch, auto_sam_pred_iou_thresh, - auto_sam_stability_score_thresh, auto_sam_stability_score_offset, auto_sam_box_nms_thresh, - auto_sam_crop_n_layers, auto_sam_crop_nms_thresh, auto_sam_crop_overlap_ratio, - auto_sam_crop_n_points_downscale_factor, auto_sam_min_mask_region_area, auto_sam_output_mode): - global global_sam - global_sam = SamAutomaticMaskGeneratorHQ( - sam, auto_sam_points_per_side, auto_sam_points_per_batch, - 
auto_sam_pred_iou_thresh, auto_sam_stability_score_thresh, - auto_sam_stability_score_offset, auto_sam_box_nms_thresh, - auto_sam_crop_n_layers, auto_sam_crop_nms_thresh, auto_sam_crop_overlap_ratio, - auto_sam_crop_n_points_downscale_factor, None, - auto_sam_min_mask_region_area, auto_sam_output_mode) - - -def pixel_perfect_lllyasviel(input_image, processor_res, use_pixel_perfect, resize_mode, target_W, target_H): - preprocessor_resolution = processor_res - if use_pixel_perfect: - raw_H, raw_W, _ = input_image.shape - k0 = float(target_H) / float(raw_H) - k1 = float(target_W) / float(raw_W) - if resize_mode == 2: - estimation = min(k0, k1) * float(min(raw_H, raw_W)) - else: - estimation = max(k0, k1) * float(min(raw_H, raw_W)) - preprocessor_resolution = int(np.round(float(estimation) / 64.0)) * 64 - print(f'Pixel Perfect Mode (written by lllyasviel) Enabled.') - print(f'resize_mode = {str(resize_mode)}') - print(f'raw_H = {raw_H}') - print(f'raw_W = {raw_W}') - print(f'target_H = {target_H}') - print(f'target_W = {target_W}') - print(f'estimation = {estimation}') - print(f'preprocessor resolution = {preprocessor_resolution}') - return preprocessor_resolution diff --git a/scripts/dino.py b/scripts/dino.py deleted file mode 100644 index a034196..0000000 --- a/scripts/dino.py +++ /dev/null @@ -1,187 +0,0 @@ -import os -import gc -import cv2 -import copy -import torch -from collections import OrderedDict - -from modules import scripts, shared -from modules.devices import device, torch_gc, cpu -import local_groundingdino - - -dino_model_cache = OrderedDict() -sam_extension_dir = scripts.basedir() -dino_model_dir = os.path.join(sam_extension_dir, "models/grounding-dino") -dino_model_list = ["GroundingDINO_SwinT_OGC (694MB)", "GroundingDINO_SwinB (938MB)"] -dino_model_info = { - "GroundingDINO_SwinT_OGC (694MB)": { - "checkpoint": "groundingdino_swint_ogc.pth", - "config": os.path.join(dino_model_dir, "GroundingDINO_SwinT_OGC.py"), - "url": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth", - }, - "GroundingDINO_SwinB (938MB)": { - "checkpoint": "groundingdino_swinb_cogcoor.pth", - "config": os.path.join(dino_model_dir, "GroundingDINO_SwinB.cfg.py"), - "url": "https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swinb_cogcoor.pth" - }, -} -dino_install_issue_text = "permanently switch to local groundingdino on Settings/Segment Anything or submit an issue to https://github.com/IDEA-Research/Grounded-Segment-Anything/issues." - - -def install_goundingdino(): - if shared.opts.data.get("sam_use_local_groundingdino", False): - print("Using local groundingdino.") - return False - - def verify_dll(install_local=True): - try: - from groundingdino import _C - print("GroundingDINO dynamic library have been successfully built.") - return True - except Exception: - import traceback - traceback.print_exc() - def run_pip_uninstall(command, desc=None): - from launch import python, run - default_command_live = (os.environ.get('WEBUI_LAUNCH_LIVE_OUTPUT') == "1") - return run(f'"{python}" -m pip uninstall -y {command}', desc=f"Uninstalling {desc}", errdesc=f"Couldn't uninstall {desc}", live=default_command_live) - if install_local: - print(f"Failed to build dymanic library. Will uninstall GroundingDINO from pip and fall back to local groundingdino this time. Please {dino_install_issue_text}") - run_pip_uninstall( - f"groundingdino", - f"sd-webui-segment-anything requirement: groundingdino") - else: - print(f"Failed to build dymanic library. 
Will uninstall GroundingDINO from pip and re-try installing from GitHub source code. Please {dino_install_issue_text}") - run_pip_uninstall( - f"uninstall groundingdino", - f"sd-webui-segment-anything requirement: groundingdino") - return False - - import launch - if launch.is_installed("groundingdino"): - print("Found GroundingDINO in pip. Verifying if dynamic library build success.") - if verify_dll(install_local=False): - return True - try: - launch.run_pip( - f"install git+https://github.com/IDEA-Research/GroundingDINO", - f"sd-webui-segment-anything requirement: groundingdino") - print("GroundingDINO install success. Verifying if dynamic library build success.") - return verify_dll() - except Exception: - import traceback - traceback.print_exc() - print(f"GroundingDINO install failed. Will fall back to local groundingdino this time. Please {dino_install_issue_text}") - return False - - -def show_boxes(image_np, boxes, color=(255, 0, 0, 255), thickness=2, show_index=False): - if boxes is None: - return image_np - - image = copy.deepcopy(image_np) - for idx, box in enumerate(boxes): - x, y, w, h = box - cv2.rectangle(image, (x, y), (w, h), color, thickness) - if show_index: - font = cv2.FONT_HERSHEY_SIMPLEX - text = str(idx) - textsize = cv2.getTextSize(text, font, 1, 2)[0] - cv2.putText(image, text, (x, y+textsize[1]), font, 1, color, thickness) - - return image - - -def clear_dino_cache(): - dino_model_cache.clear() - gc.collect() - torch_gc() - - -def load_dino_model(dino_checkpoint, dino_install_success): - print(f"Initializing GroundingDINO {dino_checkpoint}") - if dino_checkpoint in dino_model_cache: - dino = dino_model_cache[dino_checkpoint] - if shared.cmd_opts.lowvram: - dino.to(device=device) - else: - clear_dino_cache() - if dino_install_success: - from groundingdino.models import build_model - from groundingdino.util.slconfig import SLConfig - from groundingdino.util.utils import clean_state_dict - else: - from local_groundingdino.models import build_model - from local_groundingdino.util.slconfig import SLConfig - from local_groundingdino.util.utils import clean_state_dict - args = SLConfig.fromfile(dino_model_info[dino_checkpoint]["config"]) - dino = build_model(args) - checkpoint = torch.hub.load_state_dict_from_url( - dino_model_info[dino_checkpoint]["url"], dino_model_dir) - dino.load_state_dict(clean_state_dict( - checkpoint['model']), strict=False) - dino.to(device=device) - dino_model_cache[dino_checkpoint] = dino - dino.eval() - return dino - - -def load_dino_image(image_pil, dino_install_success): - if dino_install_success: - import groundingdino.datasets.transforms as T - else: - from local_groundingdino.datasets import transforms as T - transform = T.Compose( - [ - T.RandomResize([800], max_size=1333), - T.ToTensor(), - T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), - ] - ) - image, _ = transform(image_pil, None) # 3, h, w - return image - - -def get_grounding_output(model, image, caption, box_threshold): - caption = caption.lower() - caption = caption.strip() - if not caption.endswith("."): - caption = caption + "." 
- image = image.to(device) - with torch.no_grad(): - outputs = model(image[None], captions=[caption]) - if shared.cmd_opts.lowvram: - model.to(cpu) - logits = outputs["pred_logits"].sigmoid()[0] # (nq, 256) - boxes = outputs["pred_boxes"][0] # (nq, 4) - - # filter output - logits_filt = logits.clone() - boxes_filt = boxes.clone() - filt_mask = logits_filt.max(dim=1)[0] > box_threshold - logits_filt = logits_filt[filt_mask] # num_filt, 256 - boxes_filt = boxes_filt[filt_mask] # num_filt, 4 - - return boxes_filt.cpu() - - -def dino_predict_internal(input_image, dino_model_name, text_prompt, box_threshold): - install_success = install_goundingdino() - print("Running GroundingDINO Inference") - dino_image = load_dino_image(input_image.convert("RGB"), install_success) - dino_model = load_dino_model(dino_model_name, install_success) - install_success = install_success or shared.opts.data.get("sam_use_local_groundingdino", False) - - boxes_filt = get_grounding_output( - dino_model, dino_image, text_prompt, box_threshold - ) - - H, W = input_image.size[1], input_image.size[0] - for i in range(boxes_filt.size(0)): - boxes_filt[i] = boxes_filt[i] * torch.Tensor([W, H, W, H]) - boxes_filt[i][:2] -= boxes_filt[i][2:] / 2 - boxes_filt[i][2:] += boxes_filt[i][:2] - gc.collect() - torch_gc() - return boxes_filt, install_success diff --git a/scripts/sam.py b/scripts/sam.py index a40a2ec..c37c962 100644 --- a/scripts/sam.py +++ b/scripts/sam.py @@ -7,29 +7,18 @@ import torch import gradio as gr from collections import OrderedDict -from scipy.ndimage import binary_dilation from modules import scripts, shared, script_callbacks from modules.ui import gr_show from modules.ui_components import FormRow from modules.safe import unsafe_torch_load, load from modules.processing import StableDiffusionProcessingImg2Img, StableDiffusionProcessing from modules.devices import device, torch_gc, cpu -from modules.paths import models_path + from sam_hq.predictor import SamPredictorHQ from sam_hq.build_sam_hq import sam_model_registry -from scripts.dino import dino_model_list, dino_predict_internal, show_boxes, clear_dino_cache, dino_install_issue_text -from scripts.auto import clear_sem_sam_cache, register_auto_sam, semantic_segmentation, sem_sam_garbage_collect, image_layer_internal, categorical_mask_image -from scripts.process_params import SAMProcessUnit, max_cn_num - - -refresh_symbol = '\U0001f504' # 🔄 -sam_model_cache = OrderedDict() -scripts_sam_model_dir = os.path.join(scripts.basedir(), "models/sam") -sd_sam_model_dir = os.path.join(models_path, "sam") -sam_model_dir = sd_sam_model_dir if os.path.exists(sd_sam_model_dir) else scripts_sam_model_dir -sam_model_list = [f for f in os.listdir(sam_model_dir) if os.path.isfile(os.path.join(sam_model_dir, f)) and f.split('.')[-1] != 'txt'] -sam_device = device - +from scripts.sam_dino import dino_model_list, dino_predict_internal, show_boxes, clear_dino_cache, dino_install_issue_text +from scripts.sam_auto import clear_sem_sam_cache, register_auto_sam, semantic_segmentation, sem_sam_garbage_collect, image_layer_internal, categorical_mask_image +from scripts.sam_process import SAMProcessUnit, max_cn_num txt2img_width: gr.Slider = None txt2img_height: gr.Slider = None @@ -45,43 +34,6 @@ def __init__(self, **kwargs): def get_block_name(self): return "button" - - -def show_masks(image_np, masks: np.ndarray, alpha=0.5): - image = copy.deepcopy(image_np) - np.random.seed(0) - for mask in masks: - color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) - 
image[mask] = image[mask] * (1 - alpha) + 255 * color.reshape(1, 1, -1) * alpha - return image.astype(np.uint8) - - -def update_mask(mask_gallery, chosen_mask, dilation_amt, input_image): - print("Dilation Amount: ", dilation_amt) - if isinstance(mask_gallery, list): - mask_image = Image.open(mask_gallery[chosen_mask + 3]['name']) - else: - mask_image = mask_gallery - binary_img = np.array(mask_image.convert('1')) - if dilation_amt: - mask_image, binary_img = dilate_mask(binary_img, dilation_amt) - blended_image = Image.fromarray(show_masks(np.array(input_image), binary_img.astype(np.bool_)[None, ...])) - matted_image = np.array(input_image) - matted_image[~binary_img] = np.array([0, 0, 0, 0]) - return [blended_image, mask_image, Image.fromarray(matted_image)] - - -def load_sam_model(sam_checkpoint): - model_type = sam_checkpoint.split('.')[0] - if 'hq' not in model_type: - model_type = '_'.join(model_type.split('_')[:-1]) - sam_checkpoint_path = os.path.join(sam_model_dir, sam_checkpoint) - torch.load = unsafe_torch_load - sam = sam_model_registry[model_type](checkpoint=sam_checkpoint_path) - sam.to(device=sam_device) - sam.eval() - torch.load = load - return sam def clear_sam_cache(): @@ -103,86 +55,6 @@ def garbage_collect(sam): torch_gc() -def refresh_sam_models(*inputs): - global sam_model_list - sam_model_list = [f for f in os.listdir(sam_model_dir) if os.path.isfile( - os.path.join(sam_model_dir, f)) and f.split('.')[-1] != 'txt'] - dd = inputs[0] - if dd in sam_model_list: - selected = dd - elif len(sam_model_list) > 0: - selected = sam_model_list[0] - else: - selected = None - return gr.Dropdown.update(choices=sam_model_list, value=selected) - - -def init_sam_model(sam_model_name): - print(f"Initializing SAM to {sam_device}") - if sam_model_name in sam_model_cache: - sam = sam_model_cache[sam_model_name] - if shared.cmd_opts.lowvram or (str(sam_device) not in str(sam.device)): - sam.to(device=sam_device) - return sam - elif sam_model_name in sam_model_list: - clear_sam_cache() - sam_model_cache[sam_model_name] = load_sam_model(sam_model_name) - return sam_model_cache[sam_model_name] - else: - raise Exception( - f"{sam_model_name} not found, please download model to models/sam.") - - -def dilate_mask(mask, dilation_amt): - x, y = np.meshgrid(np.arange(dilation_amt), np.arange(dilation_amt)) - center = dilation_amt // 2 - dilation_kernel = ((x - center)**2 + (y - center)**2 <= center**2).astype(np.uint8) - dilated_binary_img = binary_dilation(mask, dilation_kernel) - dilated_mask = Image.fromarray(dilated_binary_img.astype(np.uint8) * 255) - return dilated_mask, dilated_binary_img - - -def create_mask_output(image_np, masks, boxes_filt): - print("Creating output image") - mask_images, masks_gallery, matted_images = [], [], [] - boxes_filt = boxes_filt.numpy().astype(int) if boxes_filt is not None else None - for mask in masks: - masks_gallery.append(Image.fromarray(np.any(mask, axis=0))) - blended_image = show_masks(show_boxes(image_np, boxes_filt), mask) - mask_images.append(Image.fromarray(blended_image)) - image_np_copy = copy.deepcopy(image_np) - image_np_copy[~np.any(mask, axis=0)] = np.array([0, 0, 0, 0]) - matted_images.append(Image.fromarray(image_np_copy)) - return mask_images + masks_gallery + matted_images - - -def create_mask_batch_output( - input_image_file, dino_batch_dest_dir, - image_np, masks, boxes_filt, batch_dilation_amt, - dino_batch_save_image, dino_batch_save_mask, dino_batch_save_background, dino_batch_save_image_with_mask): - print("Creating batch output 
image") - filename, ext = os.path.splitext(os.path.basename(input_image_file)) - ext = ".png" # JPEG not compatible with RGBA - for idx, mask in enumerate(masks): - blended_image = show_masks(show_boxes(image_np, boxes_filt), mask) - merged_mask = np.any(mask, axis=0) - if dino_batch_save_background: - merged_mask = ~merged_mask - if batch_dilation_amt: - _, merged_mask = dilate_mask(merged_mask, batch_dilation_amt) - image_np_copy = copy.deepcopy(image_np) - image_np_copy[~merged_mask] = np.array([0, 0, 0, 0]) - if dino_batch_save_image: - output_image = Image.fromarray(image_np_copy) - output_image.save(os.path.join(dino_batch_dest_dir, f"{filename}_{idx}_output{ext}")) - if dino_batch_save_mask: - output_mask = Image.fromarray(merged_mask) - output_mask.save(os.path.join(dino_batch_dest_dir, f"{filename}_{idx}_mask{ext}")) - if dino_batch_save_image_with_mask: - output_blend = Image.fromarray(blended_image) - output_blend.save(os.path.join(dino_batch_dest_dir, f"{filename}_{idx}_blend{ext}")) - - def sam_predict(sam_model_name, input_image, positive_points, negative_points, dino_checkbox, dino_model_name, text_prompt, box_threshold, dino_preview_checkbox, dino_preview_boxes_selection): @@ -812,6 +684,7 @@ def on_after_component(component, **_kwargs): def on_ui_settings(): section = ('segment_anything', "Segment Anything") shared.opts.add_option("sam_use_local_groundingdino", shared.OptionInfo(False, "Use local groundingdino to bypass C++ problem", section=section)) + shared.opts.add_option("sam_model_path", shared.OptionInfo("", "Specify SAM model path", section=section)) script_callbacks.on_ui_settings(on_ui_settings) diff --git a/scripts/api.py b/scripts/sam_api.py similarity index 98% rename from scripts/api.py rename to scripts/sam_api.py index 0d602db..af6d157 100644 --- a/scripts/api.py +++ b/scripts/sam_api.py @@ -21,7 +21,7 @@ def decode_to_pil(image): elif type(image) is np.ndarray: return Image.fromarray(image) else: - Exception("Not an image") + raise Exception("Not an image") def encode_to_base64(image): @@ -33,7 +33,7 @@ def encode_to_base64(image): pil = Image.fromarray(image) return encode_pil_to_base64(pil).decode() else: - Exception("Invalid type") + raise Exception("Invalid type") def sam_api(_: gr.Blocks, app: FastAPI): @@ -243,4 +243,4 @@ async def api_category_mask(payload: CategoryMaskRequest = Body(...), import modules.script_callbacks as script_callbacks script_callbacks.on_app_started(sam_api) except: - print("SAM Web UI API failed to initialize") \ No newline at end of file + print("SAM Web UI API failed to initialize") diff --git a/scripts/process_params.py b/scripts/sam_process.py similarity index 98% rename from scripts/process_params.py rename to scripts/sam_process.py index 7ff18d0..a49805d 100644 --- a/scripts/process_params.py +++ b/scripts/sam_process.py @@ -1,6 +1,5 @@ from typing import Tuple, List, Dict -from PIL import Image, ImageOps, ImageEnhance, ImageFilter -import numpy as np +from PIL import Image, ImageOps from modules import shared diff --git a/scripts/sam_state.py b/scripts/sam_state.py new file mode 100644 index 0000000..5b8e35d --- /dev/null +++ b/scripts/sam_state.py @@ -0,0 +1,3 @@ +from modules import scripts + +sam_extension_dir = scripts.basedir() diff --git a/scripts/sam_ui.py b/scripts/sam_ui.py new file mode 100644 index 0000000..72d9b0e --- /dev/null +++ b/scripts/sam_ui.py @@ -0,0 +1,20 @@ +import gradio as gr + +class SamUI: + + def __init__(self) -> None: + pass + + def render_sam_model(self) -> None: + with gr.Row(): + with 
gr.Column(scale=10): + with gr.Row(): + sam_model_name = gr.Dropdown(label="SAM Model", choices=sam_model_list, value=sam_model_list[0] if len(sam_model_list) > 0 else None) + sam_refresh_models = ToolButton(value=refresh_symbol) + sam_refresh_models.click(refresh_sam_models, sam_model_name, sam_model_name) + with gr.Column(scale=1): + sam_use_cpu = gr.Checkbox(value=False, label="Use CPU for SAM") + def change_sam_device(use_cpu=False): + global sam_device + sam_device = "cpu" if use_cpu else device + sam_use_cpu.change(fn=change_sam_device, inputs=[sam_use_cpu], show_progress=False) \ No newline at end of file diff --git a/thirdparty/fastsam/__init__.py b/thirdparty/fastsam/__init__.py new file mode 100644 index 0000000..7f99d8d --- /dev/null +++ b/thirdparty/fastsam/__init__.py @@ -0,0 +1,9 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license + +from .model import FastSAM +from .predict import FastSAMPredictor +from .prompt import FastSAMPrompt +# from .val import FastSAMValidator +from .decoder import FastSAMDecoder + +__all__ = 'FastSAMPredictor', 'FastSAM', 'FastSAMPrompt', 'FastSAMDecoder' diff --git a/thirdparty/fastsam/decoder.py b/thirdparty/fastsam/decoder.py new file mode 100644 index 0000000..5b92ed8 --- /dev/null +++ b/thirdparty/fastsam/decoder.py @@ -0,0 +1,131 @@ +from .model import FastSAM +import numpy as np +from PIL import Image +import clip +from typing import Optional, List, Tuple, Union + + +class FastSAMDecoder: + def __init__( + self, + model: FastSAM, + device: str='cpu', + conf: float=0.4, + iou: float=0.9, + imgsz: int=1024, + retina_masks: bool=True, + ): + self.model = model + self.device = device + self.retina_masks = retina_masks + self.imgsz = imgsz + self.conf = conf + self.iou = iou + self.image = None + self.image_embedding = None + + def run_encoder(self, image): + if isinstance(image,str): + image = np.array(Image.open(image)) + self.image = image + image_embedding = self.model( + self.image, + device=self.device, + retina_masks=self.retina_masks, + imgsz=self.imgsz, + conf=self.conf, + iou=self.iou + ) + return image_embedding[0].numpy() + + def run_decoder( + self, + image_embedding, + point_prompt: Optional[np.ndarray]=None, + point_label: Optional[np.ndarray]=None, + box_prompt: Optional[np.ndarray]=None, + text_prompt: Optional[str]=None, + )->np.ndarray: + self.image_embedding = image_embedding + if point_prompt is not None: + ann = self.point_prompt(points=point_prompt, pointlabel=point_label) + return ann + elif box_prompt is not None: + ann = self.box_prompt(bbox=box_prompt) + return ann + elif text_prompt is not None: + ann = self.text_prompt(text=text_prompt) + return ann + else: + return None + + def box_prompt(self, bbox): + assert (bbox[2] != 0 and bbox[3] != 0) + masks = self.image_embedding.masks.data + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks.shape[1] + w = masks.shape[2] + if h != target_height or w != target_width: + bbox = [ + int(bbox[0] * w / target_width), + int(bbox[1] * h / target_height), + int(bbox[2] * w / target_width), + int(bbox[3] * h / target_height), ] + bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0 + bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0 + bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w + bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h + + # IoUs = torch.zeros(len(masks), dtype=torch.float32) + bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + + masks_area = np.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], axis=(1, 2)) + orig_masks_area 
= np.sum(masks, axis=(1, 2)) + + union = bbox_area + orig_masks_area - masks_area + IoUs = masks_area / union + max_iou_index = np.argmax(IoUs) + + return np.array([masks[max_iou_index].cpu().numpy()]) + + def point_prompt(self, points, pointlabel): # numpy + + masks = self._format_results(self.results[0], 0) + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks[0]['segmentation'].shape[0] + w = masks[0]['segmentation'].shape[1] + if h != target_height or w != target_width: + points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points] + onemask = np.zeros((h, w)) + masks = sorted(masks, key=lambda x: x['area'], reverse=True) + for i, annotation in enumerate(masks): + if type(annotation) == dict: + mask = annotation['segmentation'] + else: + mask = annotation + for i, point in enumerate(points): + if mask[point[1], point[0]] == 1 and pointlabel[i] == 1: + onemask[mask] = 1 + if mask[point[1], point[0]] == 1 and pointlabel[i] == 0: + onemask[mask] = 0 + onemask = onemask >= 1 + return np.array([onemask]) + + def _format_results(self, result, filter=0): + annotations = [] + n = len(result.masks.data) + for i in range(n): + annotation = {} + mask = result.masks.data[i] == 1.0 + + if np.sum(mask) < filter: + continue + annotation['id'] = i + annotation['segmentation'] = mask + annotation['bbox'] = result.boxes.data[i] + annotation['score'] = result.boxes.conf[i] + annotation['area'] = annotation['segmentation'].sum() + annotations.append(annotation) + return annotations \ No newline at end of file diff --git a/thirdparty/fastsam/model.py b/thirdparty/fastsam/model.py new file mode 100644 index 0000000..5450cb3 --- /dev/null +++ b/thirdparty/fastsam/model.py @@ -0,0 +1,104 @@ +# Ultralytics YOLO 🚀, AGPL-3.0 license +""" +YOLO-NAS model interface. + +Usage - Predict: + from ultralytics import FastSAM + + model = FastSAM('last.pt') + results = model.predict('ultralytics/assets/bus.jpg') +""" + +from ultralytics.yolo.cfg import get_cfg +from ultralytics.yolo.engine.exporter import Exporter +from ultralytics.yolo.engine.model import YOLO +from ultralytics.yolo.utils import DEFAULT_CFG, LOGGER, ROOT, is_git_dir +from ultralytics.yolo.utils.checks import check_imgsz + +from ultralytics.yolo.utils.torch_utils import model_info, smart_inference_mode +from .predict import FastSAMPredictor + + +class FastSAM(YOLO): + + @smart_inference_mode() + def predict(self, source=None, stream=False, **kwargs): + """ + Perform prediction using the YOLO model. + + Args: + source (str | int | PIL | np.ndarray): The source of the image to make predictions on. + Accepts all source types accepted by the YOLO model. + stream (bool): Whether to stream the predictions or not. Defaults to False. + **kwargs : Additional keyword arguments passed to the predictor. + Check the 'configuration' section in the documentation for all available options. + + Returns: + (List[ultralytics.yolo.engine.results.Results]): The prediction results. + """ + if source is None: + source = ROOT / 'assets' if is_git_dir() else 'https://ultralytics.com/images/bus.jpg' + LOGGER.warning(f"WARNING ⚠️ 'source' is missing. 
Using 'source={source}'.") + overrides = self.overrides.copy() + overrides['conf'] = 0.25 + overrides.update(kwargs) # prefer kwargs + overrides['mode'] = kwargs.get('mode', 'predict') + assert overrides['mode'] in ['track', 'predict'] + overrides['save'] = kwargs.get('save', False) # do not save by default if called in Python + self.predictor = FastSAMPredictor(overrides=overrides) + self.predictor.setup_model(model=self.model, verbose=False) + + return self.predictor(source, stream=stream) + + def train(self, **kwargs): + """Function trains models but raises an error as FastSAM models do not support training.""" + raise NotImplementedError("FastSAM models don't support training") + + def val(self, **kwargs): + """Run validation given dataset.""" + overrides = dict(task='segment', mode='val') + overrides.update(kwargs) # prefer kwargs + args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides) + args.imgsz = check_imgsz(args.imgsz, max_dim=1) + validator = FastSAM(args=args) + validator(model=self.model) + self.metrics = validator.metrics + return validator.metrics + + @smart_inference_mode() + def export(self, **kwargs): + """ + Export model. + + Args: + **kwargs : Any other args accepted by the predictors. To see all args check 'configuration' section in docs + """ + overrides = dict(task='detect') + overrides.update(kwargs) + overrides['mode'] = 'export' + args = get_cfg(cfg=DEFAULT_CFG, overrides=overrides) + args.task = self.task + if args.imgsz == DEFAULT_CFG.imgsz: + args.imgsz = self.model.args['imgsz'] # use trained imgsz unless custom value is passed + if args.batch == DEFAULT_CFG.batch: + args.batch = 1 # default to 1 if not modified + return Exporter(overrides=args)(model=self.model) + + def info(self, detailed=False, verbose=True): + """ + Logs model info. + + Args: + detailed (bool): Show detailed information about model. + verbose (bool): Controls verbosity. + """ + return model_info(self.model, detailed=detailed, verbose=verbose, imgsz=640) + + def __call__(self, source=None, stream=False, **kwargs): + """Calls the 'predict' function with given arguments to perform object detection.""" + return self.predict(source, stream, **kwargs) + + def __getattr__(self, attr): + """Raises error if object has no requested attribute.""" + name = self.__class__.__name__ + raise AttributeError(f"'{name}' object has no attribute '{attr}'. 
See valid attributes below.\n{self.__doc__}") diff --git a/thirdparty/fastsam/predict.py b/thirdparty/fastsam/predict.py new file mode 100644 index 0000000..5e3ca40 --- /dev/null +++ b/thirdparty/fastsam/predict.py @@ -0,0 +1,52 @@ +import torch + +from ultralytics.yolo.engine.results import Results +from ultralytics.yolo.utils import DEFAULT_CFG, ops +from ultralytics.yolo.v8.detect.predict import DetectionPredictor +from .utils import bbox_iou + +class FastSAMPredictor(DetectionPredictor): + + def __init__(self, cfg=DEFAULT_CFG, overrides=None, _callbacks=None): + super().__init__(cfg, overrides, _callbacks) + self.args.task = 'segment' + + def postprocess(self, preds, img, orig_imgs): + """TODO: filter by classes.""" + p = ops.non_max_suppression(preds[0], + self.args.conf, + self.args.iou, + agnostic=self.args.agnostic_nms, + max_det=self.args.max_det, + nc=len(self.model.names), + classes=self.args.classes) + + full_box = torch.zeros_like(p[0][0]) + full_box[2], full_box[3], full_box[4], full_box[6:] = img.shape[3], img.shape[2], 1.0, 1.0 + full_box = full_box.view(1, -1) + critical_iou_index = bbox_iou(full_box[0][:4], p[0][:, :4], iou_thres=0.9, image_shape=img.shape[2:]) + if critical_iou_index.numel() != 0: + full_box[0][4] = p[0][critical_iou_index][:,4] + full_box[0][6:] = p[0][critical_iou_index][:,6:] + p[0][critical_iou_index] = full_box + + results = [] + proto = preds[1][-1] if len(preds[1]) == 3 else preds[1] # second output is len 3 if pt, but only 1 if exported + for i, pred in enumerate(p): + orig_img = orig_imgs[i] if isinstance(orig_imgs, list) else orig_imgs + path = self.batch[0] + img_path = path[i] if isinstance(path, list) else path + if not len(pred): # save empty boxes + results.append(Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6])) + continue + if self.args.retina_masks: + if not isinstance(orig_imgs, torch.Tensor): + pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape) + masks = ops.process_mask_native(proto[i], pred[:, 6:], pred[:, :4], orig_img.shape[:2]) # HWC + else: + masks = ops.process_mask(proto[i], pred[:, 6:], pred[:, :4], img.shape[2:], upsample=True) # HWC + if not isinstance(orig_imgs, torch.Tensor): + pred[:, :4] = ops.scale_boxes(img.shape[2:], pred[:, :4], orig_img.shape) + results.append( + Results(orig_img=orig_img, path=img_path, names=self.model.names, boxes=pred[:, :6], masks=masks)) + return results diff --git a/thirdparty/fastsam/prompt.py b/thirdparty/fastsam/prompt.py new file mode 100644 index 0000000..e67143a --- /dev/null +++ b/thirdparty/fastsam/prompt.py @@ -0,0 +1,417 @@ +import os +import sys +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch +from PIL import Image + + + +try: + import clip # for linear_assignment + +except (ImportError, AssertionError, AttributeError): + from ultralytics.yolo.utils.checks import check_requirements + + check_requirements('git+https://github.com/openai/CLIP.git') # required before installing lap from source + import clip + + +class FastSAMPrompt: + + def __init__(self, img_path, results, device='cuda') -> None: + # self.img_path = img_path + self.device = device + self.results = results + self.img_path = img_path + self.ori_img = cv2.imread(img_path) + + def _segment_image(self, image, bbox): + image_array = np.array(image) + segmented_image_array = np.zeros_like(image_array) + x1, y1, x2, y2 = bbox + segmented_image_array[y1:y2, x1:x2] = image_array[y1:y2, x1:x2] + segmented_image = 
Image.fromarray(segmented_image_array) + black_image = Image.new('RGB', image.size, (255, 255, 255)) + # transparency_mask = np.zeros_like((), dtype=np.uint8) + transparency_mask = np.zeros((image_array.shape[0], image_array.shape[1]), dtype=np.uint8) + transparency_mask[y1:y2, x1:x2] = 255 + transparency_mask_image = Image.fromarray(transparency_mask, mode='L') + black_image.paste(segmented_image, mask=transparency_mask_image) + return black_image + + def _format_results(self, result, filter=0): + annotations = [] + n = len(result.masks.data) + for i in range(n): + annotation = {} + mask = result.masks.data[i] == 1.0 + + if torch.sum(mask) < filter: + continue + annotation['id'] = i + annotation['segmentation'] = mask.cpu().numpy() + annotation['bbox'] = result.boxes.data[i] + annotation['score'] = result.boxes.conf[i] + annotation['area'] = annotation['segmentation'].sum() + annotations.append(annotation) + return annotations + + def filter_masks(annotations): # filte the overlap mask + annotations.sort(key=lambda x: x['area'], reverse=True) + to_remove = set() + for i in range(0, len(annotations)): + a = annotations[i] + for j in range(i + 1, len(annotations)): + b = annotations[j] + if i != j and j not in to_remove: + # check if + if b['area'] < a['area']: + if (a['segmentation'] & b['segmentation']).sum() / b['segmentation'].sum() > 0.8: + to_remove.add(j) + + return [a for i, a in enumerate(annotations) if i not in to_remove], to_remove + + def _get_bbox_from_mask(self, mask): + mask = mask.astype(np.uint8) + contours, hierarchy = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + x1, y1, w, h = cv2.boundingRect(contours[0]) + x2, y2 = x1 + w, y1 + h + if len(contours) > 1: + for b in contours: + x_t, y_t, w_t, h_t = cv2.boundingRect(b) + # Merge multiple bounding boxes into one. + x1 = min(x1, x_t) + y1 = min(y1, y_t) + x2 = max(x2, x_t + w_t) + y2 = max(y2, y_t + h_t) + h = y2 - y1 + w = x2 - x1 + return [x1, y1, x2, y2] + + def plot(self, + annotations, + output, + bboxes=None, + points=None, + point_label=None, + mask_random_color=True, + better_quality=True, + retina=False, + withContours=True): + if isinstance(annotations[0], dict): + annotations = [annotation['segmentation'] for annotation in annotations] + result_name = os.path.basename(self.img_path) + image = self.ori_img + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + original_h = image.shape[0] + original_w = image.shape[1] + if sys.platform == "darwin": + plt.switch_backend("TkAgg") + plt.figure(figsize=(original_w / 100, original_h / 100)) + # Add subplot with no margin. 
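+        # With matplotlib's default 100 dpi, figsize=(original_w / 100, original_h / 100) gives a
+        # canvas of roughly original_w x original_h pixels, so the overlay drawn below lines up with
+        # the source image. Stripping margins and axis locators preserves that 1:1 mapping when the
+        # canvas is later read back via fig.canvas.tostring_rgb() and written out with cv2.imwrite.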
+ plt.subplots_adjust(top=1, bottom=0, right=1, left=0, hspace=0, wspace=0) + plt.margins(0, 0) + plt.gca().xaxis.set_major_locator(plt.NullLocator()) + plt.gca().yaxis.set_major_locator(plt.NullLocator()) + + plt.imshow(image) + if better_quality: + if isinstance(annotations[0], torch.Tensor): + annotations = np.array(annotations.cpu()) + for i, mask in enumerate(annotations): + mask = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_CLOSE, np.ones((3, 3), np.uint8)) + annotations[i] = cv2.morphologyEx(mask.astype(np.uint8), cv2.MORPH_OPEN, np.ones((8, 8), np.uint8)) + if self.device == 'cpu': + annotations = np.array(annotations) + self.fast_show_mask( + annotations, + plt.gca(), + random_color=mask_random_color, + bboxes=bboxes, + points=points, + pointlabel=point_label, + retinamask=retina, + target_height=original_h, + target_width=original_w, + ) + else: + if isinstance(annotations[0], np.ndarray): + annotations = torch.from_numpy(annotations) + self.fast_show_mask_gpu( + annotations, + plt.gca(), + random_color=mask_random_color, + bboxes=bboxes, + points=points, + pointlabel=point_label, + retinamask=retina, + target_height=original_h, + target_width=original_w, + ) + if isinstance(annotations, torch.Tensor): + annotations = annotations.cpu().numpy() + if withContours: + contour_all = [] + temp = np.zeros((original_h, original_w, 1)) + for i, mask in enumerate(annotations): + if type(mask) == dict: + mask = mask['segmentation'] + annotation = mask.astype(np.uint8) + if not retina: + annotation = cv2.resize( + annotation, + (original_w, original_h), + interpolation=cv2.INTER_NEAREST, + ) + contours, hierarchy = cv2.findContours(annotation, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE) + for contour in contours: + contour_all.append(contour) + cv2.drawContours(temp, contour_all, -1, (255, 255, 255), 2) + color = np.array([0 / 255, 0 / 255, 255 / 255, 0.8]) + contour_mask = temp / 255 * color.reshape(1, 1, -1) + plt.imshow(contour_mask) + + save_path = output + if not os.path.exists(save_path): + os.makedirs(save_path) + plt.axis('off') + fig = plt.gcf() + plt.draw() + + try: + buf = fig.canvas.tostring_rgb() + except AttributeError: + fig.canvas.draw() + buf = fig.canvas.tostring_rgb() + cols, rows = fig.canvas.get_width_height() + img_array = np.frombuffer(buf, dtype=np.uint8).reshape(rows, cols, 3) + cv2.imwrite(os.path.join(save_path, result_name), cv2.cvtColor(img_array, cv2.COLOR_RGB2BGR)) + + # CPU post process + def fast_show_mask( + self, + annotation, + ax, + random_color=False, + bboxes=None, + points=None, + pointlabel=None, + retinamask=True, + target_height=960, + target_width=960, + ): + msak_sum = annotation.shape[0] + height = annotation.shape[1] + weight = annotation.shape[2] + #Sort annotations based on area. + areas = np.sum(annotation, axis=(1, 2)) + sorted_indices = np.argsort(areas) + annotation = annotation[sorted_indices] + + index = (annotation != 0).argmax(axis=0) + if random_color: + color = np.random.random((msak_sum, 1, 1, 3)) + else: + color = np.ones((msak_sum, 1, 1, 3)) * np.array([30 / 255, 144 / 255, 255 / 255]) + transparency = np.ones((msak_sum, 1, 1, 1)) * 0.6 + visual = np.concatenate([color, transparency], axis=-1) + mask_image = np.expand_dims(annotation, -1) * visual + + show = np.zeros((height, weight, 4)) + h_indices, w_indices = np.meshgrid(np.arange(height), np.arange(weight), indexing='ij') + indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) + # Use vectorized indexing to update the values of 'show'. 
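+        # 'annotation' was sorted by ascending area above, and argmax picked, for every pixel, the
+        # first (i.e. smallest) mask covering it; pixels covered by no mask fall back to index 0 and
+        # stay transparent because mask_image is zero there. The fancy-index assignment below is the
+        # vectorized form of: show[y, x] = mask_image[index[y, x], y, x] for every (y, x).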
+ show[h_indices, w_indices, :] = mask_image[indices] + if bboxes is not None: + for bbox in bboxes: + x1, y1, x2, y2 = bbox + ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1)) + # draw point + if points is not None: + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 1], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 1], + s=20, + c='y', + ) + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 0], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 0], + s=20, + c='m', + ) + + if not retinamask: + show = cv2.resize(show, (target_width, target_height), interpolation=cv2.INTER_NEAREST) + ax.imshow(show) + + def fast_show_mask_gpu( + self, + annotation, + ax, + random_color=False, + bboxes=None, + points=None, + pointlabel=None, + retinamask=True, + target_height=960, + target_width=960, + ): + msak_sum = annotation.shape[0] + height = annotation.shape[1] + weight = annotation.shape[2] + areas = torch.sum(annotation, dim=(1, 2)) + sorted_indices = torch.argsort(areas, descending=False) + annotation = annotation[sorted_indices] + # Find the index of the first non-zero value at each position. + index = (annotation != 0).to(torch.long).argmax(dim=0) + if random_color: + color = torch.rand((msak_sum, 1, 1, 3)).to(annotation.device) + else: + color = torch.ones((msak_sum, 1, 1, 3)).to(annotation.device) * torch.tensor([ + 30 / 255, 144 / 255, 255 / 255]).to(annotation.device) + transparency = torch.ones((msak_sum, 1, 1, 1)).to(annotation.device) * 0.6 + visual = torch.cat([color, transparency], dim=-1) + mask_image = torch.unsqueeze(annotation, -1) * visual + # Select data according to the index. The index indicates which batch's data to choose at each position, converting the mask_image into a single batch form. + show = torch.zeros((height, weight, 4)).to(annotation.device) + h_indices, w_indices = torch.meshgrid(torch.arange(height), torch.arange(weight), indexing='ij') + indices = (index[h_indices, w_indices], h_indices, w_indices, slice(None)) + # Use vectorized indexing to update the values of 'show'. 
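+        # Same smallest-covering-mask selection as the CPU path, but every intermediate tensor stays
+        # on annotation.device; only the finished 'show' buffer is copied back (show_cpu below) for
+        # the matplotlib drawing calls, so the per-pixel compositing runs on the GPU and a single
+        # host transfer happens at the end.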
+ show[h_indices, w_indices, :] = mask_image[indices] + show_cpu = show.cpu().numpy() + if bboxes is not None: + for bbox in bboxes: + x1, y1, x2, y2 = bbox + ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, fill=False, edgecolor='b', linewidth=1)) + # draw point + if points is not None: + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 1], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 1], + s=20, + c='y', + ) + plt.scatter( + [point[0] for i, point in enumerate(points) if pointlabel[i] == 0], + [point[1] for i, point in enumerate(points) if pointlabel[i] == 0], + s=20, + c='m', + ) + if not retinamask: + show_cpu = cv2.resize(show_cpu, (target_width, target_height), interpolation=cv2.INTER_NEAREST) + ax.imshow(show_cpu) + + # clip + @torch.no_grad() + def retrieve(self, model, preprocess, elements, search_text: str, device) -> int: + preprocessed_images = [preprocess(image).to(device) for image in elements] + tokenized_text = clip.tokenize([search_text]).to(device) + stacked_images = torch.stack(preprocessed_images) + image_features = model.encode_image(stacked_images) + text_features = model.encode_text(tokenized_text) + image_features /= image_features.norm(dim=-1, keepdim=True) + text_features /= text_features.norm(dim=-1, keepdim=True) + probs = 100.0 * image_features @ text_features.T + return probs[:, 0].softmax(dim=0) + + def _crop_image(self, format_results): + + image = Image.fromarray(cv2.cvtColor(self.ori_img, cv2.COLOR_BGR2RGB)) + ori_w, ori_h = image.size + annotations = format_results + mask_h, mask_w = annotations[0]['segmentation'].shape + if ori_w != mask_w or ori_h != mask_h: + image = image.resize((mask_w, mask_h)) + cropped_boxes = [] + cropped_images = [] + not_crop = [] + filter_id = [] + # annotations, _ = filter_masks(annotations) + # filter_id = list(_) + for _, mask in enumerate(annotations): + if np.sum(mask['segmentation']) <= 100: + filter_id.append(_) + continue + bbox = self._get_bbox_from_mask(mask['segmentation']) # mask 的 bbox + cropped_boxes.append(self._segment_image(image, bbox)) + # cropped_boxes.append(segment_image(image,mask["segmentation"])) + cropped_images.append(bbox) # Save the bounding box of the cropped image. 
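+        # cropped_boxes holds the masked crops that retrieve() scores with CLIP; cropped_images only
+        # stores their bounding boxes. filter_id remembers the indices of masks skipped above
+        # (area <= 100 px) so that text_prompt() can shift the CLIP argmax back onto the original,
+        # unfiltered annotation list.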
+ + return cropped_boxes, cropped_images, not_crop, filter_id, annotations + + def box_prompt(self, bbox=None, bboxes=None): + + assert bbox or bboxes + if bboxes is None: + bboxes = [bbox] + max_iou_index = [] + for bbox in bboxes: + assert (bbox[2] != 0 and bbox[3] != 0) + masks = self.results[0].masks.data + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks.shape[1] + w = masks.shape[2] + if h != target_height or w != target_width: + bbox = [ + int(bbox[0] * w / target_width), + int(bbox[1] * h / target_height), + int(bbox[2] * w / target_width), + int(bbox[3] * h / target_height), ] + bbox[0] = round(bbox[0]) if round(bbox[0]) > 0 else 0 + bbox[1] = round(bbox[1]) if round(bbox[1]) > 0 else 0 + bbox[2] = round(bbox[2]) if round(bbox[2]) < w else w + bbox[3] = round(bbox[3]) if round(bbox[3]) < h else h + + # IoUs = torch.zeros(len(masks), dtype=torch.float32) + bbox_area = (bbox[3] - bbox[1]) * (bbox[2] - bbox[0]) + + masks_area = torch.sum(masks[:, bbox[1]:bbox[3], bbox[0]:bbox[2]], dim=(1, 2)) + orig_masks_area = torch.sum(masks, dim=(1, 2)) + + union = bbox_area + orig_masks_area - masks_area + IoUs = masks_area / union + max_iou_index.append(int(torch.argmax(IoUs))) + max_iou_index = list(set(max_iou_index)) + return np.array(masks[max_iou_index].cpu().numpy()) + + def point_prompt(self, points, pointlabel): # numpy + + masks = self._format_results(self.results[0], 0) + target_height = self.ori_img.shape[0] + target_width = self.ori_img.shape[1] + h = masks[0]['segmentation'].shape[0] + w = masks[0]['segmentation'].shape[1] + if h != target_height or w != target_width: + points = [[int(point[0] * w / target_width), int(point[1] * h / target_height)] for point in points] + onemask = np.zeros((h, w)) + masks = sorted(masks, key=lambda x: x['area'], reverse=True) + for i, annotation in enumerate(masks): + if type(annotation) == dict: + mask = annotation['segmentation'] + else: + mask = annotation + for i, point in enumerate(points): + if mask[point[1], point[0]] == 1 and pointlabel[i] == 1: + onemask[mask] = 1 + if mask[point[1], point[0]] == 1 and pointlabel[i] == 0: + onemask[mask] = 0 + onemask = onemask >= 1 + return np.array([onemask]) + + def text_prompt(self, text): + format_results = self._format_results(self.results[0], 0) + cropped_boxes, cropped_images, not_crop, filter_id, annotations = self._crop_image(format_results) + clip_model, preprocess = clip.load('ViT-B/32', device=self.device) + scores = self.retrieve(clip_model, preprocess, cropped_boxes, text, device=self.device) + max_idx = scores.argsort() + max_idx = max_idx[-1] + max_idx += sum(np.array(filter_id) <= int(max_idx)) + return np.array([annotations[max_idx]['segmentation']]) + + def everything_prompt(self): + return self.results[0].masks.data + \ No newline at end of file diff --git a/thirdparty/fastsam/utils.py b/thirdparty/fastsam/utils.py new file mode 100644 index 0000000..d828c2c --- /dev/null +++ b/thirdparty/fastsam/utils.py @@ -0,0 +1,66 @@ +import torch + +def adjust_bboxes_to_image_border(boxes, image_shape, threshold=20): + '''Adjust bounding boxes to stick to image border if they are within a certain threshold. 
+ Args: + boxes: (n, 4) + image_shape: (height, width) + threshold: pixel threshold + Returns: + adjusted_boxes: adjusted bounding boxes + ''' + + # Image dimensions + h, w = image_shape + + # Adjust boxes + boxes[:, 0] = torch.where(boxes[:, 0] < threshold, 0, boxes[:, 0]) # x1 + boxes[:, 1] = torch.where(boxes[:, 1] < threshold, 0, boxes[:, 1]) # y1 + boxes[:, 2] = torch.where(boxes[:, 2] > w - threshold, w, boxes[:, 2]) # x2 + boxes[:, 3] = torch.where(boxes[:, 3] > h - threshold, h, boxes[:, 3]) # y2 + + return boxes + +def convert_box_xywh_to_xyxy(box): + x1 = box[0] + y1 = box[1] + x2 = box[0] + box[2] + y2 = box[1] + box[3] + return [x1, y1, x2, y2] + +def bbox_iou(box1, boxes, iou_thres=0.9, image_shape=(640, 640), raw_output=False): + '''Compute the Intersection-Over-Union of a bounding box with respect to an array of other bounding boxes. + Args: + box1: (4, ) + boxes: (n, 4) + Returns: + high_iou_indices: Indices of boxes with IoU > thres + ''' + boxes = adjust_bboxes_to_image_border(boxes, image_shape) + # obtain coordinates for intersections + x1 = torch.max(box1[0], boxes[:, 0]) + y1 = torch.max(box1[1], boxes[:, 1]) + x2 = torch.min(box1[2], boxes[:, 2]) + y2 = torch.min(box1[3], boxes[:, 3]) + + # compute the area of intersection + intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0) + + # compute the area of both individual boxes + box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1]) + box2_area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) + + # compute the area of union + union = box1_area + box2_area - intersection + + # compute the IoU + iou = intersection / union # Should be shape (n, ) + if raw_output: + if iou.numel() == 0: + return 0 + return iou + + # get indices of boxes with IoU > thres + high_iou_indices = torch.nonzero(iou > iou_thres).flatten() + + return high_iou_indices \ No newline at end of file diff --git a/local_groundingdino/datasets/__init__.py b/thirdparty/groundingdino/datasets/__init__.py similarity index 100% rename from local_groundingdino/datasets/__init__.py rename to thirdparty/groundingdino/datasets/__init__.py diff --git a/local_groundingdino/datasets/transforms.py b/thirdparty/groundingdino/datasets/transforms.py similarity index 100% rename from local_groundingdino/datasets/transforms.py rename to thirdparty/groundingdino/datasets/transforms.py diff --git a/local_groundingdino/models/GroundingDINO/__init__.py b/thirdparty/groundingdino/models/GroundingDINO/__init__.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/__init__.py rename to thirdparty/groundingdino/models/GroundingDINO/__init__.py diff --git a/local_groundingdino/models/GroundingDINO/backbone/__init__.py b/thirdparty/groundingdino/models/GroundingDINO/backbone/__init__.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/backbone/__init__.py rename to thirdparty/groundingdino/models/GroundingDINO/backbone/__init__.py diff --git a/local_groundingdino/models/GroundingDINO/backbone/backbone.py b/thirdparty/groundingdino/models/GroundingDINO/backbone/backbone.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/backbone/backbone.py rename to thirdparty/groundingdino/models/GroundingDINO/backbone/backbone.py diff --git a/local_groundingdino/models/GroundingDINO/backbone/position_encoding.py b/thirdparty/groundingdino/models/GroundingDINO/backbone/position_encoding.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/backbone/position_encoding.py rename to 
thirdparty/groundingdino/models/GroundingDINO/backbone/position_encoding.py diff --git a/local_groundingdino/models/GroundingDINO/backbone/swin_transformer.py b/thirdparty/groundingdino/models/GroundingDINO/backbone/swin_transformer.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/backbone/swin_transformer.py rename to thirdparty/groundingdino/models/GroundingDINO/backbone/swin_transformer.py diff --git a/local_groundingdino/models/GroundingDINO/bertwarper.py b/thirdparty/groundingdino/models/GroundingDINO/bertwarper.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/bertwarper.py rename to thirdparty/groundingdino/models/GroundingDINO/bertwarper.py diff --git a/local_groundingdino/models/GroundingDINO/fuse_modules.py b/thirdparty/groundingdino/models/GroundingDINO/fuse_modules.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/fuse_modules.py rename to thirdparty/groundingdino/models/GroundingDINO/fuse_modules.py diff --git a/local_groundingdino/models/GroundingDINO/groundingdino.py b/thirdparty/groundingdino/models/GroundingDINO/groundingdino.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/groundingdino.py rename to thirdparty/groundingdino/models/GroundingDINO/groundingdino.py diff --git a/local_groundingdino/models/GroundingDINO/ms_deform_attn.py b/thirdparty/groundingdino/models/GroundingDINO/ms_deform_attn.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/ms_deform_attn.py rename to thirdparty/groundingdino/models/GroundingDINO/ms_deform_attn.py diff --git a/local_groundingdino/models/GroundingDINO/transformer.py b/thirdparty/groundingdino/models/GroundingDINO/transformer.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/transformer.py rename to thirdparty/groundingdino/models/GroundingDINO/transformer.py diff --git a/local_groundingdino/models/GroundingDINO/transformer_vanilla.py b/thirdparty/groundingdino/models/GroundingDINO/transformer_vanilla.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/transformer_vanilla.py rename to thirdparty/groundingdino/models/GroundingDINO/transformer_vanilla.py diff --git a/local_groundingdino/models/GroundingDINO/utils.py b/thirdparty/groundingdino/models/GroundingDINO/utils.py similarity index 100% rename from local_groundingdino/models/GroundingDINO/utils.py rename to thirdparty/groundingdino/models/GroundingDINO/utils.py diff --git a/local_groundingdino/models/__init__.py b/thirdparty/groundingdino/models/__init__.py similarity index 100% rename from local_groundingdino/models/__init__.py rename to thirdparty/groundingdino/models/__init__.py diff --git a/local_groundingdino/models/registry.py b/thirdparty/groundingdino/models/registry.py similarity index 100% rename from local_groundingdino/models/registry.py rename to thirdparty/groundingdino/models/registry.py diff --git a/local_groundingdino/util/__init__.py b/thirdparty/groundingdino/util/__init__.py similarity index 100% rename from local_groundingdino/util/__init__.py rename to thirdparty/groundingdino/util/__init__.py diff --git a/local_groundingdino/util/box_ops.py b/thirdparty/groundingdino/util/box_ops.py similarity index 100% rename from local_groundingdino/util/box_ops.py rename to thirdparty/groundingdino/util/box_ops.py diff --git a/local_groundingdino/util/get_tokenlizer.py b/thirdparty/groundingdino/util/get_tokenlizer.py similarity index 100% rename from 
local_groundingdino/util/get_tokenlizer.py rename to thirdparty/groundingdino/util/get_tokenlizer.py diff --git a/local_groundingdino/util/inference.py b/thirdparty/groundingdino/util/inference.py similarity index 100% rename from local_groundingdino/util/inference.py rename to thirdparty/groundingdino/util/inference.py diff --git a/local_groundingdino/util/misc.py b/thirdparty/groundingdino/util/misc.py similarity index 100% rename from local_groundingdino/util/misc.py rename to thirdparty/groundingdino/util/misc.py diff --git a/local_groundingdino/util/slconfig.py b/thirdparty/groundingdino/util/slconfig.py similarity index 100% rename from local_groundingdino/util/slconfig.py rename to thirdparty/groundingdino/util/slconfig.py diff --git a/local_groundingdino/util/slio.py b/thirdparty/groundingdino/util/slio.py similarity index 100% rename from local_groundingdino/util/slio.py rename to thirdparty/groundingdino/util/slio.py diff --git a/local_groundingdino/util/utils.py b/thirdparty/groundingdino/util/utils.py similarity index 100% rename from local_groundingdino/util/utils.py rename to thirdparty/groundingdino/util/utils.py diff --git a/thirdparty/mam/__init__.py b/thirdparty/mam/__init__.py new file mode 100644 index 0000000..6177189 --- /dev/null +++ b/thirdparty/mam/__init__.py @@ -0,0 +1 @@ +from .m2m import * \ No newline at end of file diff --git a/thirdparty/mam/m2m.py b/thirdparty/mam/m2m.py new file mode 100644 index 0000000..d3ad32a --- /dev/null +++ b/thirdparty/mam/m2m.py @@ -0,0 +1,85 @@ +# ------------------------------------------------------------------------ +# Modified from MGMatting (https://github.com/yucornetto/MGMatting) +# ------------------------------------------------------------------------ +import os +import torch +from torch.nn import Module, functional as F + +import m2ms +from modules.devices import get_device_for +from scripts.sam_state import sam_extension_dir +from scripts.sam_log import logger +from mam import utils + +class SamM2M(Module): + + def __init__(self): + super(SamM2M, self).__init__() + self.m2m = None + self.m2m_device = get_device_for("sam") + + + def load_m2m(self, m2m='sam_decoder_deep', ckpt_path: str=None): + if m2m not in m2ms.__all__: + raise NotImplementedError(f"Unknown M2M {m2m}") + self.m2m: Module = m2ms.__dict__[m2m](nc=256) + if ckpt_path is None: + ckpt_path = os.path.join(sam_extension_dir, 'models/mam') + try: + logger.info(f"Loading mam from path: {ckpt_path}/mam.pth to device: {self.m2m_device}") + state_dict = torch.load(os.path.join(ckpt_path, 'mam.pth'), map_location=self.m2m_device) + except: + mam_url = "https://huggingface.co/conrevo/SAM4WebUI-Extension-Models/resolve/main/mam.pth" + logger.info(f"Loading mam from url: {mam_url} to path: {ckpt_path}, device: {self.m2m_device}") + try: + state_dict = torch.hub.load_state_dict_from_url(mam_url, ckpt_path, self.m2m_device) + except: + error_msg = f"Unable to connect to {mam_url}, thus unable to download Matting-Anything model." 
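+                # If the download from mam_url fails (e.g. on an offline machine), mam.pth can be
+                # fetched manually and placed in ckpt_path (<extension>/models/mam by default); the
+                # torch.load branch above will then pick it up on the next attempt.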
+ logger.error(error_msg) + raise RuntimeError(error_msg) + self.m2m.load_state_dict(state_dict) + self.m2m.eval() + + + def forward(self, features: torch.Tensor, image: torch.Tensor, + low_res_masks: torch.Tensor, masks: torch.Tensor, + ori_shape: torch.Tensor, pad_shape: torch.Tensor, guidance_mode: str): + logger.info("Applying Matting-Anything.") + self.m2m.to(self.m2m_device) + pred = self.m2m(features, image, low_res_masks) + alpha_pred_os1, alpha_pred_os4, alpha_pred_os8 = pred['alpha_os1'], pred['alpha_os4'], pred['alpha_os8'] + alpha_pred_os8 = alpha_pred_os8[..., : pad_shape[0], : pad_shape[1]] + alpha_pred_os4 = alpha_pred_os4[..., : pad_shape[0], : pad_shape[1]] + alpha_pred_os1 = alpha_pred_os1[..., : pad_shape[0], : pad_shape[1]] + + alpha_pred_os8 = F.interpolate(alpha_pred_os8, ori_shape, mode="bilinear", align_corners=False) + alpha_pred_os4 = F.interpolate(alpha_pred_os4, ori_shape, mode="bilinear", align_corners=False) + alpha_pred_os1 = F.interpolate(alpha_pred_os1, ori_shape, mode="bilinear", align_corners=False) + + if guidance_mode == 'mask': + weight_os8 = utils.get_unknown_tensor_from_mask_oneside(masks, rand_width=10, train_mode=False) + masks[weight_os8 > 0] = alpha_pred_os8[weight_os8 > 0] + alpha_pred = masks.clone().detach() + else: + weight_os8 = utils.get_unknown_box_from_mask(masks) + alpha_pred_os8[weight_os8>0] = masks[weight_os8 > 0] + alpha_pred = alpha_pred_os8.clone().detach() + + weight_os4 = utils.get_unknown_tensor_from_pred_oneside(alpha_pred, rand_width=20, train_mode=False) + alpha_pred[weight_os4 > 0] = alpha_pred_os4[weight_os4 > 0] + + weight_os1 = utils.get_unknown_tensor_from_pred_oneside(alpha_pred, rand_width=10, train_mode=False) + alpha_pred[weight_os1 > 0] = alpha_pred_os1[weight_os1 > 0] + + alpha_pred = alpha_pred[0][0].cpu().numpy() + return alpha_pred + + + def clear(self): + del self.m2m + self.m2m = None + + + def unload_model(self): + if self.m2m is not None: + self.m2m.cpu() diff --git a/thirdparty/mam/m2ms/__init__.py b/thirdparty/mam/m2ms/__init__.py new file mode 100644 index 0000000..7c06875 --- /dev/null +++ b/thirdparty/mam/m2ms/__init__.py @@ -0,0 +1,7 @@ +from .conv_sam import SAM_Decoder_Deep + +__all__ = ['sam_decoder_deep'] + +def sam_decoder_deep(nc, **kwargs): + model = SAM_Decoder_Deep(nc, [2, 3, 3, 2], **kwargs) + return model \ No newline at end of file diff --git a/thirdparty/mam/m2ms/conv_sam.py b/thirdparty/mam/m2ms/conv_sam.py new file mode 100644 index 0000000..7c2bc15 --- /dev/null +++ b/thirdparty/mam/m2ms/conv_sam.py @@ -0,0 +1,192 @@ +# ------------------------------------------------------------------------ +# Modified from MGMatting (https://github.com/yucornetto/MGMatting) +# ------------------------------------------------------------------------ +import logging +import torch.nn as nn +import torch +import torch.nn.functional as F +from mam.ops import SpectralNorm + +def conv5x5(in_planes, out_planes, stride=1, groups=1, dilation=1): + """5x5 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=5, stride=stride, + padding=2, groups=groups, bias=False, dilation=dilation) + + +def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=dilation, groups=groups, bias=False, dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution""" + return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) + 
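+# The two building blocks below follow the MGMatting decoder design this file is adapted from:
+# BasicBlock is a spectrally-normalized residual block that upsamples 2x via ConvTranspose2d when
+# stride > 1 and otherwise keeps resolution, and SAM_Decoder_Deep stacks such blocks to refine the
+# SAM mask at output strides 8, 4 and 1, re-injecting the resized input image and coarse mask
+# (4 extra channels) before each stage.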
+ +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, upsample=None, norm_layer=None, large_kernel=False): + super(BasicBlock, self).__init__() + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.stride = stride + conv = conv5x5 if large_kernel else conv3x3 + # Both self.conv1 and self.downsample layers downsample the input when stride != 1 + if self.stride > 1: + self.conv1 = SpectralNorm(nn.ConvTranspose2d(inplanes, inplanes, kernel_size=4, stride=2, padding=1, bias=False)) + else: + self.conv1 = SpectralNorm(conv(inplanes, inplanes)) + self.bn1 = norm_layer(inplanes) + self.activation = nn.LeakyReLU(0.2, inplace=True) + self.conv2 = SpectralNorm(conv(inplanes, planes)) + self.bn2 = norm_layer(planes) + self.upsample = upsample + + def forward(self, x): + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.activation(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.upsample is not None: + identity = self.upsample(x) + + out += identity + out = self.activation(out) + + return out + +class SAM_Decoder_Deep(nn.Module): + def __init__(self, nc, layers, block=BasicBlock, norm_layer=None, large_kernel=False, late_downsample=False): + super(SAM_Decoder_Deep, self).__init__() + self.logger = logging.getLogger("Logger") + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self._norm_layer = norm_layer + self.large_kernel = large_kernel + self.kernel_size = 5 if self.large_kernel else 3 + + #self.inplanes = 512 if layers[0] > 0 else 256 + self.inplanes = 256 + self.late_downsample = late_downsample + self.midplanes = 64 if late_downsample else 32 + + self.conv1 = SpectralNorm(nn.ConvTranspose2d(self.midplanes, 32, kernel_size=4, stride=2, padding=1, bias=False)) + self.bn1 = norm_layer(32) + self.leaky_relu = nn.LeakyReLU(0.2, inplace=True) + + self.upsample = nn.UpsamplingNearest2d(scale_factor=2) + self.tanh = nn.Tanh() + #self.layer1 = self._make_layer(block, 256, layers[0], stride=2) + self.layer2 = self._make_layer(block, 128, layers[1], stride=2) + self.layer3 = self._make_layer(block, 64, layers[2], stride=2) + self.layer4 = self._make_layer(block, self.midplanes, layers[3], stride=2) + + self.refine_OS1 = nn.Sequential( + nn.Conv2d(32, 32, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2, bias=False), + norm_layer(32), + self.leaky_relu, + nn.Conv2d(32, 1, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2),) + + self.refine_OS4 = nn.Sequential( + nn.Conv2d(64, 32, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2, bias=False), + norm_layer(32), + self.leaky_relu, + nn.Conv2d(32, 1, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2),) + + self.refine_OS8 = nn.Sequential( + nn.Conv2d(128, 32, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2, bias=False), + norm_layer(32), + self.leaky_relu, + nn.Conv2d(32, 1, kernel_size=self.kernel_size, stride=1, padding=self.kernel_size//2),) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + if hasattr(m, "weight_bar"): + nn.init.xavier_uniform_(m.weight_bar) + else: + nn.init.xavier_uniform_(m.weight) + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # Zero-initialize the last BN in each residual branch, + # so that the residual branch starts with zeros, and each residual block behaves like an identity. 
+ # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677 + for m in self.modules(): + if isinstance(m, BasicBlock): + nn.init.constant_(m.bn2.weight, 0) + + self.logger.debug(self) + + def _make_layer(self, block, planes, blocks, stride=1): + if blocks == 0: + return nn.Sequential(nn.Identity()) + norm_layer = self._norm_layer + upsample = None + if stride != 1: + upsample = nn.Sequential( + nn.UpsamplingNearest2d(scale_factor=2), + SpectralNorm(conv1x1(self.inplanes + 4, planes * block.expansion)), + norm_layer(planes * block.expansion), + ) + elif self.inplanes != planes * block.expansion: + upsample = nn.Sequential( + SpectralNorm(conv1x1(self.inplanes + 4, planes * block.expansion)), + norm_layer(planes * block.expansion), + ) + + layers = [block(self.inplanes + 4, planes, stride, upsample, norm_layer, self.large_kernel)] + self.inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append(block(self.inplanes, planes, norm_layer=norm_layer, large_kernel=self.large_kernel)) + + return nn.Sequential(*layers) + + def forward(self, x_os16, img, mask): + ret = {} + mask_os16 = F.interpolate(mask, x_os16.shape[2:], mode='bilinear', align_corners=False) + img_os16 = F.interpolate(img, x_os16.shape[2:], mode='bilinear', align_corners=False) + + x = self.layer2(torch.cat((x_os16, img_os16, mask_os16), dim=1)) # N x 128 x 128 x 128 + + x_os8 = self.refine_OS8(x) + + mask_os8 = F.interpolate(mask, x.shape[2:], mode='bilinear', align_corners=False) + img_os8 = F.interpolate(img, x.shape[2:], mode='bilinear', align_corners=False) + + x = self.layer3(torch.cat((x, img_os8, mask_os8), dim=1)) # N x 64 x 256 x 256 + + x_os4 = self.refine_OS4(x) + + mask_os4 = F.interpolate(mask, x.shape[2:], mode='bilinear', align_corners=False) + img_os4 = F.interpolate(img, x.shape[2:], mode='bilinear', align_corners=False) + + x = self.layer4(torch.cat((x, img_os4, mask_os4), dim=1)) # N x 32 x 512 x 512 + x = self.conv1(x) + x = self.bn1(x) + x = self.leaky_relu(x) # N x 32 x 1024 x 1024 + + x_os1 = self.refine_OS1(x) # N + + x_os4 = F.interpolate(x_os4, scale_factor=4.0, mode='bilinear', align_corners=False) + x_os8 = F.interpolate(x_os8, scale_factor=8.0, mode='bilinear', align_corners=False) + + x_os1 = (torch.tanh(x_os1) + 1.0) / 2.0 + x_os4 = (torch.tanh(x_os4) + 1.0) / 2.0 + x_os8 = (torch.tanh(x_os8) + 1.0) / 2.0 + + mask_os1 = F.interpolate(mask, x_os1.shape[2:], mode='bilinear', align_corners=False) + + ret['alpha_os1'] = x_os1 + ret['alpha_os4'] = x_os4 + ret['alpha_os8'] = x_os8 + ret['mask'] = mask_os1 + + return ret \ No newline at end of file diff --git a/thirdparty/mam/ops.py b/thirdparty/mam/ops.py new file mode 100644 index 0000000..af88f6e --- /dev/null +++ b/thirdparty/mam/ops.py @@ -0,0 +1,134 @@ +import torch +from torch import nn +from torch.nn import Parameter + + +def l2normalize(v, eps=1e-12): + return v / (v.norm() + eps) + + +class SpectralNorm(nn.Module): + """ + Based on https://github.com/heykeetae/Self-Attention-GAN/blob/master/spectral.py + and add _noupdate_u_v() for evaluation + """ + def __init__(self, module, name='weight', power_iterations=1): + super(SpectralNorm, self).__init__() + self.module = module + self.name = name + self.power_iterations = power_iterations + if not self._made_params(): + self._make_params() + + def _update_u_v(self): + u = getattr(self.module, self.name + "_u") + v = getattr(self.module, self.name + "_v") + w = getattr(self.module, self.name + "_bar") + + height = w.data.shape[0] + for _ in 
range(self.power_iterations): + v.data = l2normalize(torch.mv(torch.t(w.view(height,-1).data), u.data)) + u.data = l2normalize(torch.mv(w.view(height,-1).data, v.data)) + + sigma = u.dot(w.view(height, -1).mv(v)) + setattr(self.module, self.name, w / sigma.expand_as(w)) + + def _noupdate_u_v(self): + u = getattr(self.module, self.name + "_u") + v = getattr(self.module, self.name + "_v") + w = getattr(self.module, self.name + "_bar") + + height = w.data.shape[0] + sigma = u.dot(w.view(height, -1).mv(v)) + setattr(self.module, self.name, w / sigma.expand_as(w)) + + def _made_params(self): + try: + u = getattr(self.module, self.name + "_u") + v = getattr(self.module, self.name + "_v") + w = getattr(self.module, self.name + "_bar") + return True + except AttributeError: + return False + + def _make_params(self): + w = getattr(self.module, self.name) + + height = w.data.shape[0] + width = w.view(height, -1).data.shape[1] + + u = Parameter(w.data.new(height).normal_(0, 1), requires_grad=False) + v = Parameter(w.data.new(width).normal_(0, 1), requires_grad=False) + u.data = l2normalize(u.data) + v.data = l2normalize(v.data) + w_bar = Parameter(w.data) + + del self.module._parameters[self.name] + + self.module.register_parameter(self.name + "_u", u) + self.module.register_parameter(self.name + "_v", v) + self.module.register_parameter(self.name + "_bar", w_bar) + + def forward(self, *args): + # if torch.is_grad_enabled() and self.module.training: + if self.module.training: + self._update_u_v() + else: + self._noupdate_u_v() + return self.module.forward(*args) + + +class ASPP(nn.Module): + ''' + based on https://github.com/chenxi116/DeepLabv3.pytorch/blob/master/deeplab.py + ''' + def __init__(self, in_channel, out_channel, conv=nn.Conv2d, norm=nn.BatchNorm2d): + super(ASPP, self).__init__() + mid_channel = 256 + dilations = [1, 2, 4, 8] + + self.global_pooling = nn.AdaptiveAvgPool2d(1) + self.relu = nn.ReLU(inplace=True) + self.aspp1 = conv(in_channel, mid_channel, kernel_size=1, stride=1, dilation=dilations[0], bias=False) + self.aspp2 = conv(in_channel, mid_channel, kernel_size=3, stride=1, + dilation=dilations[1], padding=dilations[1], + bias=False) + self.aspp3 = conv(in_channel, mid_channel, kernel_size=3, stride=1, + dilation=dilations[2], padding=dilations[2], + bias=False) + self.aspp4 = conv(in_channel, mid_channel, kernel_size=3, stride=1, + dilation=dilations[3], padding=dilations[3], + bias=False) + self.aspp5 = conv(in_channel, mid_channel, kernel_size=1, stride=1, bias=False) + self.aspp1_bn = norm(mid_channel) + self.aspp2_bn = norm(mid_channel) + self.aspp3_bn = norm(mid_channel) + self.aspp4_bn = norm(mid_channel) + self.aspp5_bn = norm(mid_channel) + self.conv2 = conv(mid_channel * 5, out_channel, kernel_size=1, stride=1, + bias=False) + self.bn2 = norm(out_channel) + + def forward(self, x): + x1 = self.aspp1(x) + x1 = self.aspp1_bn(x1) + x1 = self.relu(x1) + x2 = self.aspp2(x) + x2 = self.aspp2_bn(x2) + x2 = self.relu(x2) + x3 = self.aspp3(x) + x3 = self.aspp3_bn(x3) + x3 = self.relu(x3) + x4 = self.aspp4(x) + x4 = self.aspp4_bn(x4) + x4 = self.relu(x4) + x5 = self.global_pooling(x) + x5 = self.aspp5(x5) + x5 = self.aspp5_bn(x5) + x5 = self.relu(x5) + x5 = nn.Upsample((x.shape[2], x.shape[3]), mode='nearest')(x5) + x = torch.cat((x1, x2, x3, x4, x5), 1) + x = self.conv2(x) + x = self.bn2(x) + x = self.relu(x) + return x \ No newline at end of file diff --git a/thirdparty/mam/utils.py b/thirdparty/mam/utils.py new file mode 100644 index 0000000..0c3f0c8 --- /dev/null +++ 
b/thirdparty/mam/utils.py @@ -0,0 +1,110 @@ +import cv2 +import torch +import numpy as np + + +Kernels = [None] + [cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size)) for size in range(1,30)] +def get_unknown_tensor_from_pred(pred: torch.Tensor, rand_width=30, train_mode=True): + ### pred: N, 1 ,H, W + N, C, H, W = pred.shape + + pred = pred.data.cpu().numpy() + uncertain_area = np.ones_like(pred, dtype=np.uint8) + uncertain_area[pred<1.0/255.0] = 0 + uncertain_area[pred>1-1.0/255.0] = 0 + for n in range(N): + uncertain_area_ = uncertain_area[n,0,:,:] # H, W + if train_mode: + width = np.random.randint(1, rand_width) + else: + width = rand_width // 2 + uncertain_area_ = cv2.dilate(uncertain_area_, Kernels[width]) + uncertain_area[n,0,:,:] = uncertain_area_ + weight = np.zeros_like(uncertain_area) + weight[uncertain_area == 1] = 1 + weight = torch.from_numpy(weight).cuda() + + return weight + +def get_unknown_tensor_from_pred_oneside(pred: torch.Tensor, rand_width=30, train_mode=True): + ### pred: N, 1 ,H, W + N, C, H, W = pred.shape + pred = pred.data.cpu().numpy() + uncertain_area = np.ones_like(pred, dtype=np.uint8) + uncertain_area[pred<1.0/255.0] = 0 + #uncertain_area[pred>1-1.0/255.0] = 0 + for n in range(N): + uncertain_area_ = uncertain_area[n,0,:,:] # H, W + if train_mode: + width = np.random.randint(1, rand_width) + else: + width = rand_width // 2 + uncertain_area_ = cv2.dilate(uncertain_area_, Kernels[width]) + uncertain_area[n,0,:,:] = uncertain_area_ + uncertain_area[pred>1-1.0/255.0] = 0 + #weight = np.zeros_like(uncertain_area) + #weight[uncertain_area == 1] = 1 + weight = torch.from_numpy(uncertain_area).cuda() + return weight + +Kernels_mask = [None] + [cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size)) for size in range(1,30)] +def get_unknown_tensor_from_mask(mask: torch.Tensor, rand_width=30, train_mode=True): + """ + get 1-channel unknown area tensor from the 3-channel/1-channel trimap tensor + """ + N, C, H, W = mask.shape + mask_c = mask.data.cpu().numpy().astype(np.uint8) + + weight = np.ones_like(mask_c, dtype=np.uint8) + + for n in range(N): + if train_mode: + width = np.random.randint(rand_width // 2, rand_width) + else: + width = rand_width // 2 + fg_mask = cv2.erode(mask_c[n,0], Kernels_mask[width]) + bg_mask = cv2.erode(1 - mask_c[n,0], Kernels_mask[width]) + weight[n,0][fg_mask==1] = 0 + weight[n,0][bg_mask==1] = 0 + weight = torch.from_numpy(weight).cuda() + return weight + +def get_unknown_tensor_from_mask_oneside(mask: torch.Tensor, rand_width=30, train_mode=True): + """ + get 1-channel unknown area tensor from the 3-channel/1-channel trimap tensor + """ + N, C, H, W = mask.shape + mask_c = mask.data.cpu().numpy().astype(np.uint8) + + weight = np.ones_like(mask_c, dtype=np.uint8) + + for n in range(N): + if train_mode: + width = np.random.randint(rand_width // 2, rand_width) + else: + width = rand_width // 2 + #fg_mask = cv2.erode(mask_c[n,0], Kernels_mask[width]) + fg_mask = mask_c[n,0] + bg_mask = cv2.erode(1 - mask_c[n,0], Kernels_mask[width]) + weight[n,0][fg_mask==1] = 0 + weight[n,0][bg_mask==1] = 0 + weight = torch.from_numpy(weight).cuda() + return weight + +def get_unknown_box_from_mask(mask: torch.Tensor): + """ + get 1-channel unknown area tensor from the 3-channel/1-channel trimap tensor + """ + N, C, H, W = mask.shape + mask_c = mask.data.cpu().numpy().astype(np.uint8) + + weight = np.ones_like(mask_c, dtype=np.uint8) + fg_set = np.where(mask_c[0][0] != 0) + x_min = np.min(fg_set[1]) + x_max = np.max(fg_set[1]) + y_min = 
np.min(fg_set[0]) + y_max = np.max(fg_set[0]) + + weight[0, 0, y_min:y_max, x_min:x_max] = 0 + weight = torch.from_numpy(weight).cuda() + return weight \ No newline at end of file diff --git a/sam_hq/automatic.py b/thirdparty/sam_hq/automatic.py similarity index 100% rename from sam_hq/automatic.py rename to thirdparty/sam_hq/automatic.py diff --git a/sam_hq/build_sam_hq.py b/thirdparty/sam_hq/build_sam_hq.py similarity index 92% rename from sam_hq/build_sam_hq.py rename to thirdparty/sam_hq/build_sam_hq.py index df28532..a736b81 100644 --- a/sam_hq/build_sam_hq.py +++ b/thirdparty/sam_hq/build_sam_hq.py @@ -45,12 +45,12 @@ def build_sam_hq_vit_b(checkpoint=None): sam_model_registry = { - "sam_vit_h": build_sam_vit_h, - "sam_vit_l": build_sam_vit_l, - "sam_vit_b": build_sam_vit_b, - "sam_hq_vit_h": build_sam_hq_vit_h, - "sam_hq_vit_l": build_sam_hq_vit_l, - "sam_hq_vit_b": build_sam_hq_vit_b, + "sam_vit_h_4b8939.pth" : build_sam_vit_h, + "sam_vit_l_0b3195.pth" : build_sam_vit_l, + "sam_vit_b_01ec64.pth" : build_sam_vit_b, + "sam_hq_vit_h.pth" : build_sam_hq_vit_h, + "sam_hq_vit_l.pth" : build_sam_hq_vit_l, + "sam_hq_vit_b.pth" : build_sam_hq_vit_b, } diff --git a/sam_hq/modeling/image_encoder.py b/thirdparty/sam_hq/modeling/image_encoder.py similarity index 100% rename from sam_hq/modeling/image_encoder.py rename to thirdparty/sam_hq/modeling/image_encoder.py diff --git a/sam_hq/modeling/mask_decoder_hq.py b/thirdparty/sam_hq/modeling/mask_decoder_hq.py similarity index 100% rename from sam_hq/modeling/mask_decoder_hq.py rename to thirdparty/sam_hq/modeling/mask_decoder_hq.py diff --git a/sam_hq/predictor.py b/thirdparty/sam_hq/predictor.py similarity index 99% rename from sam_hq/predictor.py rename to thirdparty/sam_hq/predictor.py index 40e3162..980525f 100644 --- a/sam_hq/predictor.py +++ b/thirdparty/sam_hq/predictor.py @@ -1,10 +1,8 @@ from typing import Optional, Tuple -import numpy as np import torch from segment_anything import SamPredictor from segment_anything.modeling import Sam - class SamPredictorHQ(SamPredictor): def __init__( @@ -21,7 +19,8 @@ def __init__( """ super().__init__(sam_model=sam_model) self.is_hq = sam_is_hq - + self.model = self.model.eval() + @torch.no_grad() def set_torch_image(