Commit

Merge pull request #14 from Eating-Wisely-Labs/GPT4VisionAPI_Update_PR1
Gpt4 vision api update pr1
kyegomez authored Jan 14, 2025
2 parents 76b76af + f06ef08 commit 338e7c4
Showing 3 changed files with 80 additions and 31 deletions.

10 changes: 5 additions & 5 deletions .github/labeler.yml

@@ -7,7 +7,7 @@ root:
   - any-glob-to-any-file: '*'
 
 # Add 'Documentation' label to any changes within 'docs' folder or any subfolders
-Documentation:
+DocumentationChanged:
 - changed-files:
   - any-glob-to-any-file: docs/**

@@ -27,14 +27,14 @@ ghactions:
 Scripts:
 - changed-files:
   - any-glob-to-any-file: scripts/*
 
 ## Equivalent of the above mentioned configuration using another syntax
-Documentation:
+DocsAndGuides:
 - changed-files:
   - any-glob-to-any-file: ['docs/*', 'guides/*']
 
-# Add 'Documentation' label to any change to .md files within the entire repository
-Documentation:
+# Add 'DocsMarkdown' label to any change to .md files within the entire repository
+DocsMarkdown:
 - changed-files:
   - any-glob-to-any-file: '**/*.md'
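
A note on why the rules above could not all keep the key name Documentation: the labeler config is a single YAML mapping, and duplicate mapping keys are either rejected or silently collapsed to one entry depending on the YAML loader, so only one of the three rules could ever take effect. Renaming them to DocumentationChanged, DocsAndGuides, and DocsMarkdown keeps all three rules (and three distinct labels) active. The sketch below is not part of this commit; it only demonstrates the collapse with PyYAML, which keeps the last occurrence of a duplicated key, using abbreviated rule contents from the diff above.

# Minimal sketch, assuming PyYAML is installed: duplicate top-level keys collapse,
# so two 'Documentation' rules become one after parsing.
import yaml

config = """
Documentation:
- changed-files:
  - any-glob-to-any-file: docs/**
Documentation:
- changed-files:
  - any-glob-to-any-file: ['docs/*', 'guides/*']
"""

parsed = yaml.safe_load(config)
print(list(parsed.keys()))       # ['Documentation'] -- only one rule survives
print(parsed["Documentation"])   # the last rule's contents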

1 change: 1 addition & 0 deletions .github/workflows/label.yml

@@ -20,3 +20,4 @@ jobs:
     - uses: actions/[email protected]
       with:
         repo-token: "${{ secrets.GITHUB_TOKEN }}"
+        configuration-path: .github/labeler.yml

100 changes: 74 additions & 26 deletions swarm_models/gpt4_vision_api.py

@@ -59,7 +59,7 @@ class GPT4VisionAPI(BaseMultiModalModel):
     def __init__(
         self,
         openai_api_key: str = openai_api_key,
-        model_name: str = "gpt-4-vision-preview",
+        model_name: str = "gpt-4o-mini",
         logging_enabled: bool = False,
         max_workers: int = 10,
         max_tokens: str = 300,

@@ -71,7 +71,7 @@ def __init__(
         *args,
         **kwargs,
     ):
-        super(GPT4VisionAPI).__init__(*args, **kwargs)
+        super(GPT4VisionAPI, self).__init__(*args, **kwargs)
         self.openai_api_key = openai_api_key
         self.logging_enabled = logging_enabled
         self.model_name = model_name
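
For context on the one-line fix above: super() called with only the class returns an unbound super object, so the chained BaseMultiModalModel initializer never actually ran. A minimal standalone sketch of the difference, not taken from this repository:

class Base:
    def __init__(self):
        self.ready = True

class Broken(Base):
    def __init__(self):
        super(Broken).__init__()       # unbound super: Base.__init__ is never called

class Fixed(Base):
    def __init__(self):
        super(Fixed, self).__init__()  # bound super: Base.__init__ runs

print(hasattr(Broken(), "ready"))  # False
print(hasattr(Fixed(), "ready"))   # True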

@@ -83,6 +83,7 @@ def __init__(
         self.meta_prompt = meta_prompt
         self.system_prompt = system_prompt
 
+
         if self.logging_enabled:
             logging.basicConfig(level=logging.DEBUG)
         else:

@@ -93,6 +94,7 @@ def __init__(
         if self.meta_prompt:
             self.system_prompt = self.meta_prompt_init()
 
+
     def encode_image(self, img: str):
         """Encode image to base64."""
         if not os.path.exists(img):
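
The next hunk's compose_messages relies on encode_image() producing a base64 string that gets wrapped in a data URL. A small illustrative sketch of that encoding step, with a placeholder file name:

import base64

with open("sample.jpg", "rb") as f:  # placeholder path
    encoded = base64.b64encode(f.read()).decode("utf-8")

# The form embedded into the image_url content part:
data_url = f"data:image/jpeg;base64,{encoded}"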

@@ -111,51 +113,97 @@ def download_img_then_encode(self, img: str):
         response = requests.get(img)
         return base64.b64encode(response.content).decode("utf-8")
 
+    def compose_messages(self, task: str, img: str, img_list: list = None, context: list = None):
+        """Compose the messages payload for the GPT-4 Vision API. If an invalid
+        image path is provided, None is returned, meaning the payload is not valid.
+
+        Parameters
+        ----------
+        task : str
+            The task to run the model on.
+        img : str
+            The image to run the task on.
+        img_list : list
+            A list of images to run the task on.
+        context : list
+            A list of context messages to run the task on.
+
+        Returns
+        -------
+        messages : list
+            The messages payload for the gpt-4o Vision API;
+            None means the payload is not valid.
+        """
+        # Compose the messages
+        messages = []
+        # Add the system prompt to the messages
+        messages.append({"role": "system", "content": self.system_prompt})
+        # Add the context to the messages
+        messages = messages + context if context else messages
+
+        # Compose the content
+        content = []
+        # Add the task to the content
+        content.append({"type": "text", "text": task})
+        # Add the images to the content
+        images = [img] if img else []
+        images = images + img_list if img_list else images
+        if len(images) > 0:
+            for image in images:
+                if image:
+                    if os.path.exists(image):
+                        encoded_img = self.encode_image(image)
+                        content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded_img}"}})
+                    elif image.startswith("http"):
+                        content.append({"type": "image_url", "image_url": {"url": f"{image}"}})
+                    else:
+                        logger.error(f"Image file not found: {image} or not a valid URL")
+                        print(f"Image file not found: {image} or not a valid URL")
+                        return None
+        content = {
+            "role": "user",
+            "content": content
+        }
+        messages.append(content)
+        return messages
+        return None
+
     # Function to handle vision tasks
     def run(
         self,
         task: str = None,
         img: str = None,
         multi_imgs: list = None,
         return_json: bool = False,
+        messages: list = None,
         *args,
         **kwargs,
     ):
         """Run the model."""
         try:
-            base64_image = self.encode_image(img)
             headers = {
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.openai_api_key}",
             }
+            if messages is None:
+                messages = self.compose_messages(task, img, multi_imgs, messages)
+
+            if messages is None:
+                raise ValueError("Image path is invalid, please check the image path")
+
             payload = {
                 "model": self.model_name,
-                "messages": [
-                    {
-                        "role": "system",
-                        "content": [self.system_prompt],
-                    },
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "text", "text": task},
-                            {
-                                "type": "image_url",
-                                "image_url": {
-                                    "url": f"data:image/jpeg;base64,{base64_image}"
-                                },
-                            },
-                        ],
-                    },
-                ],
                 "max_tokens": self.max_tokens,
-                **kwargs,
+                "messages": messages
             }
-            response = requests.post(headers=headers, json=payload)
+
+            response = requests.post(self.openai_proxy, headers=headers, json=payload)
+
             # Get the response as a JSON object
             response_json = response.json()
+
+
             # Return the JSON object if return_json is True
             if return_json is True:
                 print(response_json)
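
For reference, a hedged sketch (not part of this commit) of the request the reworked run() ends up sending, assuming self.openai_proxy points at OpenAI's chat-completions endpoint and the key is read from an OPENAI_API_KEY environment variable; the model name, prompt, and image URL below are placeholders:

import os
import requests

payload = {
    "model": "gpt-4o-mini",
    "max_tokens": 300,
    "messages": [
        {"role": "system", "content": "You are a multi-modal agent."},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image."},
                {"type": "image_url", "image_url": {"url": "https://example.com/sample.jpg"}},
            ],
        },
    ],
}

response = requests.post(
    "https://api.openai.com/v1/chat/completions",  # assumed value of openai_proxy
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}",
    },
    json=payload,
)
print(response.json()["choices"][0]["message"]["content"])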

@@ -198,7 +246,7 @@ def video_prompt(self, frames):
"""
PROMPT = f"""
These are frames from a video that I want to upload. Generate a compelling description that I can upload along with the video:
{frames}
"""
return PROMPT

@@ -374,4 +422,4 @@ def print_dashboard(self):
 # example, instead of '1 - 4', list as '[1], [2], [3], [4]'. These labels could be
 # numbers or letters and typically correspond to specific segments or parts of the image.
 # """
-# return META_PROMPT
+# return META_PROMPT
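
Finally, a hedged usage sketch of the updated class. The constructor and run() argument names come from this diff; the import path, file paths, and task text are assumptions, and what run() returns when return_json is False depends on code outside the hunks shown above:

from swarm_models.gpt4_vision_api import GPT4VisionAPI  # assumed package layout

# Assumes the API key is supplied the way the module expects (e.g. via environment).
model = GPT4VisionAPI(model_name="gpt-4o-mini", max_tokens=300)

result = model.run(
    task="Describe what is in this image.",
    img="photos/example.jpg",          # local path; http(s) URLs also work
    multi_imgs=["photos/second.jpg"],  # optional extra images
    return_json=False,
)
print(result)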
