diff --git a/backend/python/vllm/backend.py b/backend/python/vllm/backend.py
index 98f292ab0317..023a14bccbf6 100644
--- a/backend/python/vllm/backend.py
+++ b/backend/python/vllm/backend.py
@@ -219,13 +219,15 @@ async def _predict(self, request, context, streaming=False):
         # Generate text using the LLM engine
         request_id = random_uuid()
         print(f"Generating text with request_id: {request_id}", file=sys.stderr)
+        multi_modal_data = {}
+        if image_data:
+            multi_modal_data["image"] = image_data
+        if video_data:
+            multi_modal_data["video"] = video_data
         outputs = self.llm.generate(
             {
-                "prompt": prompt,
-                "multi_modal_data": {
-                    "image": image_data if image_data else None,
-                    "video": video_data if video_data else None,
-                } if image_data or video_data else None,
+                "prompt": prompt,
+                "multi_modal_data": multi_modal_data if multi_modal_data else None,
             },
             sampling_params=sampling_params,
             request_id=request_id,
@@ -279,7 +281,7 @@ def load_image(self, image_path: str):
             return image
         except Exception as e:
             print(f"Error loading image {image_path}: {e}", file=sys.stderr)
-            return self.load_video(image_path)
+            return None

     def load_video(self, video_path: str):
         """
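
For reviewers, a minimal standalone sketch of the dict-building pattern the first hunk introduces (the helper name and stand-in payload values are hypothetical, not part of this PR). The old inline expression could hand vLLM a `multi_modal_data` dict containing a key explicitly mapped to `None` whenever only one modality was present; building the dict incrementally and collapsing an empty dict to `None` avoids that:

```python
# Hypothetical helper illustrating the pattern from the diff above.
# image_data / video_data stand in for the media payloads that
# backend.py decodes from the incoming request.
def build_multi_modal_data(image_data=None, video_data=None):
    # Only add keys that actually carry data, so the engine never
    # receives an entry explicitly mapped to None.
    multi_modal_data = {}
    if image_data:
        multi_modal_data["image"] = image_data
    if video_data:
        multi_modal_data["video"] = video_data
    # An empty dict collapses to None, matching the diff's
    # `multi_modal_data if multi_modal_data else None`.
    return multi_modal_data or None

# Text-only requests yield None; media requests yield only the keys present.
assert build_multi_modal_data() is None
assert build_multi_modal_data(image_data="<image bytes>") == {"image": "<image bytes>"}
```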