[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Feb 9, 2024
1 parent 888e6b0 commit 604be69
Showing 3 changed files with 37 additions and 23 deletions.
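The page does not list which hooks ran, but the fixes below (imports regrouped, long lines re-wrapped, slice and keyword-argument spacing normalized, blank lines and trailing newlines added) are characteristic of isort, black, and end-of-file-fixer. A minimal sketch of a .pre-commit-config.yaml that would produce fixes of this kind; the hook set and pinned revisions here are assumptions, not the repository's actual config:

repos:
  - repo: https://github.com/psf/black
    rev: 24.1.1  # assumed revision, for illustration
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/isort
    rev: 5.13.2  # assumed revision
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0  # assumed revision
    hooks:
      - id: end-of-file-fixer

With such a config, pre-commit run --all-files reproduces locally what the pre-commit.ci bot applies on each push.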
(file 1 of 3)
@@ -7,17 +7,22 @@
     --video_path <video_path>
 """
 
-from nemo.collections.multimodal.data.neva.conversation import Conversation, SeparatorStyle, conv_templates
+import argparse
+import os
+
+import cv2
+import numpy as np
+import torch
+
+# add new packages as below
+from PIL import Image
+
+from nemo.collections.multimodal.data.neva.conversation import Conversation, SeparatorStyle, conv_templates
+from nemo.collections.multimodal.data.video_neva.video_neva_dataset import (
+    TarOrFolderImageLoader,
+    TarOrFolderVideoLoader,
+)
 from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor
-from nemo.collections.multimodal.data.video_neva.video_neva_dataset import TarOrFolderImageLoader, TarOrFolderVideoLoader
-import argparse
-import numpy as np
-import cv2
-import os
 
 # Define constants
 DEFAULT_VIDEO_TOKEN = "<video>"
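The regrouping above is isort's default ordering: standard-library imports first, then third-party packages, then first-party (here, nemo.*) modules, each group alphabetized and separated by one blank line. A schematic of the convention, with a hypothetical name for the first-party module:

import os  # group 1: standard library

import numpy as np  # group 2: third-party packages

from myproject.utils import helper  # group 3: first-party code (placeholder module)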
@@ -26,8 +31,9 @@
 DEFAULT_VID_END_TOKEN = "<vid_end>"
 
 
-def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor,
-                     video_token_len):
+def video_neva_infer(
+    video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor, video_token_len
+):
     """
     Run inference using the Video-ChatGPT model.
@@ -51,7 +57,13 @@ def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tok
 
     # Prepare question string for the model
     if model.get_model().vision_config.use_vid_start_end:
-        qs = question + '\n' + DEFAULT_VID_START_TOKEN + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len + DEFAULT_VID_END_TOKEN
+        qs = (
+            question
+            + '\n'
+            + DEFAULT_VID_START_TOKEN
+            + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len
+            + DEFAULT_VID_END_TOKEN
+        )
     else:
         qs = question + '\n' + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len
 
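The re-wrapped qs assignment, like the def signature in the previous hunk, shows how black handles lines over its 88-character default: the expression is wrapped in parentheses and split with one operand or argument per line. A before/after sketch with placeholder names:

# before: one over-long line
message = prefix + '\n' + patch_token * token_len + suffix

# after black, when the line exceeds the length limit
message = (
    prefix
    + '\n'
    + patch_token * token_len
    + suffix
)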
@@ -72,17 +84,15 @@ def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tok
 
     # Run model inference
     with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids,
-            **inference_config)
+        output_ids = model.generate(input_ids, **inference_config)
 
     # Check if output is the same as input
-    n_diff_input_output = (input_ids != output_ids[:, :input_ids.shape[1]]).sum().item()
+    n_diff_input_output = (input_ids != output_ids[:, : input_ids.shape[1]]).sum().item()
     if n_diff_input_output > 0:
         print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
 
     # Decode output tokens
-    outputs = tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
+    outputs = tokenizer.batch_decode(output_ids[:, input_ids.shape[1] :], skip_special_tokens=True)[0]
 
     return outputs
 
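The two slice edits in this hunk follow black's rule, taken from PEP 8, of treating the slice colon like a binary operator: when a bound is a complex expression such as input_ids.shape[1], the colon gets a space on that side, and a side whose bound is omitted gets none. On a placeholder sequence x:

x[1:4]             # simple bounds: no spaces around the colon
x[: seq_len + 1]   # complex upper bound: space after the colon
x[seq_len + 1 :]   # complex lower bound: space before the colon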
@@ -105,13 +115,14 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
-    model, vision_tower, tokenizer, image_processor, video_token_len = \
-        create_neva_model_and_processor(args.model_name, args.projection_path)
+    model, vision_tower, tokenizer, image_processor, video_token_len = create_neva_model_and_processor(
+        args.model_name, args.projection_path
+    )
 
     video_path = args.video_path
 
     if os.path.exists(video_path):
-        video_data_loader = TarOrFolderVideoLoader(video_folder = video_path)
+        video_data_loader = TarOrFolderVideoLoader(video_folder=video_path)
         video_object = video_data_loader.open_video(video_path)
         width = video_object.get(cv2.CAP_PROP_FRAME_WIDTH)
         height = video_object.get(cv2.CAP_PROP_FRAME_HEIGHT)
@@ -127,9 +138,10 @@ def parse_args():
 
         try:
             # Run inference on the video and add the output to the list
-            output = video_neva_infer(video_frames, question, conv_mode, model, vision_tower,
-                                      tokenizer, image_processor, video_token_len)
+            output = video_neva_infer(
+                video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor, video_token_len
+            )
             print("\n\n", output)
 
         except Exception as e:
-            print(f"Error processing video file '{video_path}': {e}")
\ No newline at end of file
+            print(f"Error processing video file '{video_path}': {e}")
(file 2 of 3)
@@ -57,6 +57,7 @@
 MAX_NUM_VIDEOS = 1
 IGNORE_INDEX = -1
 
+
 class TarOrFolderVideoLoader:
     """
     A class for loading images from a tar archive or a regular folder.
@@ -124,6 +125,7 @@ def flatten_frames(self, cap, num_frames):
 
         return frames_array
 
+
 class TarOrFolderImageLoader:
     """
     A class for loading images and videos from a tar archive or a regular folder.
@@ -167,4 +169,4 @@ def open_image(self, file_name):
                 return Image.open(f).convert('RGB')
         else:
             return Image.open(os.path.join(self.image_folder, file_name)).convert('RGB')
-        return None
\ No newline at end of file
+        return None
(file 3 of 3)
@@ -29,6 +29,7 @@
     DataCollatorForSupervisedDataset,
     make_supervised_data_module,
 )
+from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import FrozenCLIPVisionTransformer
 from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import (
     CLIPVisionTransformer,
     MegatronCLIPModel,
@@ -58,8 +59,6 @@
 from nemo.core.classes.common import PretrainedModelInfo
 from nemo.utils import logging
 
-from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import FrozenCLIPVisionTransformer
-
 try:
     import apex.transformer.pipeline_parallel.utils
 
@@ -79,6 +78,7 @@
 
     HAVE_MEGATRON_CORE = False
 
+
 class NevaWordEmbeddingMixin(torch.nn.Module, adapter_mixins.AdapterModuleMixin):
     """
     A mixin class for integrating vision-based embeddings into language models.
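The lone blank lines added in this hunk and in the dataset file above are black normalizing vertical whitespace: PEP 8 asks for two blank lines before a top-level class or function definition. Schematically, with placeholder names:

HAVE_SOMETHING = False


class MyMixin:  # exactly two blank lines above a top-level definition
    ...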