[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Feb 9, 2024
1 parent 888e6b0 commit 604be69
Showing 3 changed files with 37 additions and 23 deletions.
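The page does not list which hooks ran, but the fixes below (imports regrouped, long lines re-wrapped, slice and keyword-argument spacing normalized, blank lines and trailing newlines added) are characteristic of isort, black, and end-of-file-fixer. A minimal sketch of a .pre-commit-config.yaml that would produce fixes of this kind; the hook set and pinned revisions here are assumptions, not the repository's actual config:

repos:
  - repo: https://github.com/psf/black
    rev: 24.1.1  # assumed revision, for illustration
    hooks:
      - id: black
  - repo: https://github.com/PyCQA/isort
    rev: 5.13.2  # assumed revision
    hooks:
      - id: isort
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0  # assumed revision
    hooks:
      - id: end-of-file-fixer

With such a config, pre-commit run --all-files reproduces locally what the pre-commit.ci bot applies on each push.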
(file 1 of 3)
@@ -7,17 +7,22 @@
     --video_path <video_path>
 """
 
-from nemo.collections.multimodal.data.neva.conversation import Conversation, SeparatorStyle, conv_templates
+import argparse
+import os
+
+import cv2
+import numpy as np
+import torch
+
+# add new packages as below
+from PIL import Image
+
+from nemo.collections.multimodal.data.neva.conversation import Conversation, SeparatorStyle, conv_templates
+from nemo.collections.multimodal.data.video_neva.video_neva_dataset import (
+    TarOrFolderImageLoader,
+    TarOrFolderVideoLoader,
+)
 from nemo.collections.multimodal.parts.utils import create_neva_model_and_processor
-from nemo.collections.multimodal.data.video_neva.video_neva_dataset import TarOrFolderImageLoader, TarOrFolderVideoLoader
-import argparse
-import numpy as np
-import cv2
-import os
 
 # Define constants
 DEFAULT_VIDEO_TOKEN = "<video>"
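The regrouping above is isort's default ordering: standard-library imports first, then third-party packages, then first-party (here, nemo.*) modules, each group alphabetized and separated by one blank line. A schematic of the convention, with a hypothetical name for the first-party module:

import os  # group 1: standard library

import numpy as np  # group 2: third-party packages

from myproject.utils import helper  # group 3: first-party code (placeholder module)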
@@ -26,8 +31,9 @@
 DEFAULT_VID_END_TOKEN = "<vid_end>"
 
 
-def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor,
-                     video_token_len):
+def video_neva_infer(
+    video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor, video_token_len
+):
     """
     Run inference using the Video-ChatGPT model.
@@ -51,7 +57,13 @@ def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tok
 
     # Prepare question string for the model
     if model.get_model().vision_config.use_vid_start_end:
-        qs = question + '\n' + DEFAULT_VID_START_TOKEN + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len + DEFAULT_VID_END_TOKEN
+        qs = (
+            question
+            + '\n'
+            + DEFAULT_VID_START_TOKEN
+            + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len
+            + DEFAULT_VID_END_TOKEN
+        )
     else:
         qs = question + '\n' + DEFAULT_VIDEO_PATCH_TOKEN * video_token_len
 
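The re-wrapped qs assignment, like the def signature in the previous hunk, shows how black handles lines over its 88-character default: the expression is wrapped in parentheses and split with one operand or argument per line. A before/after sketch with placeholder names:

# before: one over-long line
message = prefix + '\n' + patch_token * token_len + suffix

# after black, when the line exceeds the length limit
message = (
    prefix
    + '\n'
    + patch_token * token_len
    + suffix
)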
@@ -72,17 +84,15 @@ def video_neva_infer(video_frames, question, conv_mode, model, vision_tower, tok
 
     # Run model inference
     with torch.inference_mode():
-        output_ids = model.generate(
-            input_ids,
-            **inference_config)
+        output_ids = model.generate(input_ids, **inference_config)
 
     # Check if output is the same as input
-    n_diff_input_output = (input_ids != output_ids[:, :input_ids.shape[1]]).sum().item()
+    n_diff_input_output = (input_ids != output_ids[:, : input_ids.shape[1]]).sum().item()
     if n_diff_input_output > 0:
         print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')
 
     # Decode output tokens
-    outputs = tokenizer.batch_decode(output_ids[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
+    outputs = tokenizer.batch_decode(output_ids[:, input_ids.shape[1] :], skip_special_tokens=True)[0]
 
     return outputs
 
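The two slice edits in this hunk follow black's rule, taken from PEP 8, of treating the slice colon like a binary operator: when a bound is a complex expression such as input_ids.shape[1], the colon gets a space on that side, and a side whose bound is omitted gets none. On a placeholder sequence x:

x[1:4]             # simple bounds: no spaces around the colon
x[: seq_len + 1]   # complex upper bound: space after the colon
x[seq_len + 1 :]   # complex lower bound: space before the colon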
@@ -105,13 +115,14 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
 
-    model, vision_tower, tokenizer, image_processor, video_token_len = \
-        create_neva_model_and_processor(args.model_name, args.projection_path)
+    model, vision_tower, tokenizer, image_processor, video_token_len = create_neva_model_and_processor(
+        args.model_name, args.projection_path
+    )
 
     video_path = args.video_path
 
     if os.path.exists(video_path):
-        video_data_loader = TarOrFolderVideoLoader(video_folder = video_path)
+        video_data_loader = TarOrFolderVideoLoader(video_folder=video_path)
         video_object = video_data_loader.open_video(video_path)
         width = video_object.get(cv2.CAP_PROP_FRAME_WIDTH)
         height = video_object.get(cv2.CAP_PROP_FRAME_HEIGHT)
@@ -127,9 +138,10 @@ def parse_args():
 
         try:
             # Run inference on the video and add the output to the list
-            output = video_neva_infer(video_frames, question, conv_mode, model, vision_tower,
-                                      tokenizer, image_processor, video_token_len)
+            output = video_neva_infer(
+                video_frames, question, conv_mode, model, vision_tower, tokenizer, image_processor, video_token_len
+            )
             print("\n\n", output)
 
         except Exception as e:
-            print(f"Error processing video file '{video_path}': {e}")
\ No newline at end of file
+            print(f"Error processing video file '{video_path}': {e}")
(file 2 of 3)
@@ -57,6 +57,7 @@
 MAX_NUM_VIDEOS = 1
 IGNORE_INDEX = -1
 
+
 class TarOrFolderVideoLoader:
     """
     A class for loading images from a tar archive or a regular folder.
@@ -124,6 +125,7 @@ def flatten_frames(self, cap, num_frames):
 
         return frames_array
 
+
 class TarOrFolderImageLoader:
     """
     A class for loading images and videos from a tar archive or a regular folder.
@@ -167,4 +169,4 @@ def open_image(self, file_name):
                 return Image.open(f).convert('RGB')
         else:
             return Image.open(os.path.join(self.image_folder, file_name)).convert('RGB')
-        return None
\ No newline at end of file
+        return None
(file 3 of 3)
@@ -29,6 +29,7 @@
     DataCollatorForSupervisedDataset,
     make_supervised_data_module,
 )
+from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import FrozenCLIPVisionTransformer
 from nemo.collections.multimodal.models.vision_language_foundation.clip.megatron_clip_models import (
     CLIPVisionTransformer,
     MegatronCLIPModel,
@@ -58,8 +59,6 @@
 from nemo.core.classes.common import PretrainedModelInfo
 from nemo.utils import logging
 
-from nemo.collections.multimodal.models.multimodal_llm.neva.neva_model import FrozenCLIPVisionTransformer
-
 try:
     import apex.transformer.pipeline_parallel.utils
 
@@ -79,6 +78,7 @@
 
     HAVE_MEGATRON_CORE = False
 
+
 class NevaWordEmbeddingMixin(torch.nn.Module, adapter_mixins.AdapterModuleMixin):
     """
     A mixin class for integrating vision-based embeddings into language models.
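The lone blank lines added in this hunk and in the dataset file above are black normalizing vertical whitespace: PEP 8 asks for two blank lines before a top-level class or function definition. Schematically, with placeholder names:

HAVE_SOMETHING = False


class MyMixin:  # exactly two blank lines above a top-level definition
    ...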