
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same #48

imadoualid opened this issue Oct 24, 2024 · 1 comment


@imadoualid

I'm trying to run this code:

```python
from byaldi import RAGMultiModalModel

RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

RAG.index(
    input_path=pdf_filepath,
    index_name="image_index",  # index will be saved at .byaldi/index_name/
    store_collection_with_index=False,
    overwrite=True,
)
```

I'm getting this error: `RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same`

The following warning is also printed by the processor: You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.

```
RuntimeError Traceback (most recent call last)
Cell In[7], line 1
----> 1 RAG.index(
2 input_path=pdf_filepath,
3 index_name="image_index", # index will be saved at .byaldi/index_name/
4 store_collection_with_index=False,
5 overwrite=True)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/RAGModel.py:124, in RAGMultiModalModel.index(self, input_path, index_name, doc_ids, store_collection_with_index, overwrite, metadata, max_image_width, max_image_height, **kwargs)
92 def index(
93 self,
94 input_path: Union[str, Path],
(...)
107 **kwargs,
108 ):
109 """Build an index from input documents.
110
111 Parameters:
(...)
122 None
123 """
--> 124 return self.model.index(
125 input_path,
126 index_name,
127 doc_ids,
128 store_collection_with_index,
129 overwrite=overwrite,
130 metadata=metadata,
131 max_image_width=max_image_width,
132 max_image_height=max_image_height,
133 **kwargs,
134 )

File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:378, in ColPaliModel.index(self, input_path, index_name, doc_ids, store_collection_with_index, overwrite, metadata, max_image_width, max_image_height)
376 doc_id = doc_ids[0] if doc_ids else self.highest_doc_id + 1
377 doc_metadata = metadata[0] if metadata else None
--> 378 self.add_to_index(
379 input_path,
380 store_collection_with_index,
381 doc_id=doc_id,
382 metadata=doc_metadata,
383 )
384 self.doc_ids_to_file_names[doc_id] = str(input_path)
386 self._export_index()

File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:448, in ColPaliModel.add_to_index(self, input_item, store_collection_with_index, doc_id, metadata)
441 self._process_directory(
442 item_path,
443 store_collection_with_index,
444 current_doc_id,
445 current_metadata,
446 )
447 else:
--> 448 self._process_and_add_to_index(
449 item_path,
450 store_collection_with_index,
451 current_doc_id,
452 current_metadata,
453 )
454 self.doc_ids_to_file_names[current_doc_id] = str(item_path)
455 elif isinstance(item, Image.Image):

File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:500, in ColPaliModel._process_and_add_to_index(self, item, store_collection_with_index, doc_id, metadata)
498 for i, image_path in enumerate(images):
499 image = Image.open(image_path)
--> 500 self._add_to_index(
501 image,
502 store_collection_with_index,
503 doc_id,
504 page_id=i + 1,
505 metadata=metadata,
506 )
507 elif item.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
508 image = Image.open(item)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:542, in ColPaliModel._add_to_index(self, image, store_collection_with_index, doc_id, page_id, metadata)
540 with torch.no_grad():
541 processed_image = {k: v.to(self.device) for k, v in processed_image.items()}
--> 542 embedding = self.model(**processed_image)
544 # Add to index
545 embed_id = len(self.indexed_embeddings)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/colpali_engine/models/paligemma/colpali/modeling_colpali.py:38, in ColPali.forward(self, *args, **kwargs)
34 def forward(self, *args, **kwargs) -> torch.Tensor:
35 # Delete output_hidden_states from kwargs
36 kwargs.pop("output_hidden_states", None)
---> 38 outputs = self.model(*args, output_hidden_states=True, **kwargs) # (batch_size, sequence_length, hidden_size)
39 last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size)
40 proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:496, in PaliGemmaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, token_type_ids, cache_position, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, num_logits_to_keep)
494 # Merge text and images
495 if pixel_values is not None:
--> 496 image_features = self.get_image_features(pixel_values)
498 special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
499 special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:405, in PaliGemmaForConditionalGeneration.get_image_features(self, pixel_values)
395 def get_image_features(self, pixel_values: torch.FloatTensor):
396 """
397 Obtains image last hidden states from the vision tower and apply multimodal projection.
398
(...)
403 image_features (torch.Tensor): Image feature tensor of shape (num_images, image_length, embed_dim)).
404 """
--> 405 image_outputs = self.vision_tower(pixel_values)
406 selected_image_feature = image_outputs.last_hidden_state
407 image_features = self.multi_modal_projector(selected_image_feature)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1190, in SiglipVisionModel.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1166 r"""
1167 Returns:
1168
(...)
1186 >>> pooled_output = outputs.pooler_output # pooled features
1187 ```"""
1188 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1190 return self.vision_model(
1191 pixel_values=pixel_values,
1192 output_attentions=output_attentions,
1193 output_hidden_states=output_hidden_states,
1194 return_dict=return_dict,
1195 interpolate_pos_encoding=interpolate_pos_encoding,
1196 )

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1089, in SiglipVisionTransformer.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1084 output_hidden_states = (
1085 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1086 )
1087 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1089 hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
1091 encoder_outputs = self.encoder(
1092 inputs_embeds=hidden_states,
1093 output_attentions=output_attentions,
1094 output_hidden_states=output_hidden_states,
1095 return_dict=return_dict,
1096 )
1098 last_hidden_state = encoder_outputs[0]

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:311, in SiglipVisionEmbeddings.forward(self, pixel_values, interpolate_pos_encoding)
309 def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
310 _, _, height, width = pixel_values.shape
--> 311 patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
312 embeddings = patch_embeds.flatten(2).transpose(1, 2)
314 if interpolate_pos_encoding:

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/conv.py:458, in Conv2d.forward(self, input)
457 def forward(self, input: Tensor) -> Tensor:
--> 458 return self._conv_forward(input, self.weight, self.bias)

File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/conv.py:454, in Conv2d._conv_forward(self, input, weight, bias)
450 if self.padding_mode != 'zeros':
451 return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
452 weight, bias, self.stride,
453 _pair(0), self.dilation, self.groups)
--> 454 return F.conv2d(input, weight, bias, self.stride,
455 self.padding, self.dilation, self.groups)

RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same
```

@if-ai

if-ai commented Oct 25, 2024

I am getting the same error and it's driving me nuts. It's an issue with the input images being passed in full float32 while the model is loaded in bfloat16 by default.
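
If that is indeed the cause, one possible workaround is to cast the underlying torch module to float32 so the weights match the float32 pixel values. This is only a minimal sketch, not a confirmed fix: the `RAG.model.model` attribute path is inferred from the traceback above (`RAGMultiModalModel.model` is byaldi's `ColPaliModel`, whose `.model` is the ColPali torch module), so treat it as an assumption.

```python
import torch
from byaldi import RAGMultiModalModel

RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")

# Assumption: per the traceback, RAG.model is byaldi's ColPaliModel and
# RAG.model.model is the underlying ColPali torch module. Casting it to
# float32 makes the conv weights match the float32 pixel_values, at the
# cost of more memory and slower inference.
RAG.model.model = RAG.model.model.to(torch.float32)

RAG.index(
    input_path=pdf_filepath,  # assumed to be defined as in the report above
    index_name="image_index",
    store_collection_with_index=False,
    overwrite=True,
)
```

The other direction (casting the processed images to bfloat16 before the forward pass) would need a change inside byaldi's `_add_to_index`, so the float32 cast above is the less invasive thing to try first.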
