You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
RAG.index(
input_path=pdf_filepath,
index_name="image_index", # index will be saved at .byaldi/index_name/
store_collection_with_index=False,
overwrite=True)`
I'm getting this error: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.
RuntimeError Traceback (most recent call last)
Cell In[7], line 1
----> 1 RAG.index(
2 input_path=pdf_filepath,
3 index_name="image_index", # index will be saved at .byaldi/index_name/
4 store_collection_with_index=False,
5 overwrite=True)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:496, in PaliGemmaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, token_type_ids, cache_position, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, num_logits_to_keep)
494 # Merge text and images
495 if pixel_values is not None:
--> 496 image_features = self.get_image_features(pixel_values)
498 special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
499 special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:405, in PaliGemmaForConditionalGeneration.get_image_features(self, pixel_values)
395 def get_image_features(self, pixel_values: torch.FloatTensor):
396 """
397 Obtains image last hidden states from the vision tower and apply multimodal projection.
398
(...)
403 image_features (torch.Tensor): Image feature tensor of shape (num_images, image_length, embed_dim)).
404 """
--> 405 image_outputs = self.vision_tower(pixel_values)
406 selected_image_feature = image_outputs.last_hidden_state
407 image_features = self.multi_modal_projector(selected_image_feature)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1190, in SiglipVisionModel.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1166 r"""
1167 Returns:
1168
(...)
1186 >>> pooled_output = outputs.pooler_output # pooled features
1187 ```"""
1188 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1190 return self.vision_model(
1191 pixel_values=pixel_values,
1192 output_attentions=output_attentions,
1193 output_hidden_states=output_hidden_states,
1194 return_dict=return_dict,
1195 interpolate_pos_encoding=interpolate_pos_encoding,
1196 )
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1089, in SiglipVisionTransformer.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1084 output_hidden_states = (
1085 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1086 )
1087 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1089 hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
1091 encoder_outputs = self.encoder(
1092 inputs_embeds=hidden_states,
1093 output_attentions=output_attentions,
1094 output_hidden_states=output_hidden_states,
1095 return_dict=return_dict,
1096 )
1098 last_hidden_state = encoder_outputs[0]
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
I'm trying to run this code:
`from byaldi import RAGMultiModalModel
RAG = RAGMultiModalModel.from_pretrained("vidore/colpali")
RAG.index(
input_path=pdf_filepath,
index_name="image_index", # index will be saved at .byaldi/index_name/
store_collection_with_index=False,
overwrite=True)`
I'm getting this error: RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same
You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text and `<bos>` token after that. For this call, we will infer how many images each text has and add special tokens.

RuntimeError Traceback (most recent call last)
Cell In[7], line 1
----> 1 RAG.index(
2 input_path=pdf_filepath,
3 index_name="image_index", # index will be saved at .byaldi/index_name/
4 store_collection_with_index=False,
5 overwrite=True)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/RAGModel.py:124, in RAGMultiModalModel.index(self, input_path, index_name, doc_ids, store_collection_with_index, overwrite, metadata, max_image_width, max_image_height, **kwargs)
92 def index(
93 self,
94 input_path: Union[str, Path],
(...)
107 **kwargs,
108 ):
109 """Build an index from input documents.
110
111 Parameters:
(...)
122 None
123 """
--> 124 return self.model.index(
125 input_path,
126 index_name,
127 doc_ids,
128 store_collection_with_index,
129 overwrite=overwrite,
130 metadata=metadata,
131 max_image_width=max_image_width,
132 max_image_height=max_image_height,
133 **kwargs,
134 )
File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:378, in ColPaliModel.index(self, input_path, index_name, doc_ids, store_collection_with_index, overwrite, metadata, max_image_width, max_image_height)
376 doc_id = doc_ids[0] if doc_ids else self.highest_doc_id + 1
377 doc_metadata = metadata[0] if metadata else None
--> 378 self.add_to_index(
379 input_path,
380 store_collection_with_index,
381 doc_id=doc_id,
382 metadata=doc_metadata,
383 )
384 self.doc_ids_to_file_names[doc_id] = str(input_path)
386 self._export_index()
File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:448, in ColPaliModel.add_to_index(self, input_item, store_collection_with_index, doc_id, metadata)
441 self._process_directory(
442 item_path,
443 store_collection_with_index,
444 current_doc_id,
445 current_metadata,
446 )
447 else:
--> 448 self._process_and_add_to_index(
449 item_path,
450 store_collection_with_index,
451 current_doc_id,
452 current_metadata,
453 )
454 self.doc_ids_to_file_names[current_doc_id] = str(item_path)
455 elif isinstance(item, Image.Image):
File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:500, in ColPaliModel._process_and_add_to_index(self, item, store_collection_with_index, doc_id, metadata)
498 for i, image_path in enumerate(images):
499 image = Image.open(image_path)
--> 500 self._add_to_index(
501 image,
502 store_collection_with_index,
503 doc_id,
504 page_id=i + 1,
505 metadata=metadata,
506 )
507 elif item.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp"]:
508 image = Image.open(item)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/byaldi/colpali.py:542, in ColPaliModel._add_to_index(self, image, store_collection_with_index, doc_id, page_id, metadata)
540 with torch.no_grad():
541 processed_image = {k: v.to(self.device) for k, v in processed_image.items()}
--> 542 embedding = self.model(**processed_image)
544 # Add to index
545 embed_id = len(self.indexed_embeddings)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/colpali_engine/models/paligemma/colpali/modeling_colpali.py:38, in ColPali.forward(self, *args, **kwargs)
34 def forward(self, *args, **kwargs) -> torch.Tensor:
35 # Delete output_hidden_states from kwargs
36 kwargs.pop("output_hidden_states", None)
---> 38 outputs = self.model(*args, output_hidden_states=True, **kwargs) # (batch_size, sequence_length, hidden_size)
39 last_hidden_states = outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size)
40 proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:496, in PaliGemmaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, token_type_ids, cache_position, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, num_logits_to_keep)
494 # Merge text and images
495 if pixel_values is not None:
--> 496 image_features = self.get_image_features(pixel_values)
498 special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
499 special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/paligemma/modeling_paligemma.py:405, in PaliGemmaForConditionalGeneration.get_image_features(self, pixel_values)
395 def get_image_features(self, pixel_values: torch.FloatTensor):
396 """
397 Obtains image last hidden states from the vision tower and apply multimodal projection.
398
(...)
    403 image_features (torch.Tensor): Image feature tensor of shape (num_images, image_length, embed_dim)).
    404 """
--> 405 image_outputs = self.vision_tower(pixel_values)
406 selected_image_feature = image_outputs.last_hidden_state
407 image_features = self.multi_modal_projector(selected_image_feature)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1190, in SiglipVisionModel.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1166 r"""
1167 Returns:
1168
(...)
1186 >>> pooled_output = outputs.pooler_output # pooled features
1187 ```"""
1188 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1190 return self.vision_model(
1191 pixel_values=pixel_values,
1192 output_attentions=output_attentions,
1193 output_hidden_states=output_hidden_states,
1194 return_dict=return_dict,
1195 interpolate_pos_encoding=interpolate_pos_encoding,
1196 )
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:1089, in SiglipVisionTransformer.forward(self, pixel_values, output_attentions, output_hidden_states, return_dict, interpolate_pos_encoding)
1084 output_hidden_states = (
1085 output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1086 )
1087 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-> 1089 hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
1091 encoder_outputs = self.encoder(
1092 inputs_embeds=hidden_states,
1093 output_attentions=output_attentions,
1094 output_hidden_states=output_hidden_states,
1095 return_dict=return_dict,
1096 )
1098 last_hidden_state = encoder_outputs[0]
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/transformers/models/siglip/modeling_siglip.py:311, in SiglipVisionEmbeddings.forward(self, pixel_values, interpolate_pos_encoding)
309 def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
310 _, _, height, width = pixel_values.shape
--> 311 patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
312 embeddings = patch_embeds.flatten(2).transpose(1, 2)
314 if interpolate_pos_encoding:
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1553, in Module._wrapped_call_impl(self, *args, **kwargs)
1551 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1552 else:
-> 1553 return self._call_impl(*args, **kwargs)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/module.py:1562, in Module._call_impl(self, *args, **kwargs)
1557 # If we don't have any hooks, we want to skip the rest of the logic in
1558 # this function, and just call forward.
1559 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1560 or _global_backward_pre_hooks or _global_backward_hooks
1561 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1562 return forward_call(*args, **kwargs)
1564 try:
1565 result = None
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/conv.py:458, in Conv2d.forward(self, input)
457 def forward(self, input: Tensor) -> Tensor:
--> 458 return self._conv_forward(input, self.weight, self.bias)
File ~/.conda/envs/test_env/lib/python3.10/site-packages/torch/nn/modules/conv.py:454, in Conv2d._conv_forward(self, input, weight, bias)
450 if self.padding_mode != 'zeros':
451 return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
452 weight, bias, self.stride,
453 _pair(0), self.dilation, self.groups)
--> 454 return F.conv2d(input, weight, bias, self.stride,
455 self.padding, self.dilation, self.groups)
RuntimeError: Input type (torch.cuda.FloatTensor) and weight type (CUDABFloat16Type) should be the same`
The text was updated successfully, but these errors were encountered: