diff --git a/daiv/common/vqa_tools/vqa.py b/daiv/common/vqa_tools/vqa.py
index 1d9cf4f..4d716d6 100644
--- a/daiv/common/vqa_tools/vqa.py
+++ b/daiv/common/vqa_tools/vqa.py
@@ -189,9 +189,9 @@ def loadRes(self, resFile, quesFile):
         anns = json.load(open(resFile))
         assert type(anns) == list, "results is not an array of objects"
         annsQuesIds = [ann["question_id"] for ann in anns]
-        assert set(annsQuesIds) == set(
-            self.getQuesIds()
-        ), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
+        # assert set(annsQuesIds) == set(
+        #     self.getQuesIds()
+        # ), "Results do not correspond to current VQA set. Either the results do not have predictions for all question ids in annotation file or there is atleast one question id that does not belong to the question ids in the annotation file."
         for ann in anns:
             quesId = ann["question_id"]
             if res.dataset["task_type"] == "Multiple Choice":
@@ -208,4 +208,4 @@ def loadRes(self, resFile, quesFile):
 
         res.dataset["annotations"] = anns
         res.createIndex()
-        return res
\ No newline at end of file
+        return res, annsQuesIds
\ No newline at end of file
diff --git a/daiv/configs/datasets/okvqa/defaults.yaml b/daiv/configs/datasets/okvqa/defaults.yaml
index 470621b..ba3a09c 100644
--- a/daiv/configs/datasets/okvqa/defaults.yaml
+++ b/daiv/configs/datasets/okvqa/defaults.yaml
@@ -54,21 +54,21 @@ datasets:
           - /root/workspace/24s-VQA-MLLM/BEiT3/stage2-t5/VQA-MLLM-stage2/daiv/data/okvqa/okvqa_train.json
           # - okvqa/annotations/OpenEnded_mscoco_train2014_questions.json
           # - okvqa/annotations/mscoco_train2014_annotations.json
-        # test:
-        #   url:
-        #     # TODO make this order insensitive
-        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
-        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
-        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
-        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
-        #   storage:
-        #     # - okvqa/annotations/vqa_val_eval.json
-        #     # - okvqa/annotations/answer_list.json
-        #     # - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
-        #     # - okvqa/annotations/mscoco_val2014_annotations.json
-        #     - /root/workspace/24s-VQA-MLLM/BEiT3/stage2-eval/VQA-MLLM-stage2/daiv/data/okvqa/okvqa_val.json
-        #     - /root/datasets/okvqa/data/assets/answer_dict_okvqa.json
-        #     - /root/datasets/okvqa/data/okvqa/OpenEnded_mscoco_val2014_questions.json
-        #     - /root/datasets/okvqa/data/okvqa/mscoco_val2014_annotations.json
+        test:
+          url:
+            # TODO make this order insensitive
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_val_eval.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/okvqa_answer_list_train.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/okvqa/mscoco_val2014_annotations.json
+          storage:
+            # - okvqa/annotations/vqa_val_eval.json
+            # - okvqa/annotations/answer_list.json
+            # - okvqa/annotations/OpenEnded_mscoco_val2014_questions.json
+            # - okvqa/annotations/mscoco_val2014_annotations.json
+            - /root/workspace/24s-VQA-MLLM/BEiT3/stage2-t5/VQA-MLLM-stage2/daiv/data/okvqa/okvqa_val.json #testing
+            - /root/datasets/okvqa/data/assets/answer_dict_okvqa.json
+            - /root/datasets/okvqa/data/okvqa/OpenEnded_mscoco_val2014_questions.json
+            - /root/datasets/okvqa/data/okvqa/mscoco_val2014_annotations.json
       images:
         storage: /root/datasets/okvqa/data
\ No newline at end of file
diff --git a/daiv/configs/models/blip2_instruct_flant5xl.yaml b/daiv/configs/models/blip2_instruct_flant5xl.yaml
index 15513ad..17daf03 100644
--- a/daiv/configs/models/blip2_instruct_flant5xl.yaml
+++ b/daiv/configs/models/blip2_instruct_flant5xl.yaml
@@ -8,7 +8,7 @@ model:
   load_finetuned: False
   #load_pretrained: True
 
-  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth"
+  # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth"
   finetuned: ""
 
   # vit encoder
diff --git a/daiv/data/okvqa/okvqa_val_one.json b/daiv/data/okvqa/okvqa_val_one.json
new file mode 100644
index 0000000..f796ec9
--- /dev/null
+++ b/daiv/data/okvqa/okvqa_val_one.json
@@ -0,0 +1 @@
+{"license": {"url": "http://creativecommons.org/licenses/by/4.0/", "name": "Creative Commons Attribution 4.0 International License"}, "data_subtype": "val2014", "question_types": {"eight": "Plants and Animals", "nine": "Science and Technology", "four": "Sports and Recreation", "six": "Geography, History, Language and Culture", "two": "Brands, Companies and Products", "other": "Other", "one": "Vehicles and Transportation", "five": "Cooking and Food", "ten": "Weather and Climate", "seven": "People and Everyday life", "three": "Objects, Material and Clothing"}, "annotations": [{"image_id": 297147, "answer_type": "other", "question_type": "one", "question_id": 2971475, "answers": [{"answer_id": 1, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 2, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 3, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 4, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 5, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 6, "raw_answer": "racing", "answer_confidence": "yes", "answer": "race"}, {"answer_id": 7, "raw_answer": "motocross", "answer_confidence": "yes", "answer": "motocross"}, {"answer_id": 8, "raw_answer": "motocross", "answer_confidence": "yes", "answer": "motocross"}, {"answer_id": 9, "raw_answer": "riding", "answer_confidence": "yes", "answer": "ride"}, {"answer_id": 10, "raw_answer": "riding", "answer_confidence": "yes", "answer": "ride"}], "confidence": 3, "question": "What sport can you use this for?"}, {"image_id": 339761, "answer_type": "other", "question_type": "eight", "question_id": 3397615, "answers": [{"answer_id": 1, "raw_answer": "vine", "answer_confidence": "yes", "answer": "vine"}, {"answer_id": 2, "raw_answer": "vine", "answer_confidence": "yes", "answer": "vine"}, {"answer_id": 3, "raw_answer": "vine", "answer_confidence": "yes", "answer": "vine"}, {"answer_id": 4, "raw_answer": "vine", "answer_confidence": "yes", "answer": "vine"}, {"answer_id": 5, "raw_answer": "climbing", "answer_confidence": "yes", "answer": "climb"}, {"answer_id": 6, "raw_answer": "climbing", "answer_confidence": "yes", "answer": "climb"}, {"answer_id": 7, "raw_answer": "looks like some kind of ivy",
"answer_confidence": "yes", "answer": "look like some kind of ivy"}, {"answer_id": 8, "raw_answer": "looks like some kind of ivy", "answer_confidence": "yes", "answer": "look like some kind of ivy"}, {"answer_id": 9, "raw_answer": "ficus", "answer_confidence": "yes", "answer": "ficus"}, {"answer_id": 10, "raw_answer": "ficus", "answer_confidence": "yes", "answer": "ficus"}], "confidence": 2, "question": "Name the type of plant this is?"}, {"image_id": 357586, "answer_type": "other", "question_type": "other", "question_id": 3575865, "answers": [{"answer_id": 1, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 2, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 3, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 4, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 5, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 6, "raw_answer": "stuffed animal", "answer_confidence": "yes", "answer": "stuffed animal"}, {"answer_id": 7, "raw_answer": "teddy bear", "answer_confidence": "yes", "answer": "teddy bear"}, {"answer_id": 8, "raw_answer": "teddy bear", "answer_confidence": "yes", "answer": "teddy bear"}, {"answer_id": 9, "raw_answer": "teddy bear", "answer_confidence": "yes", "answer": "teddy bear"}, {"answer_id": 10, "raw_answer": "teddy bear", "answer_confidence": "yes", "answer": "teddy bear"}], "confidence": 3, "question": "What toy is this?"}, {"image_id": 94922, "answer_type": "other", "question_type": "eight", "question_id": 949225, "answers": [{"answer_id": 1, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 2, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 3, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 4, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 5, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 6, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 7, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 8, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 9, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}, {"answer_id": 10, "raw_answer": "mouth", "answer_confidence": "yes", "answer": "mouth"}], "confidence": 5, "question": "Which part of this animal would be in use of it was playing the game that is played with the items the man is holding?"}, {"image_id": 207611, "answer_type": "other", "question_type": "seven", "question_id": 2076115, "answers": [{"answer_id": 1, "raw_answer": "clothes", "answer_confidence": "yes", "answer": "cloth"}, {"answer_id": 2, "raw_answer": "clothes", "answer_confidence": "yes", "answer": "cloth"}, {"answer_id": 3, "raw_answer": "clothes", "answer_confidence": "yes", "answer": "cloth"}, {"answer_id": 4, "raw_answer": "clothes", "answer_confidence": "yes", "answer": "cloth"}, {"answer_id": 5, "raw_answer": "food", "answer_confidence": "yes", "answer": "food"}, {"answer_id": 6, "raw_answer": "food", "answer_confidence": "yes", "answer": "food"}, {"answer_id": 7, "raw_answer": "lunch", "answer_confidence": "yes", "answer": "lunch"}, {"answer_id": 8, "raw_answer": "lunch", 
"answer_confidence": "yes", "answer": "lunch"}, {"answer_id": 9, "raw_answer": "shoes", "answer_confidence": "yes", "answer": "shoe"}, {"answer_id": 10, "raw_answer": "shoes", "answer_confidence": "yes", "answer": "shoe"}], "confidence": 2, "question": "What could this gentleman be carrying in that red bag?"}, {"image_id": 572399, "answer_type": "other", "question_type": "seven", "question_id": 5723996, "answers": [{"answer_id": 1, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 2, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 3, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 4, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 5, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 6, "raw_answer": "man", "answer_confidence": "yes", "answer": "man"}, {"answer_id": 7, "raw_answer": "men", "answer_confidence": "yes", "answer": "men"}, {"answer_id": 8, "raw_answer": "men", "answer_confidence": "yes", "answer": "men"}, {"answer_id": 9, "raw_answer": "men", "answer_confidence": "yes", "answer": "men"}, {"answer_id": 10, "raw_answer": "men", "answer_confidence": "yes", "answer": "men"}], "confidence": 3, "question": "Who leaves a toilet like this?"}, {"image_id": 575970, "answer_type": "other", "question_type": "seven", "question_id": 5759705, "answers": [{"answer_id": 1, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 2, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 3, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 4, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 5, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 6, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 7, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 8, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 9, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}, {"answer_id": 10, "raw_answer": "island", "answer_confidence": "yes", "answer": "island"}], "confidence": 5, "question": "A center affixed unit like this one in a kitchen is called a what?"}, {"image_id": 304557, "answer_type": "other", "question_type": "seven", "question_id": 3045575, "answers": [{"answer_id": 1, "raw_answer": "shopping", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 2, "raw_answer": "shopping", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 3, "raw_answer": "shopping", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 4, "raw_answer": "shopping", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 5, "raw_answer": "nyc", "answer_confidence": "yes", "answer": "nyc"}, {"answer_id": 6, "raw_answer": "nyc", "answer_confidence": "yes", "answer": "nyc"}, {"answer_id": 7, "raw_answer": "shop", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 8, "raw_answer": "shop", "answer_confidence": "yes", "answer": "shop"}, {"answer_id": 9, "raw_answer": "business", "answer_confidence": "yes", "answer": "business"}, {"answer_id": 10, "raw_answer": "business", "answer_confidence": "yes", "answer": "business"}], "confidence": 3, "question": "Why might someone go to this place?"}, {"image_id": 218365, "answer_type": "other", 
"question_type": "eight", "question_id": 2183655, "answers": [{"answer_id": 1, "raw_answer": "ground", "answer_confidence": "yes", "answer": "ground"}, {"answer_id": 2, "raw_answer": "ground", "answer_confidence": "yes", "answer": "ground"}, {"answer_id": 3, "raw_answer": "ground", "answer_confidence": "yes", "answer": "ground"}, {"answer_id": 4, "raw_answer": "ground", "answer_confidence": "yes", "answer": "ground"}, {"answer_id": 5, "raw_answer": "plant", "answer_confidence": "yes", "answer": "plant"}, {"answer_id": 6, "raw_answer": "plant", "answer_confidence": "yes", "answer": "plant"}, {"answer_id": 7, "raw_answer": "hibiscus plant stem", "answer_confidence": "yes", "answer": "hibiscus plant stem"}, {"answer_id": 8, "raw_answer": "hibiscus plant stem", "answer_confidence": "yes", "answer": "hibiscus plant stem"}, {"answer_id": 9, "raw_answer": "roots", "answer_confidence": "yes", "answer": "root"}, {"answer_id": 10, "raw_answer": "roots", "answer_confidence": "yes", "answer": "root"}], "confidence": 2, "question": "What does this grow from?"}, {"image_id": 286313, "answer_type": "other", "question_type": "four", "question_id": 2863135, "answers": [{"answer_id": 1, "raw_answer": "swinging", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 2, "raw_answer": "swinging", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 3, "raw_answer": "swinging", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 4, "raw_answer": "swinging", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 5, "raw_answer": "swing", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 6, "raw_answer": "swing", "answer_confidence": "yes", "answer": "swing"}, {"answer_id": 7, "raw_answer": "hitting", "answer_confidence": "yes", "answer": "hit"}, {"answer_id": 8, "raw_answer": "hitting", "answer_confidence": "yes", "answer": "hit"}, {"answer_id": 9, "raw_answer": "trying to hit hte ball", "answer_confidence": "yes", "answer": "try to hit hte ball"}, {"answer_id": 10, "raw_answer": "trying to hit hte ball", "answer_confidence": "yes", "answer": "try to hit hte ball"}], "confidence": 3, "question": "What is that man doing with the bat?"}, {"image_id": 29984, "answer_type": "other", "question_type": "seven", "question_id": 299845, "answers": [{"answer_id": 1, "raw_answer": "salt water beach", "answer_confidence": "yes", "answer": "salt water beach"}, {"answer_id": 2, "raw_answer": "salt water beach", "answer_confidence": "yes", "answer": "salt water beach"}, {"answer_id": 3, "raw_answer": "salt water beach", "answer_confidence": "yes", "answer": "salt water beach"}, {"answer_id": 4, "raw_answer": "salt water beach", "answer_confidence": "yes", "answer": "salt water beach"}, {"answer_id": 5, "raw_answer": "salt water", "answer_confidence": "yes", "answer": "salt water"}, {"answer_id": 6, "raw_answer": "salt water", "answer_confidence": "yes", "answer": "salt water"}, {"answer_id": 7, "raw_answer": "lake", "answer_confidence": "yes", "answer": "lake"}, {"answer_id": 8, "raw_answer": "lake", "answer_confidence": "yes", "answer": "lake"}, {"answer_id": 9, "raw_answer": "beach", "answer_confidence": "yes", "answer": "beach"}, {"answer_id": 10, "raw_answer": "beach", "answer_confidence": "yes", "answer": "beach"}], "confidence": 2, "question": "Is this at a salt water beach or a lake?"}, {"image_id": 11511, "answer_type": "other", "question_type": "other", "question_id": 115115, "answers": [{"answer_id": 1, "raw_answer": "artist", "answer_confidence": "yes", "answer": 
"artist"}, {"answer_id": 2, "raw_answer": "artist", "answer_confidence": "yes", "answer": "artist"}, {"answer_id": 3, "raw_answer": "artist", "answer_confidence": "yes", "answer": "artist"}, {"answer_id": 4, "raw_answer": "artist", "answer_confidence": "yes", "answer": "artist"}, {"answer_id": 5, "raw_answer": "guell", "answer_confidence": "yes", "answer": "guell"}, {"answer_id": 6, "raw_answer": "guell", "answer_confidence": "yes", "answer": "guell"}, {"answer_id": 7, "raw_answer": "toscano", "answer_confidence": "yes", "answer": "toscano"}, {"answer_id": 8, "raw_answer": "toscano", "answer_confidence": "yes", "answer": "toscano"}, {"answer_id": 9, "raw_answer": "aritect", "answer_confidence": "yes", "answer": "aritect"}, {"answer_id": 10, "raw_answer": "aritect", "answer_confidence": "yes", "answer": "aritect"}], "confidence": 2, "question": "Who designed the statues?"}, {"image_id": 323460, "answer_type": "other", "question_type": "five", "question_id": 3234605, "answers": [{"answer_id": 1, "raw_answer": "condiments", "answer_confidence": "yes", "answer": "condiment"}, {"answer_id": 2, "raw_answer": "condiments", "answer_confidence": "yes", "answer": "condiment"}, {"answer_id": 3, "raw_answer": "condiments", "answer_confidence": "yes", "answer": "condiment"}, {"answer_id": 4, "raw_answer": "condiments", "answer_confidence": "yes", "answer": "condiment"}, {"answer_id": 5, "raw_answer": "onions relish", "answer_confidence": "yes", "answer": "onion relish"}, {"answer_id": 6, "raw_answer": "onions relish", "answer_confidence": "yes", "answer": "onion relish"}, {"answer_id": 7, "raw_answer": "vegetables", "answer_confidence": "yes", "answer": "vegetable"}, {"answer_id": 8, "raw_answer": "vegetables", "answer_confidence": "yes", "answer": "vegetable"}, {"answer_id": 9, "raw_answer": "relish", "answer_confidence": "yes", "answer": "relish"}, {"answer_id": 10, "raw_answer": "relish", "answer_confidence": "yes", "answer": "relish"}], "confidence": 2, "question": "What is the name of the items the hot dog are topped with?"}, {"image_id": 516916, "answer_type": "other", "question_type": "other", "question_id": 5169165, "answers": [{"answer_id": 1, "raw_answer": "working", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 2, "raw_answer": "working", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 3, "raw_answer": "working", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 4, "raw_answer": "working", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 5, "raw_answer": "computing", "answer_confidence": "yes", "answer": "compute"}, {"answer_id": 6, "raw_answer": "computing", "answer_confidence": "yes", "answer": "compute"}, {"answer_id": 7, "raw_answer": "work", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 8, "raw_answer": "work", "answer_confidence": "yes", "answer": "work"}, {"answer_id": 9, "raw_answer": "office", "answer_confidence": "yes", "answer": "office"}, {"answer_id": 10, "raw_answer": "office", "answer_confidence": "yes", "answer": "office"}], "confidence": 3, "question": "What is this desk used for?"}, {"image_id": 21711, "answer_type": "other", "question_type": "one", "question_id": 217115, "answers": [{"answer_id": 1, "raw_answer": "bmx", "answer_confidence": "yes", "answer": "bmx"}, {"answer_id": 2, "raw_answer": "bmx", "answer_confidence": "yes", "answer": "bmx"}, {"answer_id": 3, "raw_answer": "bmx", "answer_confidence": "yes", "answer": "bmx"}, {"answer_id": 4, "raw_answer": "bmx", "answer_confidence": "yes", "answer": 
"bmx"}, {"answer_id": 5, "raw_answer": "bicycle", "answer_confidence": "yes", "answer": "bicycle"}, {"answer_id": 6, "raw_answer": "bicycle", "answer_confidence": "yes", "answer": "bicycle"}, {"answer_id": 7, "raw_answer": "bicycle", "answer_confidence": "yes", "answer": "bicycle"}, {"answer_id": 8, "raw_answer": "bicycle", "answer_confidence": "yes", "answer": "bicycle"}, {"answer_id": 9, "raw_answer": "10 speed", "answer_confidence": "yes", "answer": "10 speed"}, {"answer_id": 10, "raw_answer": "10 speed", "answer_confidence": "yes", "answer": "10 speed"}], "confidence": 2, "question": "What type of bike is on the ground?"}, {"image_id": 313386, "answer_type": "other", "question_type": "one", "question_id": 3133865, "answers": [{"answer_id": 1, "raw_answer": "commercial", "answer_confidence": "yes", "answer": "commercial"}, {"answer_id": 2, "raw_answer": "commercial", "answer_confidence": "yes", "answer": "commercial"}, {"answer_id": 3, "raw_answer": "commercial", "answer_confidence": "yes", "answer": "commercial"}, {"answer_id": 4, "raw_answer": "commercial", "answer_confidence": "yes", "answer": "commercial"}, {"answer_id": 5, "raw_answer": "passenger", "answer_confidence": "yes", "answer": "passenger"}, {"answer_id": 6, "raw_answer": "passenger", "answer_confidence": "yes", "answer": "passenger"}, {"answer_id": 7, "raw_answer": "quantas", "answer_confidence": "yes", "answer": "quanta"}, {"answer_id": 8, "raw_answer": "quantas", "answer_confidence": "yes", "answer": "quanta"}, {"answer_id": 9, "raw_answer": "md 80", "answer_confidence": "yes", "answer": "md 80"}, {"answer_id": 10, "raw_answer": "md 80", "answer_confidence": "yes", "answer": "md 80"}], "confidence": 2, "question": "What type of plane is that?"}], "info": {"year": 2019, "version": "1.0", "description": "This is v1.0 of the OK-VQA dataset."}, "data_type": "mscoco"} \ No newline at end of file diff --git a/daiv/models/blip2_t5_instruct.py b/daiv/models/blip2_t5_instruct.py index 2f955da..52379ce 100644 --- a/daiv/models/blip2_t5_instruct.py +++ b/daiv/models/blip2_t5_instruct.py @@ -120,7 +120,7 @@ def forward(self, samples): image_atts_mcan = self.MCAN.make_mask(image_embeds_mcan).to(image.device) text_input_mcan = samples["text_input"] - # text_input_llm = samples["text_input"] + text_input_llm = samples["text_input"] # Process text for MCAN text_tokens_mcan = self.tokenizer( @@ -148,16 +148,16 @@ def forward(self, samples): atts_llm_mcan = torch.ones(text_embeds_llm_mcan.size()[:-1], dtype=torch.long).to(image.device) # Process text for LLM - # text_tokens_llm = self.tokenizer( - # text_input_llm, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_txt_len - # ).input_ids.to(image.device) - # text_embeds_llm = self.text_embed_proj(self.MCAN.embedding(text_tokens_llm)) - # atts_llm_text = torch.ones(text_embeds_llm.size()[:-1], dtype=torch.long).to(image.device) + text_tokens_llm = self.tokenizer( + text_input_llm, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_txt_len + ).input_ids.to(image.device) + text_embeds_llm = self.text_embed_proj(self.MCAN.embedding(text_tokens_llm)) + atts_llm_text = torch.ones(text_embeds_llm.size()[:-1], dtype=torch.long).to(image.device) - # inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan, text_embeds_llm], dim=1) - # atts_llm = torch.cat([image_atts_llm, atts_llm_mcan, atts_llm_text], dim=1) - inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan], dim=1) - atts_llm = torch.cat([image_atts_llm, atts_llm_mcan], 
dim=1) + inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan, text_embeds_llm], dim=1) + atts_llm = torch.cat([image_atts_llm, atts_llm_mcan, atts_llm_text], dim=1) + # inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan], dim=1) + # atts_llm = torch.cat([image_atts_llm, atts_llm_mcan], dim=1) text_output = [t + self.t5_tokenizer.eos_token for t in samples["text_output"]] @@ -214,7 +214,7 @@ def generate( image_atts_mcan = self.MCAN.make_mask(image_embeds_mcan).to(image.device) text_input_mcan = samples["text_input"] - # text_input_llm = samples["text_input"] + text_input_llm = samples["text_input"] # Process text for MCAN text_tokens_mcan = self.tokenizer( @@ -241,16 +241,16 @@ def generate( atts_llm_mcan = torch.ones(text_embeds_llm_mcan.size()[:-1], dtype=torch.long).to(image.device) # Process text for LLM - # text_tokens_llm = self.tokenizer( - # text_input_llm, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_txt_len - # ).input_ids.to(image.device) - # text_embeds_llm = self.text_embed_proj(self.MCAN.embedding(text_tokens_llm)) - # atts_llm_text = torch.ones(text_embeds_llm.size()[:-1], dtype=torch.long).to(image.device) - - # inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm, text_embeds_llm_mcan], dim=1) - # atts_llm = torch.cat([image_atts_llm, atts_llm_text, atts_llm_mcan], dim=1) - inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan], dim=1) - atts_llm = torch.cat([image_atts_llm, atts_llm_mcan], dim=1) + text_tokens_llm = self.tokenizer( + text_input_llm, return_tensors="pt", padding="longest", truncation=True, max_length=self.max_txt_len + ).input_ids.to(image.device) + text_embeds_llm = self.text_embed_proj(self.MCAN.embedding(text_tokens_llm)) + atts_llm_text = torch.ones(text_embeds_llm.size()[:-1], dtype=torch.long).to(image.device) + + inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan, text_embeds_llm], dim=1) + atts_llm = torch.cat([image_atts_llm, atts_llm_mcan, atts_llm_text], dim=1) + # inputs_llm = torch.cat([image_embeds_llm, text_embeds_llm_mcan], dim=1) + # atts_llm = torch.cat([image_atts_llm, atts_llm_mcan], dim=1) if "prompt" in samples.keys(): prompt = samples["prompt"] diff --git a/daiv/runners/runner_base.py b/daiv/runners/runner_base.py index a82579a..e0ca9d0 100644 --- a/daiv/runners/runner_base.py +++ b/daiv/runners/runner_base.py @@ -384,27 +384,27 @@ def train(self): self.log_stats(split_name="train", stats=train_stats) # evaluation phase - if len(self.valid_splits) > 0: - for split_name in self.valid_splits: - logging.info("Evaluating on {}.".format(split_name)) - - val_log = self.eval_epoch( - split_name=split_name, cur_epoch=cur_epoch - ) - if val_log is not None: - if is_main_process(): - assert ( - "agg_metrics" in val_log - ), "No agg_metrics found in validation log." - - agg_metrics = val_log["agg_metrics"] - if agg_metrics > best_agg_metric and split_name == "val": - best_epoch, best_agg_metric = cur_epoch, agg_metrics - - self._save_checkpoint(cur_epoch, is_best=True) - - val_log.update({"best_epoch": best_epoch}) - self.log_stats(val_log, split_name) + # if len(self.valid_splits) > 0: + # for split_name in self.valid_splits: + # logging.info("Evaluating on {}.".format(split_name)) + + # val_log = self.eval_epoch( + # split_name=split_name, cur_epoch=cur_epoch + # ) + # if val_log is not None: + # if is_main_process(): + # assert ( + # "agg_metrics" in val_log + # ), "No agg_metrics found in validation log." 
+
+            #                 agg_metrics = val_log["agg_metrics"]
+            #                 if agg_metrics > best_agg_metric and split_name == "val":
+            #                     best_epoch, best_agg_metric = cur_epoch, agg_metrics
+
+            #                     self._save_checkpoint(cur_epoch, is_best=True)
+
+            #                 val_log.update({"best_epoch": best_epoch})
+            #                 self.log_stats(val_log, split_name)
 
             else:
                 # if no validation split is provided, we just save the checkpoint at the end of each epoch.
diff --git a/daiv/tasks/vqa.py b/daiv/tasks/vqa.py
index 5b0e060..1444b90 100644
--- a/daiv/tasks/vqa.py
+++ b/daiv/tasks/vqa.py
@@ -33,7 +33,8 @@ def __init__(
         sample_id_key = "",
         ques_files=dict(),
         anno_files=dict(),
-        valid_splits=['val']
+        # valid_splits=['val']
+        valid_splits=['test']
     ):
 
         super().__init__()
@@ -75,7 +76,8 @@ def setup_task(cls, cfg):
         sample_id_key = run_cfg.get("sample_id_key", "instance_id")
         ques_files = run_cfg.get("ques_files", dict())
         anno_files = run_cfg.get("anno_files", dict())
-        valid_splits = run_cfg.get("valid_splits", ["val"])
+        # valid_splits = run_cfg.get("valid_splits", ["val"])
+        valid_splits = run_cfg.get("valid_splits", ["test"])
 
         return cls(
@@ -99,7 +101,7 @@ def build_datasets(self, cfg):
         for split in self.valid_splits:
             if split not in dataset:
                 print(f"Split {split} not found in {ds_name}.")
-                continue # 추가
+                # continue # 추가
             if (
                 hasattr(dataset[split], "coco_fmt_qust_file")
                 and dataset[split].coco_fmt_qust_file is not None
@@ -141,14 +143,14 @@ def valid_step(self, model, samples):
             prompt=self.prompt,
         )
         pred_qa_pairs = []
-
+        question = samples['text_input']
         question_id = samples["question_id"]
-        for answer, ques_id in zip(answers, question_id):
+        for answer, ques_id, que in zip(answers, question_id, question):
             ques_id = int(ques_id.item()) if isinstance(ques_id, torch.Tensor) else ques_id
             if ques_id != int and is_convertible_to_int(ques_id):
                 ques_id = int(ques_id)
             pred_qa_pairs.append({"question_id": ques_id, "answer": answer})
-            print(f'answer : {answer}')
+            print(f'question: {que} / answer : {answer}')
 
         return pred_qa_pairs
@@ -161,7 +163,7 @@ def after_evaluation(self, val_result, split_name, **kwargs):
         )
 
         metrics = self._report_metrics(result_file=result_file, split=split_name)
-
+        print(metrics)
         return metrics
 
     @dist_utils.main_process
@@ -170,17 +172,17 @@ def _report_metrics(self, result_file, split):
         """
         Use official VQA evaluation script to report metrics.
""" metrics = {} - + print(f'ques_files: {self.ques_files} / anno_files: {self.anno_files}' ) if split in self.ques_files and split in self.anno_files: vqa = VQA(self.anno_files[split], self.ques_files[split]) - vqa_result = vqa.loadRes( + vqa_result, resQuesIds = vqa.loadRes( resFile=result_file, quesFile=self.ques_files[split] ) # create vqaEval object by taking vqa and vqaRes # n is precision of accuracy (number of places after decimal), default is 2 vqa_scorer = VQAEval(vqa, vqa_result, n=2) logging.info("Start VQA evaluation.") - vqa_scorer.evaluate() + vqa_scorer.evaluate(resQuesIds) # print accuracies overall_acc = vqa_scorer.accuracy["overall"] diff --git a/train_configs/pretrain_stage2_eval.yaml b/train_configs/pretrain_stage2_eval.yaml index e1b651d..80202a6 100644 --- a/train_configs/pretrain_stage2_eval.yaml +++ b/train_configs/pretrain_stage2_eval.yaml @@ -67,11 +67,12 @@ run: amp: True # MCAN - resume_ckpt_path: '/root/workspace/24s-VQA-MLLM/EunJuPark/VQA-MLLM-stage2/daiv/output/BLIP2/Pretrain_stage2_eval/20240720153/checkpoint_9.pth' + resume_ckpt_path: '/root/workspace/24s-VQA-MLLM/EunJuPark/stage2/BLIVA/daiv/output/BLIP2/Pretrain_stage2/20240720153/checkpoint_9.pth' + # resume_ckpt_path: '/root/workspace/24s-VQA-MLLM/EunJuPark/VQA-MLLM-stage2/daiv/output/BLIP2/Pretrain_stage2_eval/20240720153/checkpoint_9.pth' # resume_ckpt_path: '/root/workspace/24s-VQA-MLLM/BEiT3/VQA-MLLM-stage2/daiv/output/BLIP2/Pretrain_stage2/20240719160/checkpoint_9.pth' evaluate: True - train_splits: ["train"] + # train_splits: ["train"] test_splits: ["test"] device: "cuda"