preproces_caption.py
import pandas as pd
import numpy as np
import json, re
from collections import Counter
from tqdm import tqdm
import os
def build_vocab(all_words, min_freq=1):
    # most_common() returns a list of (word, frequency) tuples,
    # ordered by descending frequency
    all_words = all_words.most_common()
    word_to_index = {'<pad>': 0, '<unk>': 1}  # reserve index 0 for <pad> and index 1 for <unk>
    index = 2  # track the next free index so skipped words leave no gaps
    for word, freq in tqdm(all_words, desc='building vocab'):
        if freq < min_freq:  # skip words whose frequency is below min_freq
            continue
        word_to_index[word] = index
        index += 1
    index_to_word = {v: k for k, v in word_to_index.items()}  # invert the mapping to get index-to-word
    # report the vocabulary size and how many words fell below min_freq (i.e. map to <unk>);
    # the two special tokens are excluded from the second count
    print(f'number of words in vocab: {len(word_to_index)}')
    print(f'number of words mapped to <unk>: {len(all_words) - (len(word_to_index) - 2)}')
    return word_to_index, index_to_word
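
# A minimal usage sketch for build_vocab (the words and counts below are made up):
#   >>> from collections import Counter
#   >>> w2i, i2w = build_vocab(Counter({'man': 3, 'plays': 2, 'rare': 1}), min_freq=2)
#   >>> w2i
#   {'<pad>': 0, '<unk>': 1, 'man': 2, 'plays': 3}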
def parse_csv(csv_file, caption_file, gts_file, clean_only=False):
    data = pd.read_csv(csv_file, encoding='utf-8')  # read the caption dataset (read_csv already returns a DataFrame)
    data = data.dropna(axis=0)  # drop rows with missing values
    eng_data = data[data['Language'] == 'English']  # keep only the English descriptions
    if clean_only:  # if clean_only is True, keep only the rows marked 'clean' in the Source column
        eng_data = eng_data[eng_data['Source'] == 'clean']
    print(f'There are {len(eng_data)} English descriptions in total')
    captions = []  # tokenized captions
    counter = Counter()  # word-frequency counter for building the vocabulary
    filenames = []  # clip name for each caption
    gts = {}  # ground-truth captions per clip
    max_cap_ids = {}  # highest caption id seen so far for each clip
    # iterate over the dataset
    for _, name, start, end, sentence in tqdm(eng_data[['VideoID', 'Start', 'End', 'Description']].itertuples(), desc='reading captions'):
        file_name = name + '_' + str(start) + '_' + str(end)  # clip name in the form '<VideoID>_<Start>_<End>'
        filenames.append(file_name)  # remember the clip name for this caption
        tokenized = sentence.lower()  # lowercase the sentence
        tokenized = re.sub(r'[~\\/().!,;?:]', ' ', tokenized)  # replace unwanted punctuation with spaces
        gts_token = tokenized  # keep the cleaned, untokenized sentence for the ground truth
        tokenized = tokenized.split()  # split the sentence into words
        tokenized = ['<sos>'] + tokenized + ['<eos>']  # add start- and end-of-sentence markers
        counter.update(tokenized)  # update the word frequencies
        captions.append(tokenized)  # store the tokenized caption
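        # e.g. 'A man plays a guitar.' -> ['<sos>', 'a', 'man', 'plays', 'a', 'guitar', '<eos>'] (illustrative sentence)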
        if file_name in gts:  # the clip already has at least one caption
            max_cap_ids[file_name] += 1  # advance the caption id for this clip
            # append the caption record: clip name, caption id, raw caption, and cleaned ground truth
            gts[file_name].append({
                u'image_id': file_name,
                u'cap_id': max_cap_ids[file_name],
                u'caption': sentence,
                u'tokenized': gts_token
            })
        else:  # first caption for this clip
            max_cap_ids[file_name] = 0  # caption ids start at zero
            # start the caption list: clip name, caption id, raw caption, and cleaned ground truth
            gts[file_name] = [{
                u'image_id': file_name,
                u'cap_id': 0,
                u'caption': sentence,
                u'tokenized': gts_token
            }]
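    # After the loop, gts maps each clip name to its list of caption records, roughly
    # (the clip name and caption below are illustrative):
    #   gts['vid1_0_10'] = [
    #       {'image_id': 'vid1_0_10', 'cap_id': 0, 'caption': 'A man plays a guitar.',
    #        'tokenized': 'a man plays a guitar '},
    #       ...,
    #   ]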
    word_to_index, index_to_word = build_vocab(counter)  # build the vocabulary from the word counts
    # map every word in every caption to its vocabulary index; unknown words fall back to <unk>
    captions = [[word_to_index.get(w, word_to_index['<unk>']) for w in caption] for caption in tqdm(captions, desc='turning words into indices')]
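    # e.g. ['<sos>', 'a', 'man', '<eos>'] -> [2, 15, 7, 3] (the indices here are illustrative)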
    caption_dict = {}  # group the indexed captions by clip name
    for name, cap in zip(filenames, captions):
        if name not in caption_dict:  # first caption for this clip
            caption_dict[name] = []
        caption_dict[name].append(cap)
    # split the dataset into the standard MSVD partitions: 1200 train / 100 validation / 670 test
    vid_names = list(caption_dict.keys())  # all clip names
    # np.random.shuffle(vid_names)
    train_data = vid_names[:1200]
    valid_data = vid_names[1200:1300]
    test_data = vid_names[1300:1970]
    data_split = {'train': train_data, 'validation': valid_data, 'test': test_data}
    with open(caption_file, 'w+', encoding='utf-8') as f:  # save the vocabulary, indexed captions, and splits as JSON
json.dump({
'word_to_index' : word_to_index,
'index_to_word': index_to_word,
'captions': caption_dict,
'splits': data_split
}, f)
    with open(gts_file, 'w+', encoding='utf-8') as f:  # save the ground-truth data as JSON
json.dump({'gts': gts}, f)
if __name__ == '__main__':
    base_path = os.getcwd()  # use the current working directory as the base path
    # run the caption preprocessing
    parse_csv(
        csv_file=os.path.join(base_path, 'data/video_corpus.csv'),
        caption_file=os.path.join(base_path, 'data/captions.json'),
        gts_file=os.path.join(base_path, 'data/gts_3.json'),
        clean_only=True
    )
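
# Usage sketch (assuming the MSVD caption CSV has been placed at data/video_corpus.csv):
#   $ python preproces_caption.py
# This writes data/captions.json (vocabulary, indexed captions, and train/val/test splits)
# and data/gts_3.json (ground-truth captions keyed by clip name).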