#generate_dataset.py
#Builds the positive/negative image-pair dataset for two crossings and saves it as JSON.
import glob
import itertools
import numpy as np
import os
import json
from concurrent.futures import ProcessPoolExecutor, as_completed
from functools import partial
from config import *
import random
from sys import argv
#When True, the sample builders print their counters (matchings, positive samples) and the negative-sample histogram.
verbose = False
#-----------------------------------------------------------------------------
def collecting_positive_samples(data, amount):
    """Expand matched plate directories into positive image samples.

    Args:
        data: iterable of ``(dir1, dir2)`` pairs, one plate directory per
            crossing.
        amount: maximum number of images taken from each directory (the
            first ``amount`` entries in sorted order).

    Returns:
        A list of ``(plate1, car1, plate2, car2, POS)`` tuples, one for every
        combination of a plate image from ``dir1`` with a plate image from
        ``dir2`` whose corresponding vehicle images both exist on disk.
    """
    positives = []

    for dir1, dir2 in data:
        images1 = sorted(glob.glob(os.path.join(dir1, "*")))[:amount]
        images2 = sorted(glob.glob(os.path.join(dir2, "*")))[:amount]

        # Both crossings must contribute at least one plate image.
        if not images1 or not images2:
            continue

        # Cartesian product of the plate images of both crossings.
        for plate1, plate2 in itertools.product(images1, images2):
            # The vehicle image path is derived from the plate image path by
            # swapping the plate folder name for the vehicle folder name.
            car1 = plate1.replace(plt_name, car_name)
            car2 = plate2.replace(plt_name, car_name)
            if os.path.exists(car1) and os.path.exists(car2):
                positives.append((plate1, car1, plate2, car2, POS))

    if verbose:
        print("Number of positive samples: %d" % len(positives))

    return positives
#-----------------------------------------------------------------------------
def build_positive_set(plt_set1, plt_set2, car_set1, car_set2, amount, percentage):
    """Match plate directories across both crossings and build positive sets.

    Args:
        plt_set1: plate directory paths of crossing 1.
        plt_set2: plate directory paths of crossing 2.
        car_set1: unused here; kept so existing callers keep working.
        car_set2: unused here; kept so existing callers keep working.
        amount: maximum number of images used per directory (forwarded to
            ``collecting_positive_samples``).
        percentage: fraction of the matched directory pairs assigned to the
            training split; the remainder goes to the testing split.

    Returns:
        ``(ptrn_set, ptst_set)``: the training and testing positive samples.
    """
    matches = []
    for path1 in plt_set1:
        # The label is the last path component of the directory.
        label = os.path.basename(path1)
        # First crossing-2 path containing the label, if any.
        # NOTE(review): this is a substring test against the whole path, so a
        # label that is a prefix/substring of another label (or of a parent
        # folder name) also matches - confirm this is intended.
        path2 = next((candidate for candidate in plt_set2 if label in candidate), None)
        if path2 is not None:
            matches.append((path1, path2))

    if verbose:
        print("Number of vehicles matchings: %d" % len(matches))

    # Shuffle so that training and testing mix different day periods.
    np.random.shuffle(matches)

    # Split the matched directory pairs (not the individual images).
    split = int(len(matches) * percentage)
    ptrn_set = collecting_positive_samples(matches[:split], amount)
    ptst_set = collecting_positive_samples(matches[split:], amount)

    return ptrn_set, ptst_set
#-----------------------------------------------------------------------------
def distance_string(ocr1, ocr2):
    """Return the number of positions at which the two strings differ.

    Only the overlapping prefix is compared: characters beyond the length of
    the shorter string are ignored, so a length difference alone contributes
    nothing to the distance.
    """
    return sum(char1 != char2 for char1, char2 in zip(ocr1, ocr2))
#-----------------------------------------------------------------------------
def collecting_negative_samples(plt_set1, plt_set2, car_set1, car_set2, nsamples, multiply):
    """Randomly draw negative samples (pairs of directories with different labels).

    Each iteration picks one plate directory per crossing at random. A pair is
    accepted when the labels differ, the pair was not used before (in either
    order), both labels occur in some vehicle path of their crossing, and the
    vehicle images derived from the chosen plate images exist on disk.

    Accepted pairs are balanced by the character distance between the two
    labels: a bucket stops accepting pairs once it holds ``int(nsamples / 7)``
    of them. After more than ``threshold`` draws were skipped for that reason
    the balancing is no longer enforced.

    NOTE(review): the loop only ends once ``nsamples`` pairs were accepted; if
    fewer valid pairs exist it does not terminate.

    Args:
        plt_set1: plate directory paths of crossing 1.
        plt_set2: plate directory paths of crossing 2.
        car_set1: vehicle directory paths of crossing 1.
        car_set2: vehicle directory paths of crossing 2.
        nsamples: number of negative samples to produce.
        multiply: unused; kept so existing callers keep working.

    Returns:
        A list of ``(plate1, car1, plate2, car2, NEG)`` tuples.
    """
    used_pairs = set()  # Accepted label pairs, stored in both orders.
    samples_set = []
    failed = 0          # Draws rejected because of their labels.
    tries = 0           # Draws skipped because their histogram bucket was full.
    threshold = 1000
    per_bucket = int(nsamples / 7)  # Samples distribution.
    hist = [0] * 8                  # Accepted samples per character distance.
    last_bucket = len(hist) - 1

    while len(samples_set) < nsamples:
        r1 = np.random.choice(plt_set1)
        r2 = np.random.choice(plt_set2)
        n1 = os.path.basename(r1)
        n2 = os.path.basename(r2)

        # Distances beyond the last bucket share that bucket instead of
        # indexing past the end of the histogram.
        bucket = min(distance_string(n1, n2), last_bucket)
        if hist[bucket] >= per_bucket and tries <= threshold:
            tries += 1
            continue

        # NOTE(review): only label-level rejections are counted as failed;
        # draws dropped below because an image is missing are not - confirm.
        if (
            n1 == n2
            or (n1, n2) in used_pairs
            or not any(n1 in path for path in car_set1)
            or not any(n2 in path for path in car_set2)
        ):
            failed += 1
            continue

        images1 = glob.glob(os.path.join(r1, "*"))
        images2 = glob.glob(os.path.join(r2, "*"))
        if not images1 or not images2:
            # An empty directory cannot provide a plate image; draw again.
            continue

        plt0 = random.choice(images1)
        plt1 = random.choice(images2)
        car0 = plt0.replace(plt_name, car_name)
        car1 = plt1.replace(plt_name, car_name)
        if not (os.path.exists(car0) and os.path.exists(car1)):
            continue

        used_pairs.update(((n1, n2), (n2, n1)))
        samples_set.append((plt0, car0, plt1, car1, NEG))
        hist[bucket] += 1

    print("Number of samples rejected: %d" % failed)

    if verbose:
        print(hist)

    return samples_set
#-----------------------------------------------------------------------------
def build_negative_set(plt_set1, plt_set2, car_set1, car_set2, ptrn_set, ptst_set, amount, multiply):
    """Build the negative training and testing sets.

    Args:
        plt_set1: plate directory paths of crossing 1.
        plt_set2: plate directory paths of crossing 2.
        car_set1: vehicle directory paths of crossing 1.
        car_set2: vehicle directory paths of crossing 2.
        ptrn_set: positive training samples; the negative training set gets
            the same number of samples.
        ptst_set: positive testing samples; the negative testing set gets
            ``multiply`` times as many samples.
        amount: unused here; kept so existing callers keep working.
        multiply: ratio of negative to positive samples in the testing set.

    Returns:
        ``(ntrn_set, ntst_set)``: the training and testing negative samples.
    """
    n_train = len(ptrn_set)
    size = n_train + multiply * len(ptst_set)

    # The last argument is the callee's ``multiply`` parameter.
    data = collecting_negative_samples(plt_set1, plt_set2, car_set1, car_set2, size, multiply)

    # Shuffle so that training and testing mix different day periods.
    np.random.shuffle(data)

    # The first len(ptrn_set) samples train, the rest test.
    return data[:n_train], data[n_train:]
def run(plt_set1, plt_set2, car_set1, car_set2, amount, percentage, multiply):
    """Build the complete dataset from the plate/vehicle paths of both crossings.

    The positive sets are built first because the size of the negative sets
    is derived from them.

    Returns:
        A dict with the keys ``'trn'`` and ``'tst'``, each holding the
        positive samples followed by the negative samples of that split.
    """
    crossings = (plt_set1, plt_set2, car_set1, car_set2)

    positives_trn, positives_tst = build_positive_set(*crossings, amount, percentage)
    negatives_trn, negatives_tst = build_negative_set(
        *crossings, positives_trn, positives_tst, amount, multiply
    )

    return {
        'trn': positives_trn + negatives_trn,
        'tst': positives_tst + negatives_tst,
    }
# Folder names of the plate and vehicle images; the sample builders swap one
# for the other to derive a vehicle image path from a plate image path.
plt_name = "classes"
car_name = "classes_carros"

# Glob patterns (relative to a crossing folder) of the plate/vehicle folders.
plt_folder = "*/classes"
car_folder = "*/classes_carros"

# Fraction of the matched vehicles used for training.
percentage = 0.5

# License plate paths for crossing 1 and crossing 2.
plt_set1, plt_set2 = [
    glob.glob(os.path.join(crossing, plt_folder, "*"))
    for crossing in (folder_cross1, folder_cross2)
]
# Vehicle paths for crossing 1 and crossing 2.
car_set1, car_set2 = [
    glob.glob(os.path.join(crossing, car_folder, "*"))
    for crossing in (folder_cross1, folder_cross2)
]

r = run(plt_set1, plt_set2, car_set1, car_set2, amount, percentage, multiplyNegatives)

# The output file name encodes the two settings the dataset was built with.
output_name = 'dataset%d_%d.json' % (amount, multiplyNegatives)
with open(output_name, 'w') as fp:
    json.dump(r, fp)