-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_input.py
83 lines (56 loc) · 4.68 KB
/
data_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import numpy as np
class dataset:
    """
    Object which imports and holds all the relevant information and data of a dataset.

    Both files are parsed as big-endian IDX files: the image file starts with
    the ID 2051 followed by the image count, the row count and the column
    count; the label file starts with the ID 2049 followed by the label count.
    The remaining bytes are one unsigned byte per pixel / per label.

    Attributes:
        id_imgs, id_lbls: IDs read from the first 4 bytes of each file.
        size: number of examples announced by the file headers.
        width, height: dimensions of every image (columns, rows).
        pixel_count: number of pixels per image (width * height).
        labels: integer vector of shape (size,).
        images: integer matrix of shape (size, pixel_count), one image per row.

    Raises:
        ValueError: if a header is truncated, an ID is wrong, the two files
            disagree on the number of examples, or the amount of data does
            not match what the headers announce.
        OSError: if one of the files cannot be opened.
    """

    _IMAGE_FILE_ID = 2051  # ID expected at the start of an image file
    _LABEL_FILE_ID = 2049  # ID expected at the start of a label file

    def __init__(self, img_path: str, lbl_path: str) -> None:
        with open(img_path, "rb") as imagefile, open(lbl_path, "rb") as labelfile:
            # Explicit exceptions instead of assert: asserts vanish under `python -O`.
            self.id_imgs = self._read_u32(imagefile)
            self.id_lbls = self._read_u32(labelfile)
            if self.id_imgs != self._IMAGE_FILE_ID or self.id_lbls != self._LABEL_FILE_ID:
                raise ValueError(
                    f"unexpected file IDs {self.id_imgs} (images) and {self.id_lbls} (labels); "
                    f"expected {self._IMAGE_FILE_ID} and {self._LABEL_FILE_ID}"
                )

            self.size = self._read_u32(imagefile)
            label_count = self._read_u32(labelfile)
            if self.size != label_count:
                raise ValueError(
                    f"image file announces {self.size} examples but label file announces {label_count}"
                )

            # The image header stores the number of rows (height) first,
            # then the number of columns (width).
            self.height = self._read_u32(imagefile)
            self.width = self._read_u32(imagefile)

            # Everything after the headers is raw data, one byte per value.
            label_bytes = labelfile.read()
            pixel_bytes = imagefile.read()

        self.pixel_count = self.width * self.height

        if len(label_bytes) != self.size:
            raise ValueError(
                f"label file holds {len(label_bytes)} labels but its header announces {self.size}"
            )
        expected_pixels = self.size * self.pixel_count
        if len(pixel_bytes) != expected_pixels:
            raise ValueError(
                f"image file holds {len(pixel_bytes)} pixel bytes but its header announces {expected_pixels}"
            )

        # frombuffer decodes the bytes without going through Python lists;
        # astype(int) yields a writable array with numpy's default integer type.
        self.labels = np.frombuffer(label_bytes, dtype=np.uint8).astype(int)
        raw_pixels = np.frombuffer(pixel_bytes, dtype=np.uint8).astype(int)
        self.images = raw_pixels.reshape(self.size, self.pixel_count)

    @staticmethod
    def _read_u32(stream) -> int:
        """Read one 4-byte big-endian unsigned integer from an open binary file."""
        raw = stream.read(4)
        if len(raw) != 4:
            raise ValueError("unexpected end of file while reading a 4-byte header field")
        return int.from_bytes(raw, byteorder="big")

    def __len__(self) -> int:
        """Return the size of the dataset (number of examples)."""
        return self.size

    def __getitem__(self, n: int) -> "example":
        """Allow the use of dataset[n] to return the individual example of index n."""
        return example(self, n)

    def __repr__(self) -> str:
        """Canonical string representation of the dataset."""
        return f"/{repr(self.labels)}: {repr(self.images)}/"

    def __str__(self) -> str:
        """Simple string representation of the dataset."""
        return f"{str(self.labels)}: {str(self.images)}"

    def get_pixelcount(self) -> int:
        """Return the number of pixels in each image of the dataset."""
        return self.pixel_count
class example:
    """
    Object which holds the data and label of an individual image of the dataset.

    Attributes:
        values: vector of pixel values taken from row `index` of the dataset's images.
        label: label taken from position `index` of the dataset's labels.
        width, height: image dimensions, copied from the dataset.
        size: pixel count of the image, copied from the dataset.
        expected_output: 10-value vector with a 1 at position `label` and 0 elsewhere.

    Raises:
        IndexError: if `index` is outside the range -len(dtst) <= index < len(dtst).
    """

    def __init__(self, dtst: "dataset", index: int) -> None:
        # Raise IndexError (not an assert) so that an out-of-range index is
        # reported the way sequences report it; this is also what lets
        # `for ex in dtst` stop cleanly at the end of the dataset.
        total = len(dtst)
        if not -total <= index < total:
            raise IndexError(f"example index {index} is out of range for a dataset of size {total}")

        self.values = dtst.images[index]  # Vector which contains the pixel values
        self.label = dtst.labels[index]  # Number label of the image
        self.width, self.height = dtst.width, dtst.height  # Same dimensions as the dataset
        self.size = dtst.pixel_count  # Size of an example/vector is the pixel count of an image

        # Ideal output of the neural network for this example: a 10-value
        # vector of zeros with a 1 at the position of the label.
        out = np.zeros(10)
        out[self.label] = 1
        self.expected_output = out

    def __len__(self) -> int:
        """Return the size (pixel count) of the example."""
        return self.size

    def __repr__(self) -> str:
        """Canonical string representation of the example."""
        return f"/{self.label}: {repr(self.values)}/"

    def __str__(self) -> str:
        """Simpler and visual string representation of the example."""
        # Reshape the pixel values vector into a (height, width) matrix.
        rows = self.values.reshape((self.height, self.width))
        # Values smaller than 128 become hollow boxes, the others filled boxes.
        lines = ["".join("▯" if p < 128 else "▮" for p in row) for row in rows]
        fulltext = "\n".join(lines)
        # Label of the example followed by its approximate text visualization.
        return f"{self.label}:\n{fulltext}"

    def get_pixelvec(self) -> np.ndarray:
        """Return a copy of the values vector of the example."""
        return np.copy(self.values)

    def get_labelvec(self) -> np.ndarray:
        """Return a copy of the label vector (expected output) of the example."""
        return np.copy(self.expected_output)