forked from DataStories-UniPi/miniDB
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathex_hash.py
127 lines (111 loc) · 6.14 KB
/
ex_hash.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
class Bucket:
def __init__(self,b,lsb,hash_res) -> None:
self.b=b
self.known_lsb=lsb #each bucket keeps the price of lsb either the point it was created or the last time it was split
self.hash_res=hash_res
self.data=[]
self.overflown_elems=[]
def insert(self,element,ptr):
if (len(self.data)>=self.b):
return False
self.data.append((element,ptr))
return True
class ExHash:
def __init__(self,b) -> None:
self.b=b
self.lsb=1
self.key=2047
self.hash_prefix=dict([('0',Bucket(self.b,self.lsb,'0')),('1',Bucket(self.b,self.lsb,'1'))])
self.overflown_elems=[] #When an overflow happens , elements are appended in a list so they can be organized later
def insert(self,value,ptr):
''''
The standard insert function , it gets the key for the given value and
either inserts the element in the right bucket , or appends it in overflown_elems
and calls split() in order to reorganize the hash.
'''
pos=self.get_key(value)
if not self.hash_prefix[pos].insert(value,ptr):
self.overflown_elems.append((value,ptr))
self.split()
def insert_helper(self,val,ptr):
'''
This function is an internal one of the hash object,used only internally
when overflows happens so that we dont have to call split recursively.
'''
pos=self.get_key(val)
if not self.hash_prefix[pos].insert(val,ptr):
self.overflown_elems.append((val,ptr))
def get_key(self,val):
'''
Get the insert key (hash prefix key address) where val has to be inserted.
We hash the value to get a unique represantation of it as an int.
Then we perform the operation modulo 2047 with the hashed value and
retrieve the self.lsb bits
'''
val=hash(val)
return bin(val%self.key).replace('0b','').zfill(32)[-self.lsb-1:-1]
def split(self):
'''
This function is called when an overflow happens by the insert function.
It checks whether the known lsb of an overflown bucket is smaller than the one of the hash and if its true ,
it gets the address of the overflown bucket and creates two new buckets and two new keys for the hash prefix
or if the lsb's are the same , it increases the lsb of the hash and reorganizes the hash_prefix dict.
In each case , it inserts the overflown elements and the ones of overflown buckets in the hash.
'''
while len(self.overflown_elems)>0:
address=self.get_key(self.overflown_elems[0][0])
overflownBucket = self.hash_prefix[address]
if (overflownBucket.known_lsb<self.lsb): #in this case , we dont need to increase the lsb,but only split the overflown bucket to two
print("hello2")
if address[0]=='1': #get the other address that this bucket is going to be slit
alt_address='0'+address[1:]
else :
alt_address='1'+address[1:]
self.hash_prefix[address]=Bucket(self.b,overflownBucket.known_lsb+1,address)
self.hash_prefix[alt_address]=Bucket(self.b,overflownBucket.known_lsb+1,alt_address)
self.insert_helper(self.overflown_elems[0][0],self.overflown_elems[0][1])
self.overflown_elems.pop(0)
for val in overflownBucket.data: #insert the data of the overflown bucket again in the hash index
self.insert_helper(val[0],val[1])
if len(address)<self.lsb: #delete hash prefixes with less lsb's than the current one hold in self.lsb
del self.hash_prefix[address]
else: #in this case we have to increase the lsb and also change the hash prefix dictionary
self.lsb+=1
new_address1='0'+address #create the new two address for the overflown bucket
new_address2='1'+address
for key in list(self.hash_prefix.keys()):
if key==address: #in this case we already split the bucket and dont have to point those two new buckets
continue #to the same bucket but in new separate ones
self.hash_prefix['0'+key]=self.hash_prefix[key] #point new hash prefixes to the same buckets based on lsb-1 bits
self.hash_prefix['1'+key]=self.hash_prefix[key] #i.e. 10 and 00 buckets have to point to bucket 0 (based on previous lsb)
self.hash_prefix[new_address1]=Bucket(self.b,self.lsb,new_address1)
self.hash_prefix[new_address2]=Bucket(self.b,self.lsb,new_address2)
for key in list(self.hash_prefix.keys()):
if key==address:
continue
if len(key)<self.lsb: #delete previous hash prefixes with smaller lsb length than the current one
del self.hash_prefix[key]
self.insert_helper(self.overflown_elems[0][0],self.overflown_elems[0][1])
self.overflown_elems.pop(0)
for val in overflownBucket.data:
self.insert_helper(val[0],val[1])
if len(address)<self.lsb:
del self.hash_prefix[address]
def search(self,val):
'''
Get the key for the value given and either return the tuple (val,idx) if val is found,
, or return None.idx is the index of the element val in the main file.
'''
key=self.get_key(val)
for elem in self.hash_prefix[key].data:
if val==elem[0]:
return elem
return None
def print_hash(self):
'''
Used only for example purposes to show how the hash index works and saves values.
Note that if two buckets have the same elements , it means that they are showing to the
same bucket (considering we only insert unique values in the hash index)
'''
for key in self.hash_prefix:
print(key,self.hash_prefix[key].data)