TheAlgorithms_Python-hashing
TheAlgorithms_Python-hashing
TheAlgorithms/data_structures/hashing
TheAlgorithms_Python-hashing.d .......................................................................
hash_table_with_linked_list.py.....................................................................1
number_theory.........................................................................................
prime_numbers.py .................................................................................. 1
tests......................................................................................................
test_hash_map.py .................................................................................. 2
quadratic_probing.py.................................................................................4
double_hash.py........................................................................................5
bloom_filter.py.......................................................................................6
hash_table.py.........................................................................................8
hash_map.py .......................................................................................... 12
hash_table_with_linked_list.py
1 from collections import deque
2
3 from .hash_table import HashTable
4
5
class HashTableWithLinkedList(HashTable):
    """
    Hash table that resolves collisions by chaining.

    A slot of ``self.values`` that has been written holds a ``deque`` of the
    items stored under that slot; a slot that was never written holds ``None``.
    The constructor is inherited unchanged from ``HashTable``.
    """

    def _set_value(self, key, data):
        """Push ``data`` onto the front of the chain stored in slot ``key``."""
        if self.values[key] is None:
            # First item for this slot: create the chain lazily.
            self.values[key] = deque([])
        self.values[key].appendleft(data)
        self._keys[key] = self.values[key]

    def balanced_factor(self):
        """
        Return the summed free chain capacity of all slots, divided by the
        table size and multiplied by the charge factor.

        A slot that is still ``None`` counts as an empty chain, so the method
        no longer raises ``TypeError`` from ``len(None)`` on a partly filled
        table.
        """
        free_capacity = sum(
            self.charge_factor - (0 if slot is None else len(slot))
            for slot in self.values
        )
        return free_capacity / self.size_table * self.charge_factor

    def _collision_resolution(self, key, data=None):
        """
        Return the slot that should receive ``data``.

        The chain at ``key`` is reused unless it already holds exactly
        ``charge_factor`` items and no slot of the table is ``None`` any more;
        only then is the decision delegated to the parent class.
        """
        chain_is_full = len(self.values[key]) == self.charge_factor
        if chain_is_full and self.values.count(None) == 0:
            return super()._collision_resolution(key, data)
        return key
number_theory/prime_numbers.py
1 #!/usr/bin/env python3
2 """
3 Module for operations with prime numbers
4 """
5
6 import math
7
8
9 def is_prime(number: int) -> bool:
2
tests/test_hash_map.py
1 from operator import delitem, getitem, setitem
2
3 import pytest
4
5 from data_structures.hashing.hash_map import HashMap
6
7
8 def _get(k):
9 return getitem, k
10
11
12 def _set(k, v):
13 return setitem, k, v
14
15
16 def _del(k):
17 return delitem, k
tests/test_hash_map.py 3
18
19
20 def _run_operation(obj, fun, *args):
21 try:
22 return fun(obj, *args), None
23 except Exception as e:
24 return None, e
25
26
# Each fixture below is a sequence of operation tuples built by _get/_set/_del.

# Store two different keys.
_add_items = (
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
)

# Store the same key twice with different values.
_overwrite_items = [_set("key_a", value) for value in ("val_a", "val_b")]

# Store two keys, delete both, then store and delete one of them again.
_delete_items = [
    _set("key_a", "val_a"),
    _set("key_b", "val_b"),
    _del("key_a"),
    _del("key_b"),
    _set("key_a", "val_a"),
    _del("key_a"),
]

# Read and delete a key both before it is stored and after it is deleted.
_access_absent_items = [
    _get("key_a"),
    _del("key_a"),
    _set("key_a", "val_a"),
    _del("key_a"),
    _del("key_a"),
    _get("key_a"),
]

_add_with_resize_up = [_set(x, x) for x in range(5)]  # guaranteed upsize

_add_with_resize_down = (
    [_set(x, x) for x in range(5)]  # guaranteed upsize
    + [_del(x) for x in range(5)]
    + [_set("key_a", "val_b")]
)
64
65
@pytest.mark.parametrize(
    "operations",
    [
        pytest.param(_add_items, id="add items"),
        pytest.param(_overwrite_items, id="overwrite items"),
        pytest.param(_delete_items, id="delete items"),
        pytest.param(_access_absent_items, id="access absent items"),
        pytest.param(_add_with_resize_up, id="add with resize up"),
        pytest.param(_add_with_resize_down, id="add with resize down"),
    ],
)
def test_hash_map_is_the_same_as_dict(operations):
    """
    Replay ``operations`` on a ``HashMap`` and on a built-in ``dict``.

    After every operation the returned value, the text of any raised
    exception, the key set, the length and the item set must be equal for
    both containers.
    """
    my = HashMap(initial_block_size=4)
    py = {}
    # Each operation is an (operator function, *arguments) tuple; the index
    # that enumerate() used to produce was never read, so iterate directly.
    for fun, *args in operations:
        my_res, my_exc = _run_operation(my, fun, *args)
        py_res, py_exc = _run_operation(py, fun, *args)
        assert my_res == py_res
        assert str(my_exc) == str(py_exc)
        assert set(py) == set(my)
        assert len(py) == len(my)
        assert set(my.items()) == set(py.items())
88
4
89
def test_no_new_methods_was_added_to_api():
    """Check that HashMap's public names form a strict subset of dict's."""

    def public_names(obj):
        # A name is public when it has no leading underscore.
        return {name for name in dir(obj) if not name.startswith("_")}

    dict_public_names = public_names({})
    hash_public_names = public_names(HashMap())

    assert dict_public_names > hash_public_names
quadratic_probing.py
1 #!/usr/bin/env python3
2
3 from .hash_table import HashTable
4
5
class QuadraticProbing(HashTable):
    """
    Basic Hash Table example with open addressing using Quadratic Probing
    """

    def __init__(self, *args, **kwargs):
        """Pass all arguments through to the ``HashTable`` constructor."""
        super().__init__(*args, **kwargs)

    def _collision_resolution(self, key, data=None):  # noqa: ARG002
        """
        Find a slot for a colliding entry with quadratic probing.

        Starting from the original hash index ``key``, the squares of
        1, 2, 3, ... are added in turn and each sum is hashed again until a
        slot is found that is empty or already holds a value equal to ``key``.
        The search stops with ``None`` once the load reported by
        ``balanced_factor()`` has reached ``lim_charge``.

        Hash + 1², Hash + 2², Hash + 3² .... Hash + n²

        reference:
            - https://en.wikipedia.org/wiki/Quadratic_probing
        e.g:
        1. Create hash table with size 7
        >>> qp = QuadraticProbing(7)
        >>> qp.insert_data(90)
        >>> qp.insert_data(340)
        >>> qp.insert_data(24)
        >>> qp.insert_data(45)
        >>> qp.insert_data(99)
        >>> qp.insert_data(73)
        >>> qp.insert_data(7)
        >>> qp.keys()
        {11: 45, 14: 99, 7: 24, 0: 340, 5: 73, 6: 90, 8: 7}

        2. Create hash table with size 8
        >>> qp = QuadraticProbing(8)
        >>> qp.insert_data(0)
        >>> qp.insert_data(999)
        >>> qp.insert_data(111)
        >>> qp.keys()
        {0: 0, 7: 999, 3: 111}

        3. Try to add three data elements when the size is two
        >>> qp = QuadraticProbing(2)
        >>> qp.insert_data(0)
        >>> qp.insert_data(999)
        >>> qp.insert_data(111)
        >>> qp.keys()
        {0: 0, 4: 999, 1: 111}

        4. Try to add three data elements when the size is one
        >>> qp = QuadraticProbing(1)
        >>> qp.insert_data(0)
        >>> qp.insert_data(999)
        >>> qp.insert_data(111)
        >>> qp.keys()
        {4: 999, 1: 111}
        """

        probe = 1
        slot = self.hash_function(key + probe * probe)

        while self.values[slot] is not None and self.values[slot] != key:
            probe += 1
            if self.balanced_factor() >= self.lim_charge:
                # Load limit reached: report that no slot was found.
                return None
            slot = self.hash_function(key + probe * probe)

        return slot
79
80
if __name__ == "__main__":
    # Run the doctests embedded in this module when it is executed as a script.
    from doctest import testmod

    testmod()
double_hash.py
1 #!/usr/bin/env python3
2 """
3 Double hashing is a collision resolving technique in Open Addressed Hash tables.
4 Double hashing uses the idea of applying a second hash function to key when a collision
5 occurs. The advantage of double hashing is that it is one of the best forms of probing,
6 producing a uniform distribution of records throughout a hash table. This technique
7 does not yield any clusters. It is an effective method for resolving collisions.
8
9 Double hashing can be done using: (hash1(key) + i * hash2(key)) % TABLE_SIZE
10 Where hash1() and hash2() are hash functions and TABLE_SIZE is size of hash table.
11
12 Reference: https://en.wikipedia.org/wiki/Double_hashing
13 """
14
15 from .hash_table import HashTable
16 from .number_theory.prime_numbers import is_prime, next_prime
17
18
19 class DoubleHash(HashTable):
20 """
21 Hash Table example with open addressing and Double Hash
22 """
23
def __init__(self, *args, **kwargs):
    """Pass all positional and keyword arguments to the parent constructor."""
    super().__init__(*args, **kwargs)
26
def __hash_function_2(self, value, data):
    """
    Return the secondary hash of ``data``.

    The modulus is ``value % size_table`` when ``is_prime`` accepts it,
    otherwise the result of ``next_prime`` for that remainder (presumably the
    next larger prime; ``next_prime`` is defined elsewhere). The result is
    ``modulus - (data % modulus)``.
    """
    remainder = value % self.size_table
    if is_prime(remainder):
        modulus = remainder
    else:
        modulus = next_prime(remainder)
    return modulus - (data % modulus)
34
35 def __hash_double_function(self, key, data, increment):
36 return (increment * self.__hash_function_2(key, data)) % self.size_table
37
38 def _collision_resolution(self, key, data=None):
39 """
40 Examples:
41
6
bloom_filter.py
1 """
2 See https://en.wikipedia.org/wiki/Bloom_filter
3
4 The use of this data structure is to test membership in a set.
5 Compared to Python's built-in set() it is more space-efficient.
6 In the following example, only 8 bits of memory will be used:
7 >>> bloom = Bloom(size=8)
8
9 Initially, the filter contains all zeros:
10 >>> bloom.bitstring
11 '00000000'
12
13 When an element is added, two bits are set to 1
14 since there are 2 hash functions in this implementation:
15 >>> "Titanic" in bloom
16 False
17 >>> bloom.add("Titanic")
18 >>> bloom.bitstring
19 '01100000'
20 >>> "Titanic" in bloom
21 True
22
bloom_filter.py 7
94 position = (
95 int.from_bytes(func(value.encode()).digest(), "little") % self.size
96 )
97 res |= 2**position
98 return res
99
def format_hash(self, value: str) -> str:
    """Hash ``value`` with ``hash_`` and render the result with ``format_bin``."""
    hashed = self.hash_(value)
    return self.format_bin(hashed)
102
@property
def estimated_error_rate(self) -> float:
    """
    Return the share of 1-bits in ``bitarray`` (relative to ``size``) raised
    to the power of the number of entries in ``HASH_FUNCTIONS``.
    """
    set_bits = bin(self.bitarray).count("1")
    fill_ratio = set_bits / self.size
    return fill_ratio ** len(HASH_FUNCTIONS)
hash_table.py
1 #!/usr/bin/env python3
2 from abc import abstractmethod
3
4 from .number_theory.prime_numbers import next_prime
5
6
7 class HashTable:
8 """
9 Basic Hash Table example with open addressing and linear probing
10 """
11
12 def __init__(
13 self,
14 size_table: int,
15 charge_factor: int | None = None,
16 lim_charge: float | None = None,
17 ) -> None:
18 self.size_table = size_table
19 self.values = [None] * self.size_table
20 self.lim_charge = 0.75 if lim_charge is None else lim_charge
21 self.charge_factor = 1 if charge_factor is None else charge_factor
22 self.__aux_list: list = []
23 self._keys: dict = {}
24
def keys(self):
    """
    Return the dictionary that maps each used slot index of the hash table
    to the data value stored in that slot.

    Examples:
    1. creating HashTable with size 10 and inserting 3 elements
    >>> ht = HashTable(10)
    >>> ht.insert_data(10)
    >>> ht.insert_data(20)
    >>> ht.insert_data(30)
    >>> ht.keys()
    {0: 10, 1: 20, 2: 30}

    2. creating HashTable with size 5 and inserting 5 elements
    >>> ht = HashTable(5)
    >>> ht.insert_data(5)
    >>> ht.insert_data(4)
    >>> ht.insert_data(3)
    >>> ht.insert_data(2)
    >>> ht.insert_data(1)
    >>> ht.keys()
    {0: 5, 4: 4, 3: 3, 2: 2, 1: 1}
    """
    slot_to_data = self._keys
    return slot_to_data
50
def balanced_factor(self):
    """
    Return the number of slots that are not ``None`` divided by
    ``size_table * charge_factor``.
    """
    occupied = sum(slot is not None for slot in self.values)
    capacity = self.size_table * self.charge_factor
    return occupied / capacity
hash_table.py 9
55
def hash_function(self, key):
    """
    Return ``key`` reduced modulo the table size.

    Examples:

    Creating HashTable with size 5
    >>> ht = HashTable(5)
    >>> ht.hash_function(10)
    0
    >>> ht.hash_function(20)
    0
    >>> ht.hash_function(4)
    4
    >>> ht.hash_function(18)
    3
    >>> ht.hash_function(-18)
    2
    >>> ht.hash_function(18.5)
    3.5
    >>> ht.hash_function(0)
    0
    >>> ht.hash_function(-0)
    0
    """
    table_size = self.size_table
    return key % table_size
82
83 def _step_by_step(self, step_ord):
84 print(f"step {step_ord}")
85 print(list(range(len(self.values))))
86 print(self.values)
87
def bulk_insert(self, values):
    """
    Insert every element of ``values`` in order, printing the table state
    after each insertion.

    The steps are numbered from 1. ``values`` is also kept on the instance
    as the private auxiliary list.

    Examples:
    1.
    >>> ht = HashTable(5)
    >>> ht.bulk_insert((10,20,30))
    step 1
    [0, 1, 2, 3, 4]
    [10, None, None, None, None]
    step 2
    [0, 1, 2, 3, 4]
    [10, 20, None, None, None]
    step 3
    [0, 1, 2, 3, 4]
    [10, 20, 30, None, None]

    2.
    >>> ht = HashTable(5)
    >>> ht.bulk_insert([5,4,3,2,1])
    step 1
    [0, 1, 2, 3, 4]
    [5, None, None, None, None]
    step 2
    [0, 1, 2, 3, 4]
    [5, None, None, None, 4]
    step 3
    [0, 1, 2, 3, 4]
    [5, None, None, 3, 4]
    step 4
    [0, 1, 2, 3, 4]
    [5, None, 2, 3, 4]
    step 5
    [0, 1, 2, 3, 4]
    [5, 1, 2, 3, 4]
    """
    self.__aux_list = values
    # enumerate() supplies the 1-based step number instead of a manual counter.
    for step, value in enumerate(values, start=1):
        self.insert_data(value)
        self._step_by_step(step)
132
133 def _set_value(self, key, data):
134 """
135 _set_value functions allows to update value at a particular hash
136
137 Examples:
138 1. _set_value in HashTable of size 5
139 >>> ht = HashTable(5)
140 >>> ht.insert_data(10)
141 >>> ht.insert_data(20)
142 >>> ht.insert_data(30)
143 >>> ht._set_value(0,15)
144 >>> ht.keys()
145 {0: 15, 1: 20, 2: 30}
146
147 2. _set_value in HashTable of size 2
148 >>> ht = HashTable(2)
149 >>> ht.insert_data(17)
150 >>> ht.insert_data(18)
151 >>> ht.insert_data(99)
152 >>> ht._set_value(3,15)
153 >>> ht.keys()
154 {3: 15, 2: 17, 4: 99}
155
156 3. _set_value in HashTable when hash is not present
157 >>> ht = HashTable(2)
158 >>> ht.insert_data(17)
159 >>> ht.insert_data(18)
160 >>> ht.insert_data(99)
161 >>> ht._set_value(0,15)
162 >>> ht.keys()
163 {3: 18, 2: 17, 4: 99, 0: 15}
164
165 4. _set_value in HashTable when multiple hash are not present
166 >>> ht = HashTable(2)
167 >>> ht.insert_data(17)
168 >>> ht.insert_data(18)
169 >>> ht.insert_data(99)
170 >>> ht._set_value(0,15)
171 >>> ht._set_value(1,20)
172 >>> ht.keys()
173 {3: 18, 2: 17, 4: 99, 0: 15, 1: 20}
174 """
175 self.values[key] = data
176 self._keys[key] = data
177
178 @abstractmethod
179 def _collision_resolution(self, key, data=None):
180 """
181 This method is a type of open addressing which is used for handling collision.
182
183 In this implementation the concept of linear probing has been used.
184
185 The hash table is searched sequentially from the original location of the
186 hash, if the new hash/location we get is already occupied we check for the next
187 hash/location.
188
189 references:
190 - https://en.wikipedia.org/wiki/Linear_probing
191
192 Examples:
193 1. The collision will be with keys 18 & 99, so new hash will be created for 99
194 >>> ht = HashTable(3)
195 >>> ht.insert_data(17)
196 >>> ht.insert_data(18)
hash_table.py 11
hash_map.py
1 """
2 Hash map with open addressing.
3
4 https://en.wikipedia.org/wiki/Hash_table
5
6 Another hash map implementation, with a good explanation.
7 Modern Dictionaries by Raymond Hettinger
8 https://www.youtube.com/watch?v=p33CVV29OG8
9 """
10
11 from collections.abc import Iterator, MutableMapping
12 from dataclasses import dataclass
13 from typing import Generic, TypeVar
14
15 KEY = TypeVar("KEY")
16 VAL = TypeVar("VAL")
17
18
19 @dataclass(frozen=True, slots=True)
20 class _Item(Generic[KEY, VAL]):
21 key: KEY
22 val: VAL
23
24
class _DeletedItem(_Item):
    """Placeholder item with ``None`` key and value that evaluates as false."""

    def __init__(self) -> None:
        """Create the placeholder with both key and value set to ``None``."""
        super().__init__(None, None)

    def __bool__(self) -> bool:
        """Always report the placeholder as falsy."""
        return False


# Module-wide placeholder instance.
_deleted = _DeletedItem()
34
35
36 class HashMap(MutableMapping[KEY, VAL]):
37 """
38 Hash map with open addressing.
39 """
40
41 def __init__(
42 self, initial_block_size: int = 8, capacity_factor: float = 0.75
43 ) -> None:
44 self._initial_block_size = initial_block_size
45 self._buckets: list[_Item | None] = [None] * initial_block_size
46 assert 0.0 < capacity_factor < 1.0
47 self._capacity_factor = capacity_factor
48 self._len = 0
49
50 def _get_bucket_index(self, key: KEY) -> int:
51 return hash(key) % len(self._buckets)
hash_map.py 13
52
53 def _get_next_ind(self, ind: int) -> int:
54 """
55 Get next index.
56
57 Implements linear open addressing.
58 >>> HashMap(5)._get_next_ind(3)
59 4
60 >>> HashMap(5)._get_next_ind(5)
61 1
62 >>> HashMap(5)._get_next_ind(6)
63 2
64 >>> HashMap(5)._get_next_ind(9)
65 0
66 """
67 return (ind + 1) % len(self._buckets)
68
def _try_set(self, ind: int, key: KEY, val: VAL) -> bool:
    """
    Try to store ``key``/``val`` in bucket ``ind``.

    A falsy bucket (``None`` or the falsy deleted placeholder) is filled and
    the item count grows by one. A bucket that already holds ``key`` is
    overwritten and the count stays the same. Both cases return True.

    A bucket holding a different key is left untouched and False is
    returned, which tells the caller to try the next bucket.
    """
    occupant = self._buckets[ind]
    if not occupant:
        is_new_entry = True
    elif occupant.key == key:
        is_new_entry = False
    else:
        return False

    self._buckets[ind] = _Item(key, val)
    if is_new_entry:
        self._len += 1
    return True
88
89 def _is_full(self) -> bool:
90 """
91 Return true if we have reached safe capacity.
92
93 So we need to increase the number of buckets to avoid collisions.
94
95 >>> hm = HashMap(2)
96 >>> hm._add_item(1, 10)
97 >>> hm._add_item(2, 20)
98 >>> hm._is_full()
99 True
100 >>> HashMap(2)._is_full()
101 False
102 """
103 limit = len(self._buckets) * self._capacity_factor
104 return len(self) >= int(limit)
105
106 def _is_sparse(self) -> bool:
107 """Return true if we need twice fewer buckets when we have now."""
108 if len(self._buckets) <= self._initial_block_size:
109 return False
110 limit = len(self._buckets) * self._capacity_factor / 2
111 return len(self) < limit
112
113 def _resize(self, new_size: int) -> None:
114 old_buckets = self._buckets
115 self._buckets = [None] * new_size
116 self._len = 0
117 for item in old_buckets:
118 if item:
119 self._add_item(item.key, item.val)
120
121 def _size_up(self) -> None:
122 self._resize(len(self._buckets) * 2)
hash_map.py 14
123
124 def _size_down(self) -> None:
125 self._resize(len(self._buckets) // 2)
126
127 def _iterate_buckets(self, key: KEY) -> Iterator[int]:
128 ind = self._get_bucket_index(key)
129 for _ in range(len(self._buckets)):
130 yield ind
131 ind = self._get_next_ind(ind)
132
133 def _add_item(self, key: KEY, val: VAL) -> None:
134 """
135 Try to add 3 elements when the size is 5
136 >>> hm = HashMap(5)
137 >>> hm._add_item(1, 10)
138 >>> hm._add_item(2, 20)
139 >>> hm._add_item(3, 30)
140 >>> hm
141 HashMap(1: 10, 2: 20, 3: 30)
142
143 Try to add 3 elements when the size is 5
144 >>> hm = HashMap(5)
145 >>> hm._add_item(-5, 10)
146 >>> hm._add_item(6, 30)
147 >>> hm._add_item(-7, 20)
148 >>> hm
149 HashMap(-5: 10, 6: 30, -7: 20)
150
151 Try to add 3 elements when size is 1
152 >>> hm = HashMap(1)
153 >>> hm._add_item(10, 13.2)
154 >>> hm._add_item(6, 5.26)
155 >>> hm._add_item(7, 5.155)
156 >>> hm
157 HashMap(10: 13.2)
158
159 Trying to add an element with a key that is a floating point value
160 >>> hm = HashMap(5)
161 >>> hm._add_item(1.5, 10)
162 >>> hm
163 HashMap(1.5: 10)
164
165 5. Trying to add an item with the same key
166 >>> hm = HashMap(5)
167 >>> hm._add_item(1, 10)
168 >>> hm._add_item(1, 20)
169 >>> hm
170 HashMap(1: 20)
171 """
172 for ind in self._iterate_buckets(key):
173 if self._try_set(ind, key, val):
174 break
175
176 def __setitem__(self, key: KEY, val: VAL) -> None:
177 """
178 1. Changing value of item whose key is present
179 >>> hm = HashMap(5)
180 >>> hm._add_item(1, 10)
181 >>> hm.__setitem__(1, 20)
182 >>> hm
183 HashMap(1: 20)
184
185 2. Changing value of item whose key is not present
186 >>> hm = HashMap(5)
187 >>> hm._add_item(1, 10)
188 >>> hm.__setitem__(0, 20)
189 >>> hm
190 HashMap(0: 20, 1: 10)
191
192 3. Changing the value of the same item multiple times
193 >>> hm = HashMap(5)
hash_map.py 15