1 | #!/usr/bin/python3 |
---|
2 | |
---|
3 | """Some vector logic""" |
---|
4 | |
---|
5 | import os |
---|
6 | import re |
---|
7 | import numpy as np |
---|
8 | |
---|
9 | from common.constants import KEYWORDS_FILE |
---|
10 | |
---|
11 | ### Read from file ######################################################## |
---|
12 | |
---|
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Load a keyword dictionary from a plain-text file.

    Every line of the file becomes one entry with surrounding whitespace
    stripped; blank lines are kept as empty strings.

    Args:
        filename: path of the text file to read (defaults to
            KEYWORDS_FILE).

    Returns:
        A numpy array holding one entry per line, in file order.

    Raises:
        FileNotFoundError: if filename is not an existing regular file.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument to read "
              "dictionary")
        raise FileNotFoundError

    with open(filename, "r") as handle:
        # Iterating the handle yields exactly the lines that repeated
        # readline() calls would return up to end of file.
        entries = [raw_line.strip() for raw_line in handle]
    return np.array(entries)
---|
31 | |
---|
32 | |
---|
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each dictionary word occurs in a text file.

    Each line of the file is split into words by replacing every
    character that is not an ASCII letter, a digit, '-' or '_' with a
    space and splitting on whitespace. Words that are keys of the lookup
    table built by create_vector_dictionary are counted.

    Args:
        filename: path of the file to scan.
        dictionary: non-empty numpy array of keywords.
        normalise: when true, the count vector is passed through
            normalise_vector before it is returned.

    Returns:
        A numpy array with one count per key of the lookup table, in the
        table's iteration order (normalised if requested).

    Raises:
        FileNotFoundError: if filename is not an existing regular file.
        AssertionError: if dictionary is None or has size 0.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        raise FileNotFoundError("No such file: " + str(filename))
    # Raised explicitly instead of via ``assert`` so that the validation
    # is not stripped when Python runs with -O; callers still catch
    # AssertionError.
    if dictionary is None or dictionary.size == 0:
        raise AssertionError("Please provide a valid dictionary as argument")

    counts = create_vector_dictionary(dictionary)
    # Raw string: the former non-raw pattern relied on the invalid escape
    # sequences "\-" and "\_". The character class is unchanged.
    separators = re.compile(r'[^0-9a-zA-Z_-]')
    # The file is only read, so open it read-only; "r+" needlessly
    # required write permission and failed on read-only files.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(' ', line).split():
                if word in counts:
                    counts[word] += 1

    vector = np.array(list(counts.values()))
    if normalise:
        vector = normalise_vector(vector)
    return vector
---|
60 | |
---|
61 | |
---|
62 | ### Copying ############################################################### |
---|
63 | |
---|
def copy_dictionary(dictionary):
    """
    Produce an independent numpy copy of a dictionary.

    Args:
        dictionary: array-like of dictionary entries.

    Returns:
        A new numpy array with the same entries; modifying it does not
        affect the argument.
    """
    as_array = np.array(dictionary)
    return np.copy(as_array)
---|
69 | |
---|
70 | |
---|
def copy_vector(vector):
    """
    Produce an independent numpy copy of a vector.

    Args:
        vector: array-like of vector entries.

    Returns:
        A new numpy array with the same entries; modifying it does not
        affect the argument.
    """
    as_array = np.array(vector)
    return np.copy(as_array)
---|
76 | |
---|
77 | |
---|
78 | ### Vector specific logic ################################################# |
---|
79 | |
---|
def create_vector_dictionary(dictionary):
    """
    Build a zero-initialised lookup table for a dictionary.

    Args:
        dictionary: non-empty iterable of hashable words (for example a
            numpy array or a list of strings).

    Returns:
        A dict mapping every distinct entry of dictionary to 0.

    Raises:
        AssertionError: if dictionary is None or contains no entries.
    """
    # Raised explicitly instead of via ``assert`` so that the validation
    # is not stripped when Python runs with -O; callers still catch
    # AssertionError.
    if dictionary is None or np.array(dictionary).size == 0:
        raise AssertionError("Please give a dictionary")
    return dict.fromkeys(dictionary, 0)
---|
90 | |
---|
91 | |
---|
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Args:
        vec1: first vector (numpy array or any sequence of numbers).
        vec2: second vector, with the same length as vec1.

    Returns:
        The norm of vec1 - vec2 as computed by np.linalg.norm.

    Raises:
        AssertionError: if the vectors differ in length.
    """
    # Raised explicitly instead of via ``assert``: with the check stripped
    # under -O, a length-1 vector would silently broadcast against a
    # longer one and yield a wrong distance.
    if len(vec1) != len(vec2):
        raise AssertionError("Vectors don't have the same sizes")

    # asarray lets plain lists and tuples be subtracted element-wise;
    # numpy arrays pass through without being copied.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
---|
102 | |
---|
103 | |
---|
def normalise_vector(vec):
    """
    Scale a vector so that its Euclidean norm becomes 1.

    Args:
        vec: the vector to normalise. A numpy array is expected; other
            array-likes are accepted with a warning and converted.

    Returns:
        A numpy array: vec divided by its norm, or vec itself (unscaled)
        when the norm is 0, which covers the zero vector and the empty
        vector. For None an empty numpy array is returned.
    """
    if vec is None:
        print("Warning, None is not a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")
        # Convert once so that every path returns an ndarray; previously
        # a zero-norm list was handed back as a plain list.
        vec = np.asarray(vec)

    if vec.size == 0:
        print("Warning, vector being normalised is empty")

    norm = np.linalg.norm(vec)
    # A zero norm cannot be scaled to 1; leave such vectors untouched
    # instead of dividing by zero.
    if norm != 0:
        vec = vec / norm
    return vec
---|
125 | |
---|
def test_read_dictionary():
    """
    Exercise read_dictionary and print a coloured pass/fail per case.

    Two cases are checked: a non-existent file must raise
    FileNotFoundError, and reading the default file must not.
    """
    passed = "\033[1;32;40mpass\033[1;37;40m"
    failed = "\033[1;31;40mfail\033[1;37;40m"
    print("\033[1;32;40mTesting read_dictionary function:\033[1;37;40m")

    print("Non-existant file")
    try:
        read_dictionary("asdfasdf")
    except FileNotFoundError:
        print("correctly caught non-existant file")
        print(passed)
    else:
        # No exception means the missing file went unnoticed.
        print(failed)

    print("Reading default file")
    try:
        read_dictionary()
    except FileNotFoundError:
        print("Default file for dictionary missing")
        print(failed)
    else:
        print(passed)
    print()
    print()
---|
155 | |
---|
def test_count_occurances():
    """
    Exercise count_occurances and print a coloured pass/fail per case.

    Checked cases: a non-existent input file must raise
    FileNotFoundError; a None dictionary and an empty dictionary must
    each raise AssertionError. Finally the vector computed for
    ../Singular/table.h is printed.
    """
    passed = "\033[1;32;40mpass\033[1;37;40m"
    failed = "\033[1;31;40mfail\033[1;37;40m"
    print("\033[1;32;40mTesting count_occurances function:\033[1;37;40m")
    keywords = read_dictionary()

    try:
        count_occurances("asdfasdf", keywords)
    except FileNotFoundError:
        print("Correctly raised FileNotFoundError")
        print(passed)
    else:
        print(failed)

    # Both invalid dictionaries are expected to be rejected the same way.
    invalid_cases = (
        ("Count occurances with None dictionary:", None),
        ("Count occurances with empty dictionary:", np.array([])),
    )
    for title, bad_dictionary in invalid_cases:
        print(title)
        try:
            count_occurances("../Singular/table.h", bad_dictionary)
        except AssertionError:
            print("Correctly caught AssertionError")
            print(passed)
        else:
            print(failed)

    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", keywords))
    print()
    print()
---|
201 | |
---|
def test_create_vector_dictionary():
    """
    Exercise create_vector_dictionary and print a coloured pass/fail.

    A None dictionary and an empty dictionary must each raise
    AssertionError.
    """
    passed = "\033[1;32;40mpass\033[1;37;40m"
    failed = "\033[1;31;40mfail\033[1;37;40m"
    print("\033[1;32;40mTesting create_vector_dictionary "
          "function:\033[1;37;40m")
    # The result is not used; the call only checks that the default
    # dictionary can be read.
    read_dictionary()

    invalid_cases = (
        ("Create Vector Dictionary with None as dictionary:", None),
        ("Create Vector Dictionary with empty dictionary:", np.array([])),
    )
    for title, bad_dictionary in invalid_cases:
        print(title)
        try:
            create_vector_dictionary(bad_dictionary)
        except AssertionError:
            print(passed)
        else:
            print(failed)

    print()
    print()
---|
232 | |
---|
def test_vector_distance():
    """
    Exercise vector_distance and print a coloured pass/fail per case.

    Checked cases: vectors of different lengths must raise
    AssertionError, the distance of a vector to itself must be 0, and
    the distance between two different vectors must be positive.
    """
    passed = "\033[1;32;40mpass\033[1;37;40m"
    failed = "\033[1;31;40mfail\033[1;37;40m"
    print("\033[1;32;40mTesting vector_distance function:\033[1;37;40m")

    first = normalise_vector(np.array([3, 4]))
    second = normalise_vector(np.array([4, 3]))

    print("Distance of vectors of different dimensions:")
    try:
        vector_distance(np.array([1, 2, 3]), first)
    except AssertionError:
        print(passed)
    else:
        print(failed)

    print("Distance same vector: ", vector_distance(first, first), sep="")
    assert vector_distance(first, first) == 0, \
        "distance to same vectorshould be 0"
    print(passed)

    print("Distance different vector: ", vector_distance(first, second),
          sep="")
    assert vector_distance(first, second) > 0, \
        "Distance between nonequal vectors should be strictly positive"
    print(passed)
    print()
    print()
---|
266 | |
---|
def test_normalise_vector():
    """
    Exercise normalise_vector and print a coloured pass per case.

    Checked cases: a regular vector must end up with norm 1, the zero
    vector must stay all-zero, a plain list must be normalised like an
    array, and both the empty vector and None must yield an empty
    result. A failing case raises AssertionError.
    """
    print("\033[1;32;40mTesting normalise_vector function:\033[1;37;40m")
    testvector = np.array([3, 4])
    testvector = normalise_vector(testvector)
    # Compare with a tolerance: the division can leave the norm a
    # rounding error away from exactly 1.
    assert np.isclose(np.linalg.norm(testvector), 1), \
        "Normalised vector should have norm of 1"
    print("\033[1;32;40mpass\033[1;37;40m")
    print("normalised vector: " + str(testvector))
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise the zero vector")
    result = normalise_vector(np.array([0, 0, 0, 0, 0]))
    print(result)
    assert not np.any(result), \
        "Normalising the zero vector should give the zero vector"
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise list")
    result = normalise_vector([3, 4, 0, 0, 0])
    print(result)
    assert np.allclose(result, [0.6, 0.8, 0, 0, 0]), \
        "A list should be normalised like the equivalent array"
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise empty vector")
    result = normalise_vector(np.array([]))
    print(result)
    assert np.array(result).size == 0, \
        "Normalising an empty vector should give an empty vector"
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise None")
    result = normalise_vector(None)
    print(result)
    assert np.array(result).size == 0, \
        "Normalising None should give an empty vector"
    print("\033[1;32;40mpass\033[1;37;40m")
    print()
    print()
---|
297 | |
---|
298 | |
---|
def main():
    """
    Run every self-test of this module, one after another.
    """
    test_suite = (
        test_normalise_vector,
        test_vector_distance,
        test_read_dictionary,
        test_count_occurances,
        test_create_vector_dictionary,
    )
    for run_test in test_suite:
        run_test()
---|
312 | |
---|
# Run the self-tests only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
---|