source: git/machine_learning/common/keyword_vector.py @ f59883

spielwiese
Last change on this file since f59883 was f59883, checked in by Murray Heymann <heymann.murray@…>, 4 years ago
Expand testing
  • Property mode set to 100755
File size: 8.6 KB
Line 
1#!/usr/bin/python3
2
3"""Some vector logic"""
4
5import os
6import re
7import numpy as np
8
9from common.constants import KEYWORDS_FILE
10
11### Read from file ########################################################
12
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Load a keyword dictionary from a plain-text file.

    Each line of the file becomes one entry with surrounding whitespace
    stripped (a blank line therefore yields an empty-string entry).  The
    entries are returned as a numpy array, in file order.

    Raises FileNotFoundError, after printing a hint, when `filename` is
    not an existing regular file.
    """
    file_exists = os.path.isfile(filename)
    if not file_exists:
        print("Please provide a valid input file as argument to read "
              "dictionary")
        raise FileNotFoundError

    with open(filename, "r") as handle:
        # Iterating the handle yields exactly the lines readline() would
        entries = [raw_line.strip() for raw_line in handle]
    return np.array(entries)
31
32
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each dictionary keyword occurs in a file.

    The file is read line by line.  Every character that is not an ASCII
    letter, a digit, a hyphen or an underscore is treated as a separator,
    and each resulting word that is a dictionary keyword increments that
    keyword's count.

    Arguments:
    filename   -- path of the text file to scan
    dictionary -- numpy array of keywords (must not be None or empty)
    normalise  -- when true, the counts are passed through
                  normalise_vector before being returned

    Returns a numpy array with one entry per distinct keyword, in the
    order the keywords first appear in `dictionary`.

    Raises FileNotFoundError when `filename` is not an existing file and
    AssertionError when the dictionary is None or empty.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        raise FileNotFoundError("No such file: " + str(filename))
    assert dictionary is not None, \
            "Please provide a valid dictionary as argument"
    assert not dictionary.size == 0, \
            "Please provide a valid dictionary as argument"

    vector = create_vector_dictionary(dictionary)
    # Raw string: same pattern as before, without the invalid "\-" / "\_"
    # string escapes.  Compiled once instead of once per line.
    separators = re.compile(r'[^0-9a-zA-Z\-_]')
    # The file is only read, so open it read-only; "r+" would fail on
    # files the user may read but not write.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(' ', line).split():
                if word in vector:
                    vector[word] += 1
    vector = np.array(list(vector.values()))
    if normalise:
        vector = normalise_vector(vector)
    return vector
60
61
62### Copying ###############################################################
63
def copy_dictionary(dictionary):
    """
    Return an independent numpy-array copy of a dictionary.

    `dictionary` may be a numpy array or any array-like (e.g. a list of
    keywords).  The result never shares memory with the input.
    """
    # np.array() already copies its input, so a second np.copy() on top
    # of it would only allocate and discard an extra array.
    return np.array(dictionary, copy=True)
69
70
def copy_vector(vector):
    """
    Return an independent numpy-array copy of a vector.

    `vector` may be a numpy array or any array-like (e.g. a list of
    numbers).  The result never shares memory with the input.
    """
    # np.array() already copies its input, so a second np.copy() on top
    # of it would only allocate and discard an extra array.
    return np.array(vector, copy=True)
76
77
78### Vector specific logic #################################################
79
def create_vector_dictionary(dictionary):
    """
    Build a lookup table that maps every keyword of `dictionary` to 0.

    Raises AssertionError when `dictionary` is None or has no entries.
    Repeated keywords collapse into a single key.
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return {keyword: 0 for keyword in dictionary}
90
91
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Both arguments may be numpy arrays or plain sequences (lists, tuples)
    of numbers; they must have the same length.

    Raises AssertionError when the lengths differ.
    """
    assert len(vec1) == len(vec2), \
            "Vectors don't have the same sizes"

    # asarray leaves ndarrays untouched and lets plain sequences be
    # subtracted element-wise instead of raising TypeError.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
102
103
def normalise_vector(vec):
    """
    Scale a vector to Euclidean length 1.

    A vector whose norm is 0 (including an empty one) cannot be scaled
    and is returned unscaled.  Warnings are printed for None, for input
    that is not a numpy array and for an empty vector.

    Returns a numpy array in every case: an empty array for None, the
    input array itself when its norm is 0, otherwise a new scaled array.
    """

    if vec is None:
        print("Warning, None is not a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")
        # Convert here so that the zero-norm path also returns an ndarray
        # rather than handing the caller's list straight back.
        vec = np.asarray(vec)

    if vec.size == 0:
        print("Warning, vector being normalised is empty")

    norm = np.linalg.norm(vec)
    if norm == 0:
        return vec
    return vec / norm
125
def test_read_dictionary():
    """
    Exercise read_dictionary with a missing file and with the default
    keyword file, reporting coloured pass/fail lines on stdout.
    """
    print("\033[1;32;40mTesting read_dictionary function:\033[1;37;40m")

    # A missing file must raise FileNotFoundError
    print("Non-existant file")
    try:
        read_dictionary("asdfasdf")
    except FileNotFoundError:
        print("correctly caught non-existant file")
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    # The default keyword file is expected to be present
    print("Reading default file")
    try:
        read_dictionary()
    except FileNotFoundError:
        print("Default file for dictionary missing")
        print("\033[1;31;40mfail\033[1;37;40m")
    else:
        print("\033[1;32;40mpass\033[1;37;40m")
    print()
    print()
155
def test_count_occurances():
    """
    Exercise count_occurances with a missing file, a None dictionary and
    an empty dictionary, then print the vector computed for
    ../Singular/table.h.  Results are reported on stdout.
    """
    print("\033[1;32;40mTesting count_occurances function:\033[1;37;40m")
    keywords = read_dictionary()

    # A missing input file must raise FileNotFoundError
    try:
        count_occurances("asdfasdf", keywords)
    except FileNotFoundError:
        print("Correctly raised FileNotFoundError")
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    print("Count occurances with None dictionary:")
    try:
        count_occurances("../Singular/table.h", None)
    except AssertionError:
        print("Correctly caught AssertionError")
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    print("Count occurances with empty dictionary:")
    try:
        count_occurances("../Singular/table.h", np.array([]))
    except AssertionError:
        print("Correctly caught AssertionError")
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", keywords))
    print()
    print()
201
def test_create_vector_dictionary():
    """
    Check that create_vector_dictionary rejects None and an empty
    dictionary with an AssertionError, reporting pass/fail on stdout.
    """
    print("\033[1;32;40mTesting create_vector_dictionary " \
            "function:\033[1;37;40m")
    # Result is unused; the call only confirms the default file is readable
    read_dictionary()

    print("Create Vector Dictionary with None as dictionary:")
    try:
        create_vector_dictionary(None)
    except AssertionError:
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    print("Create Vector Dictionary with empty dictionary:")
    try:
        create_vector_dictionary(np.array([]))
    except AssertionError:
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    print()
    print()
232
def test_vector_distance():
    """
    Check vector_distance on vectors of mismatched length, on a vector
    against itself and on two distinct unit vectors.
    """
    print("\033[1;32;40mTesting vector_distance function:\033[1;37;40m")

    unit_a = normalise_vector(np.array([3, 4]))
    unit_b = normalise_vector(np.array([4, 3]))

    # Mismatched lengths must trip the size assertion
    print("Distance of vectors of different dimensions:")
    try:
        vector_distance(np.array([1, 2, 3]), unit_a)
    except AssertionError:
        print("\033[1;32;40mpass\033[1;37;40m")
    else:
        print("\033[1;31;40mfail\033[1;37;40m")

    same_distance = vector_distance(unit_a, unit_a)
    print("Distance same vector: " + str(same_distance))
    assert same_distance == 0, "distance to same vectorshould be 0"
    print("\033[1;32;40mpass\033[1;37;40m")

    other_distance = vector_distance(unit_a, unit_b)
    print("Distance different vector: " + str(other_distance))
    assert other_distance > 0, \
            "Distance between nonequal vectors should be strictly positive"
    print("\033[1;32;40mpass\033[1;37;40m")
    print()
    print()
266
def test_normalise_vector():
    """
    Check normalise_vector on a regular vector, the zero vector, a plain
    list, an empty vector and None, printing the results to stdout.

    Only the first case is asserted; the remaining cases merely have to
    complete without raising.
    """
    print("\033[1;32;40mTesting normalise_vector function:\033[1;37;40m")
    testvector = np.array([3, 4])
    testvector = normalise_vector(testvector)
    # Compare with a tolerance: the norm of a scaled float vector is only
    # guaranteed to be 1 up to rounding error, so "== 1" is fragile.
    assert np.isclose(np.linalg.norm(testvector), 1), \
            "Normalised vector should have norm of 1"
    print("\033[1;32;40mpass\033[1;37;40m")
    print("normalised vector: " + str(testvector))
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise the zero vector")
    print(normalise_vector(np.array([0, 0, 0, 0, 0])))
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise list")
    print(normalise_vector([3, 4, 0, 0, 0]))
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise empty vector")
    print(normalise_vector(np.array([])))
    print("\033[1;32;40mpass\033[1;37;40m")

    print("Attempt to normalise None")
    print(normalise_vector(None))
    print("\033[1;32;40mpass\033[1;37;40m")
    print()
    print()
297
298
def main():
    """
    Run every self-test of this module, in a fixed order.
    """
    test_sequence = (
        test_normalise_vector,
        test_vector_distance,
        test_read_dictionary,
        test_count_occurances,
        test_create_vector_dictionary,
    )
    for run_test in test_sequence:
        run_test()
312
# Run the self-tests only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.