1 | #!/usr/bin/python3 |
2 | |
3 | """Some vector logic""" |
4 | |
5 | import os |
6 | import re |
7 | import sys |
8 | import numpy as np |
9 | |
10 | from common.constants import KEYWORDS_FILE |
11 | |
12 | ### Read from file ######################################################## |
13 | |
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Load a keyword dictionary from a text file.

    Every line of the file becomes one entry, with surrounding whitespace
    stripped. The entries are returned as a numpy array in file order.
    If `filename` does not point to an existing file, a message is printed
    and an empty numpy array is returned instead.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            # Iterating the handle yields the same lines as repeated readline()
            entries = [raw.strip() for raw in handle]
        return np.array(entries)

    print("Please provide a valid input file as argument")
    return np.array([])
31 | |
32 | |
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each dictionary word occurs in a file.

    The file is split into words made of letters, digits, '-' and '_';
    every other character acts as a separator. The result is a numpy
    array with one count per distinct dictionary word, in dictionary order.

    filename   -- path of the text file to scan
    dictionary -- sequence (list or numpy array) of words to count
    normalise  -- if True, scale the count vector with normalise_vector

    Returns an empty list (after printing a message) if the file does not
    exist or if the dictionary is None or empty.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return []
    # Test for None first: reading .size on None would raise AttributeError.
    # np.asarray lets plain lists (which have no .size) pass the same check.
    if dictionary is None or np.asarray(dictionary).size == 0:
        print("Please provide a valid dictionary as argument")
        return []

    vector = create_vector_dictionary(dictionary)
    # Compiled once; raw string avoids invalid escape sequences
    separators = re.compile(r"[^0-9a-zA-Z_-]")

    # The file is only read, so do not demand write permission ("r+")
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1

    vector = np.array(list(vector.values()))
    if normalise:
        vector = normalise_vector(vector)
    return vector
61 | |
62 | |
63 | ### Copying ############################################################### |
64 | |
def copy_dictionary(dictionary):
    """
    Build a new list holding the entries of `dictionary` in the same order.

    The returned list is a separate object, so changing it does not
    affect the dictionary that was passed in.
    """
    return list(dictionary)
73 | |
74 | |
def copy_vector(vector):
    """
    Produce an independent numpy copy of `vector`.

    The input is first turned into a numpy array and that array is then
    duplicated, so the result never shares data with the argument.
    """
    as_array = np.array(vector)
    duplicate = np.copy(as_array)
    return duplicate
80 | |
81 | |
82 | ### Vector specific logic ################################################# |
83 | |
def create_vector_dictionary(dictionary):
    """
    Map every word of `dictionary` to an initial count of zero.

    The result is a dict keyed by word, in the order the words first
    appear. Asserts that the dictionary is neither None nor empty.
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return dict.fromkeys(dictionary, 0)
94 | |
95 | |
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    vec1, vec2 -- numpy arrays or plain sequences of numbers

    Returns the distance, or -1 (after printing a message) if the two
    vectors do not have the same length.
    """
    if len(vec1) != len(vec2):
        print("Vectors don't have the same sizes")
        return -1

    # Coerce to arrays so that plain lists can be subtracted element-wise;
    # list - list would otherwise raise a TypeError.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
107 | |
108 | |
def normalise_vector(vec):
    """
    Scale a vector so that its Euclidean norm becomes 1.

    vec -- numpy array (preferred) or plain sequence of numbers

    A vector with norm zero, including the empty vector, is returned
    unscaled. Input that is not a numpy array triggers a warning and is
    converted, so the result is always a numpy array. If `vec` is None,
    a message is printed and an empty numpy array is returned.
    """
    if vec is None:
        print("Please provide a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")
        # Convert here so the zero-norm path below also returns an ndarray
        # instead of handing the caller's list straight back.
        vec = np.asarray(vec)

    norm = np.linalg.norm(vec)
    if norm != 0:
        vec = vec / norm
    return vec
126 | |
127 | |
def main():
    """
    Run some basic tests

    Prints the results of normalising and comparing a few sample vectors,
    then reads a dictionary (from the file given as the single command
    line argument, or the default file otherwise) and prints the word
    count vector of ../Singular/table.h.
    """

    # normalise_vector returns a new array, so the result must be kept;
    # otherwise the unscaled input would be printed as "normalised".
    testvector = normalise_vector(np.array([3, 4]))
    print("normalised vector: " + str(testvector))

    vector1 = normalise_vector(np.array([3, 4]))
    vector2 = normalise_vector(np.array([4, 3]))
    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    print("distance different vector: " + str(vector_distance(vector1, vector2)))
    print(vector1)
    print(vector2)
    print()

    print("Attempt to normalise the zero vector")
    print(normalise_vector(np.array([0, 0, 0, 0, 0])))
    print()

    print("Attempt to normalise list")
    print(normalise_vector([3, 4, 0, 0, 0]))
    print()

    print("Attempt to normalise empty vector")
    print(normalise_vector(np.array([])))
    print()

    print("Attempt to normalise None")
    print(normalise_vector(None))
    print()

    if len(sys.argv) == 2:
        dic = read_dictionary(filename=sys.argv[1])
    else:
        dic = read_dictionary()
    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", dic))


if __name__ == '__main__':
    main()
