1 | #!/usr/bin/python3 |
---|
2 | |
---|
3 | """Helpers for reading keyword dictionaries and for building, copying, normalising and comparing keyword-count vectors.""" |
---|
4 | |
---|
5 | import os |
---|
6 | import re |
---|
7 | import sys |
---|
8 | import numpy as np |
---|
9 | |
---|
10 | from common.constants import KEYWORDS_FILE |
---|
11 | |
---|
12 | ### Read from file ######################################################## |
---|
13 | |
---|
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Load a keyword dictionary from a text file, one entry per line.

    Every line is stripped of surrounding whitespace and stored in file
    order; blank lines are kept as empty strings.  If ``filename`` is not
    an existing file, a message is printed and an empty array is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return np.array([])

    with open(filename, "r") as handle:
        entries = [raw_line.strip() for raw_line in handle]
    return np.array(entries)
---|
31 | |
---|
32 | |
---|
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each dictionary word occurs in a text file.

    Each line of ``filename`` is split into words consisting of ASCII
    letters, digits, ``-`` and ``_``; every other character acts as a
    separator.  A word is counted only if it matches a dictionary entry
    exactly.

    Arguments:
        filename:   path of the text file to scan.
        dictionary: the words to count (numpy array, list or tuple).
        normalise:  if true, the counts are passed through
                    normalise_vector before being returned.

    Returns a numpy array of counts, ordered like the mapping produced by
    create_vector_dictionary.  If the file does not exist, or the
    dictionary is None or empty, a message is printed and an empty list
    is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return []
    # Test for None before touching the value, and measure the size via
    # np.asarray so that plain lists and tuples are accepted as well.
    if dictionary is None or np.asarray(dictionary).size == 0:
        print("Please provide a valid dictionary as argument")
        return []

    vector = create_vector_dictionary(dictionary)
    separators = re.compile(r"[^0-9a-zA-Z_-]")
    # The file is only read, so open it read-only; "r+" would needlessly
    # require write permission.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1

    counts = np.array(list(vector.values()))
    if normalise:
        counts = normalise_vector(counts)
    return counts
---|
61 | |
---|
62 | |
---|
63 | ### Copying ############################################################### |
---|
64 | |
---|
def copy_dictionary(dictionary):
    """
    Return a new list holding the entries of ``dictionary`` in order.

    The copy is shallow: the list is new, the entries are the same objects.
    """
    return list(dictionary)
---|
73 | |
---|
74 | |
---|
def copy_vector(vector):
    """
    Return an independent numpy array with the same contents as ``vector``.

    ``vector`` may be any array-like; the result never shares its data
    buffer with the input.
    """
    source = np.asarray(vector)
    return np.array(source, copy=True)
---|
80 | |
---|
81 | |
---|
82 | ### Vector specific logic ################################################# |
---|
83 | |
---|
def create_vector_dictionary(dictionary):
    """
    Map every entry of ``dictionary`` to a count of zero.

    The result is a dict keyed by the entries in their original order
    (duplicates collapse into one key).  A None or empty ``dictionary``
    trips an assertion.
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return dict.fromkeys(dictionary, 0)
---|
94 | |
---|
95 | |
---|
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Both arguments may be numpy arrays or plain sequences (lists, tuples);
    they are converted with np.asarray before being subtracted.

    Returns the distance, or -1 (after printing a message) when the two
    vectors differ in length.
    """
    if len(vec1) != len(vec2):
        print("Vectors don't have the same sizes")
        return -1

    # Plain Python sequences do not support element-wise subtraction, so
    # coerce both operands to arrays first.
    difference = np.asarray(vec1) - np.asarray(vec2)
    return np.linalg.norm(difference)
---|
107 | |
---|
108 | |
---|
def normalise_vector(vec):
    """
    Scale a vector so that its Euclidean norm becomes 1.

    A vector whose norm is 0 is handed back unchanged.  For None a message
    is printed and an empty array is returned; for anything that is not a
    numpy array a warning is printed before the computation continues.
    """
    if vec is None:
        print("Please provide a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")

    length = np.linalg.norm(vec)
    # Dividing by a zero norm is skipped; the input is returned as-is.
    return vec if length == 0 else vec / length
---|
126 | |
---|
127 | |
---|
def main():
    """
    Run some basic tests

    Prints the results of normalising and comparing a few sample vectors,
    then reads a dictionary (from the file named by the single
    command-line argument if given, otherwise from the default file) and
    prints the count vector of ../Singular/table.h.
    """

    # normalise_vector returns a new array instead of modifying its
    # argument, so the result has to be kept to print the normalised value.
    testvector = normalise_vector(np.array([3, 4]))
    print("normalised vector: " + str(testvector))

    vector1 = normalise_vector(np.array([3, 4]))
    vector2 = normalise_vector(np.array([4, 3]))
    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    print("distance different vector: " + str(vector_distance(vector1, vector2)))
    print(vector1)
    print(vector2)
    print()

    print("Attempt to normalise the zero vector")
    print(normalise_vector(np.array([0, 0, 0, 0, 0])))
    print()

    print("Attempt to normalise list")
    print(normalise_vector([3, 4, 0, 0, 0]))
    print()

    print("Attempt to normalise empty vector")
    print(normalise_vector(np.array([])))
    print()

    print("Attempt to normalise None")
    print(normalise_vector(None))
    print()

    if len(sys.argv) == 2:
        dic = read_dictionary(filename=sys.argv[1])
    else:
        dic = read_dictionary()
    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", dic))
---|
170 | |
---|
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
---|