1 | #!/usr/bin/python3 |
---|
2 | |
---|
3 | """Some vector logic""" |
---|
4 | |
---|
5 | import math |
---|
6 | import os |
---|
7 | import re |
---|
8 | import sys |
---|
9 | |
---|
10 | ### Read from file ######################################################## |
---|
11 | |
---|
def read_dictionary(filename):
    """
    Load a word list from a plain-text file, one entry per line.

    Every line is stripped of surrounding whitespace before it is stored,
    so a blank line in the file becomes an empty-string entry. If
    `filename` does not name an existing file, a message is printed and an
    empty list is returned.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            return [entry.strip() for entry in handle]

    print("Please provide a valid input file as argument")
    return []
---|
29 | |
---|
30 | |
---|
def count_occurances(filename, dictionary):
    """
    Count how often each dictionary word occurs in a text file.

    The file is split into tokens made of ASCII letters, digits, '-' and
    '_'; every other character acts as a separator. Matching is exact and
    case-sensitive.

    Returns a dict mapping every word of `dictionary` to its number of
    occurrences (0 for words that never appear). If `filename` is not an
    existing file, or `dictionary` is empty, a message is printed and an
    empty dict is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return {}
    if not dictionary:
        print("Please provide a valid dictionary as argument")
        return {}

    # Zero-initialised count for every dictionary word.
    vector = dict.fromkeys(dictionary, 0)

    # A token is a maximal run of allowed characters; raw string so the
    # pattern contains no invalid string escapes.
    token_pattern = re.compile(r"[0-9a-zA-Z_-]+")

    # Open read-only: the file is never written, and requesting write
    # access would fail on read-only inputs.
    with open(filename, "r") as file:
        for line in file:
            for word in token_pattern.findall(line):
                if word in vector:
                    vector[word] += 1
    return vector
---|
55 | |
---|
56 | |
---|
57 | ### Copying ############################################################### |
---|
58 | |
---|
def copy_dictionary(dictionary):
    """
    Build a new list holding the same words, in the same order, as
    `dictionary`. The copy is shallow: the list is new, its entries are not.
    """
    return list(dictionary)
---|
67 | |
---|
68 | |
---|
def copy_vector(vector):
    """
    Build a new dict with the same key/value pairs as `vector`, so the
    copy can be modified without touching the original.
    """
    return {key: vector[key] for key in vector.keys()}
---|
77 | |
---|
78 | |
---|
79 | ### Vector specific logic ################################################# |
---|
80 | |
---|
def create_vector(dictionary):
    """
    Build the zero vector for `dictionary`: a dict that maps each word to
    the count 0. Repeated words collapse into a single key.
    """
    return dict.fromkeys(dictionary, 0)
---|
89 | |
---|
90 | |
---|
def vector_distance(vec1, vec2):
    """
    Compute the Euclidean distance between the normalised forms of two
    vectors.

    Both inputs are copied before normalisation, so neither argument is
    modified. When the two vectors do not share exactly the same keys, a
    message is printed and -1 is returned.
    """
    if set(vec1.keys()) != set(vec2.keys()):
        print("Dictionaries don't have the same keys")
        return -1

    # Normalise private copies so the caller's vectors stay untouched.
    unit1 = copy_vector(vec1)
    normalise_vector(unit1)
    unit2 = copy_vector(vec2)
    normalise_vector(unit2)

    squared = sum((unit1[key] - unit2[key]) ** 2 for key in unit1)
    return math.sqrt(squared)
---|
111 | |
---|
112 | |
---|
def normalise_vector(vec):
    """
    Scale a vector in place so that its Euclidean length becomes 1.

    Every entry of `vec` is divided by the vector's Euclidean norm; the
    dict is modified directly and nothing is returned. A vector whose norm
    is zero (all entries 0, or no entries at all) has no direction to
    preserve and is left unchanged.
    """
    norm = math.sqrt(sum(value * value for value in vec.values()))

    if norm == 0:
        # Dividing by a zero norm would raise ZeroDivisionError; an
        # all-zero vector is exactly what create_vector produces.
        return

    # Only existing keys are reassigned, so mutating during iteration is safe.
    for key in vec:
        vec[key] = vec[key] / norm
---|
126 | |
---|
127 | |
---|
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the dictionary file.
    if len(sys.argv) != 2:
        print("Usage: ", sys.argv[0] + " <dict_name>", sep="\n")
        sys.exit(1)

    dic = read_dictionary(sys.argv[1])  # pylint: disable=invalid-name

    # Demo: normalise_vector rewrites its argument in place.
    testvector = {"hello": 3, "bye": 4}  # pylint: disable=invalid-name
    normalise_vector(testvector)
    print("normalised vector:", str(testvector))

    # Demo: distance of a vector to itself and to a different vector.
    vector1 = {"hello": 3, "bye": 4}  # pylint: disable=invalid-name
    vector2 = {"hello": 4, "bye": 3}  # pylint: disable=invalid-name
    print("distance same vector:", str(vector_distance(vector1, vector1)))
    print("distance different vector:", str(vector_distance(vector1, vector2)))
    # vector_distance normalises copies, so the inputs print unchanged.
    print(vector1)
    print(vector2)

    # Count the dictionary words in a fixed sample file.
    print(count_occurances("../Singular/table.h", dic))
---|