1 | #!/usr/bin/python3 |
---|
2 | |
---|
3 | import math |
---|
4 | import os |
---|
5 | import re |
---|
6 | import sys |
---|
7 | |
---|
8 | ### Read from file ######################################################## |
---|
9 | |
---|
def read_dictionary(filename):
    """Load a word list from ``filename``, one entry per line.

    Each line is stripped of surrounding whitespace before it is stored,
    so a blank line yields an empty-string entry. If ``filename`` is not
    an existing regular file, a message is printed and an empty list is
    returned.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            return [entry.strip() for entry in handle]

    print("Please provide a valid input file as argument")
    return []
---|
24 | |
---|
25 | |
---|
def count_occurances(filename, dictionary):
    """Count how often each dictionary word occurs in a text file.

    Every character other than an ASCII letter, digit, ``-`` or ``_`` is
    treated as a separator; the remaining tokens are compared
    case-sensitively against the dictionary words.

    Args:
        filename: Path of the text file to scan.
        dictionary: Collection of words to count.

    Returns:
        A dict mapping every dictionary word to its number of occurrences
        (0 for words that never appear). An empty dict is returned, after
        printing a message, when ``filename`` is not an existing file or
        ``dictionary`` is empty.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return {}
    if not dictionary:
        print("Please provide a valid dictionary as argument")
        return {}

    # Raw string: "\-" and "\_" are invalid escapes in a normal literal.
    # Compiled once here instead of being re-evaluated for every line.
    separators = re.compile(r"[^0-9a-zA-Z_-]")

    # Every dictionary word starts with a count of zero.
    vector = dict.fromkeys(dictionary, 0)

    # Plain read mode: the file is never written, and "r+" made the scan
    # fail on files the user may read but not modify.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1
    return vector
---|
46 | |
---|
47 | |
---|
48 | ### Copying ############################################################### |
---|
49 | |
---|
def copy_dictionary(dictionary):
    """Return a new list holding the same words in the same order."""
    return list(dictionary)
---|
55 | |
---|
56 | |
---|
def copy_vector(vector):
    """Return a shallow copy of ``vector`` as a new dict."""
    return {word: vector[word] for word in vector.keys()}
---|
62 | |
---|
63 | |
---|
64 | ### Vector specific logic ################################################# |
---|
65 | |
---|
def create_vector(dictionary):
    """Build a count vector: every word in ``dictionary`` mapped to 0."""
    return dict.fromkeys(dictionary, 0)
---|
71 | |
---|
72 | |
---|
def vector_distance(vec1, vec2):
    """Euclidean distance between the normalised forms of two vectors.

    Both inputs are copied before normalisation, so the dictionaries
    passed in are left untouched. If the two vectors do not have exactly
    the same keys, a message is printed and -1 is returned.
    """
    if set(vec1.keys()) != set(vec2.keys()):
        print("Dictionaries don't have the same keys")
        return -1

    # Work on copies: normalise_vector rewrites its argument in place.
    left = copy_vector(vec1)
    right = copy_vector(vec2)
    normalise_vector(left)
    normalise_vector(right)

    # Accumulate the squared component differences, then take the root.
    squared = 0
    for key in left.keys():
        delta = left[key] - right[key]
        squared = squared + delta ** 2
    return math.sqrt(squared)
---|
90 | |
---|
91 | |
---|
def normalise_vector(vec):
    """Scale ``vec`` in place so that its Euclidean length becomes 1.

    Args:
        vec: Dict mapping keys to numeric components. It is modified in
            place; every value is replaced by a float.

    Returns:
        None.

    A vector whose length is zero (all components 0) has no direction and
    cannot be scaled; its components are set to 0.0 instead of raising
    ZeroDivisionError. An empty dict is left unchanged.
    """
    norm = math.sqrt(sum(value * value for value in vec.values()))

    if norm == 0:
        # Only values are reassigned, so iterating the dict here is safe.
        for key in vec:
            vec[key] = 0.0
        return

    for key in vec:
        vec[key] = vec[key] / norm
---|
101 | |
---|
102 | |
---|
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the dictionary file.
    if len(sys.argv) != 2:
        print("Usage: ")
        print(sys.argv[0] + " <dict_name>")
        sys.exit(1)

    dic = read_dictionary(sys.argv[1])

    # Demo 1: normalise a small vector in place and show the result.
    sample = {"hello": 3, "bye": 4}
    normalise_vector(sample)
    print("normalised vector: " + str(sample))

    # Demo 2: distance of a vector to itself and to a different vector.
    first = {"hello": 3, "bye": 4}
    second = {"hello": 4, "bye": 3}
    same_distance = vector_distance(first, first)
    print("distance same vector: " + str(same_distance))
    other_distance = vector_distance(first, second)
    print("distance different vector: " + str(other_distance))
    # Print the inputs again after the distance calls.
    print(first)
    print(second)

    # Demo 3: count dictionary words in a fixed, hard-coded input file.
    occurrences = count_occurances("Singular/table.h", dic)
    print(occurrences)
---|