[db561a] | 1 | #!/usr/bin/python3 |
---|
| 2 | |
---|
| 3 | import math |
---|
| 4 | import os |
---|
| 5 | import re |
---|
| 6 | import sys |
---|
| 7 | |
---|
| 8 | ### Read from file ######################################################## |
---|
| 9 | |
---|
def read_dictionary(filename):
    """Load a word list from *filename*, one entry per line.

    Every line is stripped of surrounding whitespace and kept in file
    order; blank lines therefore become empty-string entries.  When
    *filename* is not an existing regular file a message is printed and
    an empty list is returned.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            return [raw.strip() for raw in handle]

    print("Please provide a valid input file as argument")
    return []
---|
| 24 | |
---|
| 25 | |
---|
def count_occurances(filename, dictionary):
    """Count how often each dictionary word occurs in a text file.

    Every character other than an ASCII letter, digit, ``-`` or ``_`` is
    treated as a separator; the remaining tokens are compared exactly
    (case-sensitively) against the words in *dictionary*.

    Args:
        filename: Path of the text file to scan.
        dictionary: Words to count.

    Returns:
        A dict mapping every dictionary word to its number of occurrences
        (0 for words that never appear).  An empty dict is returned, after
        printing a message, when *filename* is not an existing file or
        *dictionary* is empty.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return {}
    if not dictionary:
        print("Please provide a valid dictionary as argument")
        return {}

    # Start every known word at zero so words that never occur are still
    # present in the result.
    vector = dict.fromkeys(dictionary, 0)

    # Raw string: "\-" and "\_" are invalid escape sequences in a plain
    # string literal.  Compiled once instead of per line.
    separators = re.compile(r'[^0-9a-zA-Z\-\_]')

    # Opened read-only: the file is never written, and "r+" would fail on
    # files the user may read but not modify.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(' ', line).split():
                if word in vector:
                    vector[word] += 1
    return vector
---|
| 46 | |
---|
| 47 | |
---|
| 48 | ### Copying ############################################################### |
---|
| 49 | |
---|
def copy_dictionary(dictionary):
    """Return a new list with the entries of *dictionary*, in order.

    The copy is shallow: the entries themselves are shared with the input.
    """
    return list(dictionary)
---|
| 55 | |
---|
| 56 | |
---|
def copy_vector(vector):
    """Return a new dict with the same key/value pairs as *vector*.

    The copy is shallow: values are shared with the input.
    """
    return {name: vector[name] for name in vector.keys()}
---|
| 62 | |
---|
| 63 | |
---|
| 64 | ### Vector specific logic ################################################# |
---|
| 65 | |
---|
def create_vector(dictionary):
    """Build a count vector for *dictionary*: every word mapped to 0.

    Duplicate words collapse into a single key.
    """
    return dict.fromkeys(dictionary, 0)
---|
| 71 | |
---|
| 72 | |
---|
def vector_distance(vec1, vec2):
    """Return the Euclidean distance between two vectors after normalising.

    Both vectors are copied and the copies normalised before the distance
    is computed.  If the two dicts do not have exactly the same set of
    keys, a message is printed and -1 is returned.
    """
    if set(vec1.keys()) != set(vec2.keys()):
        print("Dictionaries don't have the same keys")
        return -1

    # Work on copies; normalise_vector modifies its argument in place.
    left = copy_vector(vec1)
    right = copy_vector(vec2)
    normalise_vector(left)
    normalise_vector(right)

    # Accumulate the squared component differences key by key.
    squared = 0
    for key in left.keys():
        delta = left[key] - right[key]
        squared += delta ** 2

    return math.sqrt(squared)
---|
| 90 | |
---|
| 91 | |
---|
def normalise_vector(vec):
    """Scale *vec* in place to unit Euclidean length.

    Args:
        vec: Dict mapping keys to numbers.  Its values are overwritten
            with floats.

    Returns:
        None; the dict is modified in place.  A vector whose norm is zero
        (empty, or every component zero) cannot be scaled, so its
        components are set to 0.0 rather than raising ZeroDivisionError.
    """
    # Named "norm" rather than "sum" so the builtin is not shadowed.
    norm = math.sqrt(sum(value * value for value in vec.values()))

    if norm == 0:
        # Zero vector: keep it a zero vector, but with float components
        # like every other normalised vector.
        for key in vec:
            vec[key] = 0.0
        return

    # Only existing keys are reassigned, so iterating the dict is safe.
    for key in vec:
        vec[key] = vec[key] / norm
---|
| 101 | |
---|
| 102 | |
---|
if __name__ == '__main__':
    # Manual demo driver.  Requires exactly one command-line argument:
    # the path of the dictionary file.
    if len(sys.argv) != 2:
        print("Usage: ")
        print(sys.argv[0] + " <dict_name>")
        sys.exit(1)

    dic = read_dictionary(sys.argv[1])

    # Normalise a small sample vector in place and show the result.
    vector = {"hello": 3, "bye": 4}
    normalise_vector(vector)
    print("normalised vector: " + str(vector))

    # Distance of a vector to itself, then to a different vector.
    vector1 = {"hello": 3, "bye": 4}
    vector2 = {"hello": 4, "bye": 3}
    same_distance = vector_distance(vector1, vector1)
    print("distance same vector: " + str(same_distance))
    other_distance = vector_distance(vector1, vector2)
    print("distance different vector: " + str(other_distance))

    # Print the inputs again after the distance calls.
    for sample in (vector1, vector2):
        print(sample)

    # Count dictionary words in a fixed sample file.
    print(count_occurances("Singular/table.h", dic))
---|