Changeset 2a4516e in git for machine_learning/common/keyword_vector.py
- Timestamp:
- Jul 25, 2019, 4:01:14 PM (5 years ago)
- Branches:
- spielwiese (fe61d9c35bf7c61f2b6cbf1b56e25e2f08d536cc)
- Children:
- dd499fe3f7367342e4f0cf3495cdab72a21c2280
- Parents:
- 1a034b64ea0141af657c8bd605ec28fcfe0d1d8b
- File:
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
machine_learning/common/keyword_vector.py
r1a034b r2a4516e 1 1 #!/usr/bin/python3 2 3 """Some vector logic""" 2 4 3 5 import math … … 9 11 10 12 def read_dictionary(filename): 13 """ 14 Read a dictionary saved as a textfile 15 """ 11 16 if not os.path.isfile(filename): 12 17 print("Please provide a valid input file as argument") … … 25 30 26 31 def count_occurances(filename, dictionary): 32 """ 33 Create a vector from a dictionary and populate the counts according to 34 a specified file 35 """ 27 36 if not os.path.isfile(filename): 28 37 print("Please provide a valid input file as argument") … … 32 41 return {} 33 42 vector = create_vector(dictionary) 34 with open(filename, "r+") as file: 43 with open(filename, "r+") as file: 35 44 line = file.readline() 36 45 … … 38 47 # DONE: replace all non-alphanumeric characters with space 39 48 # words = line.split() 40 words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() 49 words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string 41 50 for word in words: 42 51 if word in vector.keys(): … … 49 58 50 59 def copy_dictionary(dictionary): 60 """ 61 Return an identical copy of a dictionary 62 """ 51 63 new_dic = [] 52 64 for word in dictionary: … … 56 68 57 69 def copy_vector(vector): 70 """ 71 Return an identical copy of a vector 72 """ 58 73 new_vector = {} 59 74 for key in vector.keys(): … … 65 80 66 81 def create_vector(dictionary): 67 vector={} 82 """ 83 Create a zero vector for a given dictionary 84 """ 85 vector = {} 68 86 for word in dictionary: 69 87 vector[word] = 0 … … 72 90 73 91 def vector_distance(vec1, vec2): 92 """ 93 Calculate the Euclidean distance between two vectors. 
94 """ 74 95 if not set(vec1.keys()) == set(vec2.keys()): 75 96 print("Dictionaries don't have the same keys") … … 82 103 83 104 dist = 0 84 for key in nvec1 .keys():105 for key in nvec1: 85 106 dist = dist + (nvec1[key] - nvec2[key]) ** 2 86 107 87 108 dist = math.sqrt(dist) 88 109 89 110 return dist 90 111 91 112 92 113 def normalise_vector(vec): 93 sum = 0 114 """ 115 Take a given vector and normalise each entry to get a Euclidean 116 distance of 1 between the zero vector and the vector itself. 117 """ 118 sum_vals = 0 94 119 for key in vec.keys(): 95 sum = sum+ (vec[key] * vec[key])96 97 sum = math.sqrt(sum)120 sum_vals = sum_vals + (vec[key] * vec[key]) 121 122 sum_vals = math.sqrt(sum_vals) 98 123 99 124 for key in vec.keys(): 100 vec[key] = (vec[key] + 0.0) / sum 125 vec[key] = (vec[key] + 0.0) / sum_vals 101 126 102 127 … … 107 132 sys.exit(1) 108 133 109 dic = read_dictionary(sys.argv[1]) 134 dic = read_dictionary(sys.argv[1]) # pylint: disable=invalid-name 110 135 111 vector = {"hello":3, "bye":4}112 normalise_vector( vector)113 print("normalised vector: " + str( vector))136 testvector = {"hello":3, "bye":4} # pylint: disable=invalid-name 137 normalise_vector(testvector) 138 print("normalised vector: " + str(testvector)) 114 139 115 vector1 = {"hello":3, "bye":4} 116 vector2 = {"hello":4, "bye":3} 140 vector1 = {"hello":3, "bye":4} # pylint: disable=invalid-name 141 vector2 = {"hello":4, "bye":3} # pylint: disable=invalid-name 117 142 print("distance same vector: " + str(vector_distance(vector1, vector1))) 118 143 print("distance different vector: " + str(vector_distance(vector1, vector2))) … … 120 145 print(vector2) 121 146 122 print(count_occurances(" Singular/table.h", dic))147 print(count_occurances("../Singular/table.h", dic))
Note: See TracChangeset for help on using the changeset viewer.