Changeset 91d4ae in git
- Timestamp:
- Jul 29, 2019, 2:45:44 PM (4 years ago)
- Branches:
- spielwiese (828514cf6e480e4bafc26df99217bf2a1ed1ef45)
- Children:
- eb2904af1f3fb2d653378807ed3b18780cfd4dbc
- Parents:
- d93ae5668f2da84265ee686cd190828895b49cba
- Location:
- machine_learning
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
machine_learning/common/keyword_vector.py
rd93ae5 r91d4ae 7 7 import re 8 8 import sys 9 import numpy as np 9 10 10 11 ### Read from file ######################################################## … … 40 41 print("Please provide a valid dictionary as argument") 41 42 return {} 42 vector = create_vector(dictionary) 43 if dictionary is None: 44 print("Please provide a valid dictionary as argument") 45 return {} 46 vector = create_vector_dictionary(dictionary) 43 47 with open(filename, "r+") as file: 44 48 line = file.readline() 45 49 46 50 while not line == "": 47 # DONE: replace all non-alphanumeric characters with space48 # words = line.split()49 51 words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string 50 52 for word in words: … … 52 54 vector[word] = vector[word] + 1 53 55 line = file.readline() 56 vector = np.array(list(vector.values())) 54 57 if normalise: 55 normalise_vector(vector)56 return vector 58 vector = normalise_vector(vector) 59 return vector 57 60 58 61 … … 73 76 Return an identical copy of a vector 74 77 """ 75 new_vector = {} 76 for key in vector.keys(): 77 new_vector[key] = vector[key] 78 return new_vector 78 return np.copy(np.array(vector)) 79 79 80 80 81 81 ### Vector specific logic ################################################# 82 82 83 def create_vector (dictionary):83 def create_vector_dictionary(dictionary): 84 84 """ 85 85 Create a zero vector for a given dictionary … … 95 95 Calculate the Euclidean distance between two vectors. 96 96 """ 97 if not set(vec1.keys()) == set(vec2.keys()):98 print(" Dictionaries don't have the same keys")97 if not len(vec1) == len(vec2): 98 print("Vectors don't have the same sizes") 99 99 return -1 100 100 101 102 dist = 0 103 for key in vec1: 104 dist = dist + (vec1[key] - vec2[key]) ** 2 105 106 dist = math.sqrt(dist) 101 diff_vec = vec1 - vec2 102 dist = np.linalg.norm(diff_vec) 107 103 108 104 return dist … … 114 110 distance of 1 between the zero vector and the vector itself. 
115 111 """ 116 sum_vals = 0 117 for key in vec.keys(): 118 sum_vals = sum_vals + (vec[key] * vec[key]) 119 120 sum_vals = math.sqrt(sum_vals) 121 122 for key in vec.keys(): 123 vec[key] = (vec[key] + 0.0) / sum_vals 112 vec = vec / np.linalg.norm(vec) 113 return vec 124 114 125 115 … … 133 123 sys.exit(1) 134 124 135 dic = read_dictionary(sys.argv[1]) 136 137 testvector = {"hello":3, "bye":4} 125 testvector = np.array([3,4]) 138 126 normalise_vector(testvector) 139 127 print("normalised vector: " + str(testvector)) 140 128 141 vector1 = {"hello":3, "bye":4}142 normalise_vector(vector1)143 vector2 = {"hello":4, "bye":3}129 vector1 = np.array([3,4]) 130 vector1 = normalise_vector(vector1) 131 vector2 = np.array([4,3]) 144 132 normalise_vector(vector2) 133 vector2 = normalise_vector(vector2) 145 134 print("distance same vector: " + str(vector_distance(vector1, vector1))) 146 135 print("distance different vector: " + str(vector_distance(vector1, vector2))) … … 148 137 print(vector2) 149 138 139 dic = read_dictionary(sys.argv[1]) 150 140 print(count_occurances("../Singular/table.h", dic)) 151 141 -
machine_learning/common/lookuptable.py
rd93ae5 r91d4ae 76 76 vectors = np.load(VECTORS_NPY) 77 77 file_list = np.load(HELPFILE_NPY) 78 for vector in vectors: 79 normalise_vector(vector) 78 79 # normalise the vectors 80 vectors = vectors / np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis] 80 81 81 82 return (vectors, file_list) -
machine_learning/model/predictor.py
rd93ae5 r91d4ae 12 12 # Local imports 13 13 from common.keyword_vector import vector_distance, count_occurances, \ 14 read_dictionary14 read_dictionary, normalise_vector 15 15 from common.lookuptable import create_table 16 16 from common.constants import KEYWORDS_FILE … … 48 48 for x in X: # pylint: disable=invalid-name 49 49 # find the closest vector 50 51 50 min_val = float("inf") 52 min_vec = None 51 index = -1 52 i = 0 53 53 for vec in self.vectors: 54 54 dist = vector_distance(x, vec) 55 55 if dist < min_val: 56 56 min_val = dist 57 min_vec = vec 57 index = i 58 i = i + 1 58 59 59 60 # find corresponding filename 60 index = list(self.vectors).index(min_vec)61 61 file = self.files[index] 62 62 ret_list.append(file) … … 70 70 print("Running some tests") 71 71 predictor = HelpPagePredictor() 72 vector1 = {"hello":1, "bye":4, "pizza": 10}73 vector2 = {"hello":2, "bye":3, "pizza": 1}74 vector3 = {"hello":3, "bye":9, "pizza": 3}72 vector1 = normalise_vector([1, 4, 10]) 73 vector2 = normalise_vector([2, 3, 1]) 74 vector3 = normalise_vector([3, 9, 3]) 75 75 76 76 vectors = np.array([vector1, vector2, vector3]) … … 78 78 print(vectors) 79 79 print(files) 80 print() 80 81 81 testvec = {"hello":1, "bye":1, "pizza": 1} 82 testvec = normalise_vector([1, 1, 1]) 83 print("test vector:") 84 print(testvec) 85 print() 82 86 83 87 print("distance to 1") … … 93 97 predictor.fit(vectors, files) 94 98 prediction = predictor.predict(np.array([testvec])) 99 print("Prediction:") 95 100 print(prediction) 101 print() 96 102 97 103 dictionary = read_dictionary(KEYWORDS_FILE) 104 98 105 start = time.time() 99 106 vectors, file_list = create_table(dictionary=dictionary) 100 107 end = time.time() 101 108 print(end - start, "seconds to create_table") 109 102 110 test_vec = count_occurances("extract.lib", dictionary) 103 111 predictor.fit(vectors, file_list) 112 104 113 start = time.time() 105 114 prediction = predictor.predict(np.array([test_vec])) 106 115 end = time.time() 116 print(end - 
start, "seconds to make prediction") 107 117 print(prediction) 108 print(end - start, "seconds to make prediction")109 118 110 119 if __name__ == '__main__':
Note: See TracChangeset for help on using the changeset viewer.