Changeset 2a4516e in git
- Timestamp:
- Jul 25, 2019, 4:01:14 PM (4 years ago)
- Branches:
- spielwiese (828514cf6e480e4bafc26df99217bf2a1ed1ef45)
- Children:
- dd499fe3f7367342e4f0cf3495cdab72a21c2280
- Parents:
- 1a034b64ea0141af657c8bd605ec28fcfe0d1d8b
- Location:
- machine_learning
- Files:
-
- 3 edited
Legend:
- Unmodified
- Added
- Removed
-
machine_learning/common/keyword_vector.py
r1a034b r2a4516e 1 1 #!/usr/bin/python3 2 3 """Some vector logic""" 2 4 3 5 import math … … 9 11 10 12 def read_dictionary(filename): 13 """ 14 Read a dictionary saved as a textfile 15 """ 11 16 if not os.path.isfile(filename): 12 17 print("Please provide a valid input file as argument") … … 25 30 26 31 def count_occurances(filename, dictionary): 32 """ 33 Create a vector from a dictionary and populate the counts according to 34 a specified file 35 """ 27 36 if not os.path.isfile(filename): 28 37 print("Please provide a valid input file as argument") … … 32 41 return {} 33 42 vector = create_vector(dictionary) 34 with open(filename, "r+") as file: 43 with open(filename, "r+") as file: 35 44 line = file.readline() 36 45 … … 38 47 # DONE: replace all non-alphanumeric characters with space 39 48 # words = line.split() 40 words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() 49 words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string 41 50 for word in words: 42 51 if word in vector.keys(): … … 49 58 50 59 def copy_dictionary(dictionary): 60 """ 61 Return an identical copy of a dictionary 62 """ 51 63 new_dic = [] 52 64 for word in dictionary: … … 56 68 57 69 def copy_vector(vector): 70 """ 71 Return an identical copy of a vector 72 """ 58 73 new_vector = {} 59 74 for key in vector.keys(): … … 65 80 66 81 def create_vector(dictionary): 67 vector={} 82 """ 83 Create a zero vector for a given dictionary 84 """ 85 vector = {} 68 86 for word in dictionary: 69 87 vector[word] = 0 … … 72 90 73 91 def vector_distance(vec1, vec2): 92 """ 93 Calculate the Euclidean distance between two vectors. 
94 """ 74 95 if not set(vec1.keys()) == set(vec2.keys()): 75 96 print("Dictionaries don't have the same keys") … … 82 103 83 104 dist = 0 84 for key in nvec1 .keys():105 for key in nvec1: 85 106 dist = dist + (nvec1[key] - nvec2[key]) ** 2 86 107 87 108 dist = math.sqrt(dist) 88 109 89 110 return dist 90 111 91 112 92 113 def normalise_vector(vec): 93 sum = 0 114 """ 115 Take a given vector and normalise each entry to get a Euclidean 116 distance of 1 between the zero vector and the vector itself. 117 """ 118 sum_vals = 0 94 119 for key in vec.keys(): 95 sum = sum+ (vec[key] * vec[key])96 97 sum = math.sqrt(sum)120 sum_vals = sum_vals + (vec[key] * vec[key]) 121 122 sum_vals = math.sqrt(sum_vals) 98 123 99 124 for key in vec.keys(): 100 vec[key] = (vec[key] + 0.0) / sum 125 vec[key] = (vec[key] + 0.0) / sum_vals 101 126 102 127 … … 107 132 sys.exit(1) 108 133 109 dic = read_dictionary(sys.argv[1]) 134 dic = read_dictionary(sys.argv[1]) # pylint: disable=invalid-name 110 135 111 vector = {"hello":3, "bye":4}112 normalise_vector( vector)113 print("normalised vector: " + str( vector))136 testvector = {"hello":3, "bye":4} # pylint: disable=invalid-name 137 normalise_vector(testvector) 138 print("normalised vector: " + str(testvector)) 114 139 115 vector1 = {"hello":3, "bye":4} 116 vector2 = {"hello":4, "bye":3} 140 vector1 = {"hello":3, "bye":4} # pylint: disable=invalid-name 141 vector2 = {"hello":4, "bye":3} # pylint: disable=invalid-name 117 142 print("distance same vector: " + str(vector_distance(vector1, vector1))) 118 143 print("distance different vector: " + str(vector_distance(vector1, vector2))) … … 120 145 print(vector2) 121 146 122 print(count_occurances(" Singular/table.h", dic))147 print(count_occurances("../Singular/table.h", dic)) -
machine_learning/predictor.py
r1a034b r2a4516e 2 2 Define the predictor class for classifying according to help page. 3 3 """ 4 4 5 # Third party imports 6 import numpy as np 5 7 from sklearn.base import BaseEstimator, ClassifierMixin 6 8 7 9 # Local imports 8 10 from common.keyword_vector import vector_distance 11 9 12 10 13 class HelpPagePredictor(BaseEstimator, ClassifierMixin): … … 23 26 Setup the correspondence of vectors to help-files 24 27 """ 28 assert X is not None, "Please provide data for X" 29 assert y is not None, "Please provide data for y" 25 30 self.vectors = X 26 31 self.files = y … … 31 36 Classify the input vectors 32 37 """ 38 assert X is not None, "Please provide data for X" 33 39 ret_list = [] 34 40 for x in X: # pylint: disable=invalid-name … … 44 50 45 51 # find corresponding filename 46 index = self.vectors.index(min_vec)52 index = list(self.vectors).index(min_vec) 47 53 file = self.files[index] 48 54 ret_list.append(file) 49 return ret_list 55 return np.array(ret_list) 56 57 58 if __name__ == '__main__': 59 print("Running some tests") 60 predictor = HelpPagePredictor() # pylint: disable=invalid-name 61 vector1 = {"hello":1, "bye":4, "pizza": 10} # pylint: disable=invalid-name 62 vector2 = {"hello":2, "bye":3, "pizza": 1} # pylint: disable=invalid-name 63 vector3 = {"hello":3, "bye":9, "pizza": 3} # pylint: disable=invalid-name 64 65 vectors = np.array([vector1, vector2, vector3]) # pylint: disable=invalid-name 66 files = np.array(["file1", "file2", "file3"]) # pylint: disable=invalid-name 67 print(vectors) 68 print(files) 69 70 testvec = {"hello":1, "bye":1, "pizza": 1} # pylint: disable=invalid-name 71 72 print("distance to 1") 73 print(vector_distance(testvec, vector1)) 74 print() 75 print("distance to 2") 76 print(vector_distance(testvec, vector2)) 77 print() 78 print("distance to 3") 79 print(vector_distance(testvec, vector3)) 80 print() 81 82 predictor.fit(vectors, files) 83 prediction = predictor.predict(np.array([testvec])) # pylint: disable=invalid-name 84 
print(prediction) -
machine_learning/requirements.txt
r1a034b r2a4516e 1 1 numpy==1.16.1 2 2 #pandas==0.24.0 3 pylint==2.2.2 3 pylint==2.3.0 4 4 scikit-learn==0.20.2 5 5 scipy==1.2.0
Note: See TracChangeset
for help on using the changeset viewer.