Changeset 112c79 in git
- Timestamp: Aug 2, 2019, 7:35:10 PM (4 years ago)
- Branches: spielwiese (828514cf6e480e4bafc26df99217bf2a1ed1ef45)
- Children: 58603fd7faac3d049f61ca0b65d89e4be0a664f9
- Parents: fece1392f8e9ff07b64d0ed4e5ec57bfa6dbf258
- git-author: Murray Heymann <heymann.murray@gmail.com> 2019-08-02 19:35:10+02:00
- git-committer: Murray Heymann <heymann.murray@gmail.com> 2019-08-02 19:35:13+02:00
- Location: machine_learning
- Files: 6 edited
Legend: lines prefixed with ' ' are unmodified, '+' added, '-' removed.
machine_learning/common/keyword_vector.py
--- rfece13
+++ r112c79
@@ -7,4 +7,5 @@
 import sys
 import numpy as np
+from sklearn.feature_extraction.text import CountVectorizer
 
 from common.constants import KEYWORDS_FILE
@@ -19,5 +20,5 @@
         print("Please provide a valid input file as argument to read "
               "dictionary")
-        if sys.version_info[0] == 3:
+        if sys.version_info[0] == 3:  # pylint: disable=no-else-raise
             raise FileNotFoundError
         else:
@@ -35,4 +36,47 @@
     return np.array(dictionary)
 
+def get_vectors(filenames, dictionary, normalise=True):
+    """
+    Create vectors from a dictionary and populate the counts according to
+    specified files
+    """
+    assert filenames is not None, \
+        "Please provide a valid list of files as argument"
+    assert not filenames.size == 0, \
+        "Please provide a valid list of files as argument"
+    for filename in filenames:
+        if not os.path.isfile(filename):
+            print("Please provide a valid input file as argument")
+            if sys.version_info[0] == 3:  # pylint: disable=no-else-raise
+                raise FileNotFoundError
+            else:
+                print(filename)
+                raise IOError
+    assert dictionary is not None, \
+        "Please provide a valid dictionary as argument"
+    assert not dictionary.size == 0, \
+        "Please provide a valid dictionary as argument"
+
+    doc_strings = []
+    for filename in filenames:
+        doc_string = ""
+        with open(filename, "r+") as file:
+            line = file.readline()
+
+            while not line == "":
+                doc_string = doc_string + " " + line
+                line = file.readline()
+
+        doc_string = re.sub('[^0-9a-zA-Z\-\_]', ' ', doc_string)  # pylint: disable=anomalous-backslash-in-string
+        doc_strings.append(doc_string)
+    doc_strings = np.array(doc_strings)
+
+    vectorizer = CountVectorizer(vocabulary=dictionary)
+    vectors = vectorizer.fit_transform(doc_strings)
+    vectors = vectors.toarray()
+    if normalise:
+        vectors = vectors / np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
+    return vectors
+
 
 def count_occurances(filename, dictionary, normalise=True):
@@ -41,29 +85,8 @@
     a specified file
     """
-    if not os.path.isfile(filename):
-        print("Please provide a valid input file as argument")
-        if sys.version_info[0] == 3:
-            raise FileNotFoundError
-        else:
-            raise IOError
-    assert dictionary is not None, \
-        "Please provide a valid dictionary as argument"
-    assert not dictionary.size == 0, \
-        "Please provide a valid dictionary as argument"
-
-    vector = create_vector_dictionary(dictionary)
-    with open(filename, "r+") as file:
-        line = file.readline()
-
-        while not line == "":
-            words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split()  # pylint: disable=anomalous-backslash-in-string
-            for word in words:
-                if word in vector.keys():
-                    vector[word] = vector[word] + 1
-            line = file.readline()
-    vector = np.array(list(vector.values()))
-    if normalise:
-        vector = normalise_vector(vector)
-    return vector
+    res = get_vectors(np.array([filename]),
+                      dictionary,
+                      normalise=normalise)
+    return res[0]
 
 
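The heart of this change: get_vectors hands all term counting to scikit-learn's CountVectorizer with a fixed vocabulary, vectorising the whole corpus in one pass instead of running a Python counting loop per file. A minimal sketch of that pattern follows; the toy vocabulary and documents are illustrative, not from the changeset.

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vocabulary = ["groebner", "ideal", "ring"]   # stands in for the keyword dictionary
docs = ["ring ideal ideal",
        "groebner basis of an ideal"]

# With a fixed vocabulary, fit_transform learns nothing: it just counts
# the given terms, so every document maps to a vector of len(vocabulary).
vectorizer = CountVectorizer(vocabulary=vocabulary)
vectors = vectorizer.fit_transform(docs).toarray()
print(vectors)                               # [[0 2 1]
                                             #  [1 1 0]]

# Row-wise L2 normalisation, as in get_vectors(..., normalise=True):
norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
print(vectors / norms)                       # each row now has unit length

One caveat visible in the formula: a document containing no dictionary words yields an all-zero row, and the division then produces NaNs; create_table below passes normalise=False, which sidesteps this.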
machine_learning/common/lookuptable.py
--- rfece13
+++ r112c79
@@ -14,5 +14,5 @@
 
 # local imports
-from common.keyword_vector import count_occurances, read_dictionary
+from common.keyword_vector import get_vectors, read_dictionary
 from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                              EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
@@ -82,11 +82,10 @@
         np.save(HELPFILE_NPY, file_list)
 
+        filenames = []
         for file in file_list:
-            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
-                                                   file),
-                                      dictionary,
-                                      normalise=False)
-            vectors.append(vector)
-        vectors = np.array(vectors)
+            filename = os.path.join(HELP_FILE_PATH, "html", file)
+            filenames.append(filename)
+        filenames = np.array(filenames)
+        vectors = get_vectors(filenames, dictionary, normalise=False)
         np.save(VECTORS_NPY, vectors)
     else:
@@ -98,4 +97,2 @@
 
     return (vectors, file_list)
-
-
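create_table now collects every help-page path up front and vectorises them all with a single get_vectors call. A hedged sketch of that flow, using temporary files so it can run outside the Singular tree; it assumes the machine_learning directory is on sys.path so that common.keyword_vector imports.

import os
import tempfile
import numpy as np
from common.keyword_vector import get_vectors

dictionary = np.array(["ring", "ideal"])     # toy keyword dictionary
tmpdir = tempfile.mkdtemp()
file_list = ["a.html", "b.html"]             # stand-ins for help pages
for name, text in zip(file_list, ["ring ring", "ideal"]):
    with open(os.path.join(tmpdir, name), "w") as f:
        f.write(text)

# Batch the paths exactly as the new create_table code does, then make
# one vectorisation pass over all of them:
filenames = np.array([os.path.join(tmpdir, name) for name in file_list])
vectors = get_vectors(filenames, dictionary, normalise=False)
print(vectors)                               # [[2 0]
                                             #  [0 1]]

The unit-normalisation step is skipped here (normalise=False), matching how the lookup table itself is stored.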
machine_learning/model/predictor.py
--- rfece13
+++ r112c79
@@ -4,17 +4,8 @@
 
 # import cProfile
-import os
-import sys
-import time
 
 # Third party imports
 import numpy as np
 from sklearn.base import BaseEstimator, ClassifierMixin
-
-# Local imports
-from common.keyword_vector import vector_distance, count_occurances, \
-    read_dictionary, normalise_vector
-from common.lookuptable import create_table
-from common.constants import KEYWORDS_FILE
 
 
machine_learning/predictor_runner.py
--- rfece13
+++ r112c79
@@ -5,9 +5,18 @@
 import os
 import sys
+import time
 import numpy as np
-from model.predictor import *
+
+from model.predictor import HelpPagePredictor
+from common.keyword_vector import read_dictionary, count_occurances
+from common.lookuptable import create_table
+from common.constants import KEYWORDS_FILE
 
 def find_prediction(filename):
+    """
+    Given a file name as string, get the predicted help page name
+    """
     dictionary = read_dictionary(KEYWORDS_FILE)
+
     start = time.time()
     vectors, file_list = create_table(dictionary=dictionary)
@@ -15,4 +24,11 @@
     print(end - start, "seconds to create_table")
 
+    return _find_prediction(filename, dictionary, vectors, file_list)
+
+
+def _find_prediction(filename, dictionary, vectors, file_list):
+    """
+    Train a predictor, get the predicted help page name
+    """
     predictor = HelpPagePredictor()
     predictor.fit(vectors, file_list)
@@ -24,5 +40,5 @@
     print(end - start, "seconds to make prediction")
     return prediction
 
 
 def main():
@@ -42,12 +58,4 @@
     predictor.fit(vectors, file_list)
 
-    start = time.time()
-    test_vec = count_occurances("extract.lib", dictionary)
-    prediction = predictor.predict(np.array([test_vec]))
-    end = time.time()
-    print(end - start, "seconds to make prediction")
-    print(prediction)
-    print()
-
     print("prediction for zero vector")
     start = time.time()
@@ -59,4 +67,12 @@
     print()
 
+    prediction = _find_prediction("extract.lib",
+                                  dictionary,
+                                  vectors,
+                                  file_list)
+    print(prediction)
+    print()
+
+
     if len(sys.argv) >= 2:
         for i in range(len(sys.argv)):
@@ -65,10 +81,10 @@
             if not os.path.isfile(sys.argv[i]):
                 continue
+
             print("predicting for file", sys.argv[i])
-            start = time.time()
-            test_vec = count_occurances(sys.argv[i], dictionary)
-            prediction = predictor.predict(np.array([test_vec]))
-            end = time.time()
-            print(end - start, "seconds to make prediction")
+            prediction = _find_prediction(sys.argv[i],
+                                          dictionary,
+                                          vectors,
+                                          file_list)
             print(prediction)
             print()
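After the refactor, find_prediction stays the public entry point: it builds the lookup table once and delegates fitting and predicting to the new _find_prediction helper that main() reuses per command-line argument. A quick interactive check, run from the machine_learning directory so the imports resolve, might look like:

from predictor_runner import find_prediction

# "extract.lib" is the same sample file main() feeds to
# _find_prediction; any text file works as input here.
print(find_prediction("extract.lib"))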
machine_learning/tests/common/test_lookuptable.py
--- rfece13
+++ r112c79
@@ -2,5 +2,7 @@
 import unittest
 import numpy as np
+import cProfile
 
 from common.lookuptable import *
+from common.keyword_vector import count_occurances
 from common.constants import KEYWORDS_FILE
@@ -32,3 +34,4 @@
 
 if __name__ == '__main__':
+    #cProfile.run("unittest.main()")
     unittest.main()
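The commented-out line added to the __main__ block is a profiling hook for the whole test run. Enabling it is a one-line change; the sort argument in this sketch is an addition of mine, not part of the changeset:

import cProfile
import unittest

if __name__ == '__main__':
    # cProfile.run executes the statement in the __main__ namespace,
    # so the module-level imports above are all it needs.
    cProfile.run("unittest.main()", sort="cumulative")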
machine_learning/tests/model/test_predictor.py
--- rfece13
+++ r112c79
@@ -5,4 +5,6 @@
 from model.predictor import *
 from common.constants import KEYWORDS_FILE
+
+from common.keyword_vector import normalise_vector, vector_distance
 
 class TestPredictionMethods(unittest.TestCase):