Changeset 05dbb13 in git
- Timestamp:
- Jul 29, 2019, 5:43:56 PM (4 years ago)
- Branches:
- spielwiese (d1ba061a762c62d3a25159d8da8b6e17332291fa)
- Children:
- c1b9ababa50e8059dd847da0957f50aea901e6a8
- Parents:
- 9f0f63787f503527775273cff2d354bd558c9cda
- Location:
- machine_learning
- Files:
- 4 edited
Legend:
- Unmodified
- Added
- Removed
machine_learning/common/keyword_vector.py
r9f0f63 r05dbb13 3 3 """Some vector logic""" 4 4 5 import numpy as np 5 6 import os 6 7 import re 7 8 import sys 8 import numpy as np 9 import traceback 10 11 from common.constants import KEYWORDS_FILE 9 12 10 13 ### Read from file ######################################################## 11 14 12 def read_dictionary(filename ):15 def read_dictionary(filename=KEYWORDS_FILE): 13 16 """ 14 17 Read a dictionary saved as a textfile … … 26 29 dictionary.append(line.strip()) 27 30 line = file.readline() 28 return dictionary31 return np.array(dictionary) 29 32 30 33 … … 37 40 print("Please provide a valid input file as argument") 38 41 return [] 39 if dictionary == []:42 if dictionary.size == 0: 40 43 print("Please provide a valid dictionary as argument") 41 44 return [] … … 110 113 distance of 1 between the zero vector and the vector itself. 111 114 """ 112 vec = vec / np.linalg.norm(vec) 115 116 if vec is None: 117 print("Please provide a valid vector") 118 print("Returning empty vector by default") 119 return np.array([]) 120 121 if not isinstance(vec, np.ndarray): 122 print("Warning, vector should be a numpy array") 123 norm = np.linalg.norm(vec) 124 if not norm == 0: 125 vec = vec / norm 113 126 return vec 114 127 … … 118 131 Run some basic tests 119 132 """ 120 if len(sys.argv) != 2:121 print("Usage: ")122 print(sys.argv[0] + " <dict_name>")123 sys.exit(1)124 133 125 134 testvector = np.array([3, 4]) … … 136 145 print(vector1) 137 146 print(vector2) 147 print() 138 148 139 dic = read_dictionary(sys.argv[1]) 149 print("Attempt to normalise the zero vector") 150 print(normalise_vector(np.array([0,0,0,0,0]))) 151 print() 152 153 print("Attempt to normalise list") 154 print(normalise_vector([3,4,0,0,0])) 155 print() 156 157 print("Attempt to normalise empty vector") 158 print(normalise_vector(np.array([]))) 159 print() 160 161 print("Attempt to normalise None") 162 print(normalise_vector(None)) 163 print() 164 165 if len(sys.argv) == 2: 166 dic = 
read_dictionary(filename=sys.argv[1]) 167 else: 168 dic = read_dictionary() 169 print("vector of ../Singular/table.h") 140 170 print(count_occurances("../Singular/table.h", dic)) 141 171 -
machine_learning/common/lookuptable.py
r9f0f63 r05dbb13 49 49 'keywords.txt' 50 50 """ 51 # extract keywords using the singular script 51 52 os.system(SINGULAR_BIN + " " + EXTRACT_SCRIPT) 52 53 54 # read from the file created by singular 55 dictionary = read_dictionary() 56 print(dictionary) 53 57 54 def create_table(dictionary=None): 58 # sort alphabetically 59 dictionary = np.sort(dictionary) 60 print(dictionary) 61 62 # write back to the same file 63 with open(KEYWORDS_FILE, "w") as file: 64 for word in dictionary: 65 file.write(word + "\n") 66 67 68 69 def create_table(dictionary=None, attempt_cached=True): 55 70 """ 56 71 Get a list of helpfiles, and generate a word occurance vector for each. … … 60 75 vectors = [] 61 76 62 if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY): 77 if not os.path.isfile(VECTORS_NPY) or \ 78 not os.path.isfile(HELPFILE_NPY) or \ 79 not attempt_cached: 63 80 file_list = np.array(get_list_of_htm_files()) 64 81 np.save(HELPFILE_NPY, file_list) … … 90 107 print(file) 91 108 extract_keywords() 92 vectors, files = create_table() 109 vectors, files = create_table(attempt_cached=False) 110 vectors1, files1 = create_table() 111 112 if not (vectors == vectors1).all(): 113 print("Cached version differs from original version") 114 elif not (files == files1).all(): 115 print("Cached version differs from original version") 116 else: 117 print("Cached version corresponds with original") 118 93 119 dictionary = read_dictionary(KEYWORDS_FILE) 94 120 test_vec = count_occurances(os.path.join(HELP_FILE_PATH, "html", 95 121 files[1]), dictionary) 96 print((test_vec ==vectors[1]).all())122 print((test_vec==vectors[1]).all()) 97 123 98 124 -
machine_learning/extract.lib
r9f0f63 r05dbb13 5 5 int i; 6 6 7 // combine lists, leaving out the first entry of l, which is "l" 8 list combined = delete(l, 1) + k; 9 7 10 // create file, overwrite if exists 8 write(":w keywords.txt", l[1]);11 write(":w keywords.txt", combined[1]); 9 12 10 // write entries from "names" procedure11 for (i = 1; i < size(l); i++) {12 write(":a keywords.txt", l[i+1]);13 }14 13 15 // write entries from "reservedNameList" procedure16 for (i = 1; i < size(k) + 1; i++) {17 write(":a keywords.txt", l[i]);14 // write remaining entries to file 15 for (i = 2; i < size(combined) + 1; i++) { 16 write(":a keywords.txt", combined[i]); 18 17 } 19 18 -
machine_learning/model/predictor.py
r9f0f63 r05dbb13 117 117 print(prediction) 118 118 119 120 test_vec = count_occurances("test.txt", dictionary) 121 start = time.time() 122 prediction = predictor.predict(np.array([test_vec])) 123 end = time.time() 124 print(end - start, "seconds to make prediction") 125 print(prediction) 126 119 127 if __name__ == '__main__': 120 128 cProfile.run("main()")
Note: See TracChangeset
for help on using the changeset viewer.