source: git/machine_learning/common/lookuptable.py @ 55a5abb

spielwiese
Last change on this file since 55a5abb was 55a5abb, checked in by Murray Heymann <heymann.murray@…>, 4 years ago
Optimize by normalising vectors less often
  • Property mode set to 100644
File size: 2.7 KB
Line 
"""
A module for fetching helpfiles, creating vectors for each and bundling
these up in a lookup table.
"""
6
# System imports
import os
import subprocess
import tarfile

# Third party imports
import numpy as np
from six.moves import urllib

# local imports
from common.keyword_vector import count_occurances, read_dictionary, \
        normalise_vector
from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
                        VECTORS_NPY
21
22
def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download a .tbz2 archive from ``tbz2_url`` and extract its contents
    into ``data_path``.

    The directory ``data_path`` is created if it does not yet exist; the
    downloaded archive is kept on disk next to the extracted files.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # Context manager guarantees the archive handle is closed even if
    # extraction raises (the previous open/close pair leaked on error).
    # NOTE(review): extractall() trusts the archive contents; a malicious
    # tarball with "../" members could write outside data_path.
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)
36
37
def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return the names of all htm files in ``path``, in sorted order.
    """
    return sorted(name for name in os.listdir(path)
                  if name.endswith("htm"))
45
46
def extract_keywords():
    """
    Run the Singular script EXTRACT_SCRIPT to extract the current
    keywords; the script saves them to the file 'keywords.txt'.

    Raises:
        subprocess.CalledProcessError: if Singular exits with a non-zero
            status (the previous ``os.system`` call silently ignored
            failures, leaving a stale or missing keywords file).
    """
    # Argument list (no shell) handles paths containing spaces and
    # avoids shell-injection via the configured paths.
    subprocess.run([SINGULAR_BIN, EXTRACT_SCRIPT], check=True)
53
54
def create_table(dictionary=None):
    """
    Get a list of helpfiles and generate a word occurrence vector for
    each, caching both arrays on disk (VECTORS_NPY / HELPFILE_NPY).

    Args:
        dictionary: keyword dictionary to count against.  Defaults to
            ``read_dictionary(KEYWORDS_FILE)``, now evaluated at call
            time — the old default ran at import time, reading the
            keywords file as a side effect of importing this module.

    Returns:
        tuple ``(vectors, file_list)``: the normalised occurrence
        vectors and the corresponding helpfile names, as numpy arrays.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)

    if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY):
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        # Count without normalising here: every vector — freshly
        # computed or loaded from cache — is normalised in one pass
        # below, so cached vectors stay raw counts on disk.
        vectors = np.array([count_occurances(os.path.join(HELP_FILE_PATH,
                                                          "html", fname),
                                             dictionary,
                                             normalise=False)
                            for fname in file_list])
        np.save(VECTORS_NPY, vectors)
    else:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)

    # normalise_vector mutates each row in place.
    for vector in vectors:
        normalise_vector(vector)

    return (vectors, file_list)
80
81
def main():
    """
    Exercise the module's functions as a simple smoke test.
    """
    fetch_tbz2_data()
    for helpfile in get_list_of_htm_files():
        print(helpfile)
    extract_keywords()
    vectors, files = create_table()
    dictionary = read_dictionary(KEYWORDS_FILE)
    # Recompute the vector for one helpfile and compare it against the
    # cached table entry.
    test_vec = count_occurances(
        os.path.join(HELP_FILE_PATH, "html", files[1]),
        dictionary)
    print(test_vec == vectors[1])
95
96
# Allow running this module directly as a smoke test.
if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.