source: git/machine_learning/common/lookuptable.py @ d93ae5

Last change on this file since d93ae5 was d93ae5, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Fix pylint errors
1"""
2j
3A Module for fetching helpfiles, creating vectors for each and bundling
4these up in a lookup table.
5"""

# System imports
import os
import tarfile

# Third party imports
import numpy as np
from six.moves import urllib

# Local imports
from common.keyword_vector import count_occurances, read_dictionary, \
        normalise_vector
from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
                        VECTORS_NPY


def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download data from a given URL and extract it to the given path.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)

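# A minimal usage sketch; the URL and path below are hypothetical, shown
# only for illustration (the real defaults come from common.constants):
#
#     fetch_tbz2_data()
#     fetch_tbz2_data(tbz2_url="http://example.org/helpfiles.tbz2",
#                     data_path="/tmp/helpfiles")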

def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return a sorted list of .htm files in the given path.
    """
    files = [f for f in os.listdir(path) if f.endswith(".htm")]
    files.sort()
    return files


def extract_keywords():
    """
    Run a Singular script to extract the current keywords and save them
    as the file 'keywords.txt'.
    """
    # Shell out to the Singular binary; assumes SINGULAR_BIN and
    # EXTRACT_SCRIPT from common.constants are valid paths.
    os.system(SINGULAR_BIN + " " + EXTRACT_SCRIPT)

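# A sketch of the same call via subprocess, which avoids the shell and
# raises on failure (an alternative shown for illustration, not the
# module's current behaviour):
#
#     import subprocess
#     subprocess.run([SINGULAR_BIN, EXTRACT_SCRIPT], check=True)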

def create_table(dictionary=None):
    """
    Get a list of help files and generate a word occurrence vector for
    each.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)
    vectors = []

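    # Build the table only if the cached .npy files are missing;
    # otherwise reuse the saved arrays.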
    if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY):
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        for file in file_list:
            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                                   file),
                                      dictionary,
                                      normalise=False)
            vectors.append(vector)
        vectors = np.array(vectors)
        np.save(VECTORS_NPY, vectors)
    else:
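        # Cache hit: load the previously saved table instead of
        # recomputing it.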
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)
    # Normalise in place, whether the vectors were freshly computed or
    # loaded from the cache.
    for vector in vectors:
        normalise_vector(vector)

    return (vectors, file_list)

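# A minimal usage sketch (assumes the help files have already been
# fetched and keywords.txt exists):
#
#     vectors, files = create_table()
#     print(files[0], vectors[0])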

def main():
    """
    Run some tests to check if the functions work.
    """
    fetch_tbz2_data()
    for file in get_list_of_htm_files():
        print(file)
    extract_keywords()
    vectors, files = create_table()
    dictionary = read_dictionary(KEYWORDS_FILE)
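    # Recompute the vector for one help file and compare it against the
    # cached table entry; if these are numpy arrays, this prints an
    # elementwise boolean comparison rather than a single True/False.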
    test_vec = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                             files[1]), dictionary)
    print(test_vec == vectors[1])


if __name__ == '__main__':
    main()