source: git/machine_learning/common/lookuptable.py @ 872089

spielwiese
Last change on this file since 872089 was 872089, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Reorganize source tree, make test prediction
  • Property mode set to 100644
File size: 2.6 KB
Line 
"""
A module for fetching helpfiles, creating vectors for each, and bundling
these up in a lookup table.
"""
6
# System imports
import os
import subprocess
import tarfile

# Third party imports
import numpy as np
from six.moves import urllib

# local imports
from common.keyword_vector import count_occurances, read_dictionary
from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
                        VECTORS_NPY
20
21
def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download a .tbz2 archive from a given url and extract it to the path
    provided.

    Args:
        tbz2_url: URL of the compressed helpfile archive.
        data_path: Directory the archive is saved to and extracted into;
            created (with parents) if it does not exist.
        file_name: Local file name for the downloaded archive.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # Context manager guarantees the archive handle is closed even if
    # extraction raises (the original leaked the handle on error).
    # NOTE(review): extractall trusts the archive's member paths; if the
    # URL is ever untrusted, pass a filter (Python 3.12+) or vet members.
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)
35
36
def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return a sorted list of the htm files found in the given path.
    """
    return sorted(entry for entry in os.listdir(path)
                  if entry.endswith("htm"))
44
45
def extract_keywords():
    """
    Run the Singular extraction script to extract current keywords and
    save them as the file 'keywords.txt'.
    """
    # An argument list avoids the shell entirely, so paths containing
    # spaces or shell metacharacters work; os.system with a concatenated
    # string would break on (or mis-execute) such paths.
    # check=False mirrors os.system, which ignored the exit status.
    subprocess.run([SINGULAR_BIN, EXTRACT_SCRIPT], check=False)
52
53
def create_table(dictionary=None):
    """
    Get a list of helpfiles and generate a word occurrence vector for each.

    Results are cached: if both VECTORS_NPY and HELPFILE_NPY exist they
    are loaded instead of being recomputed.

    Args:
        dictionary: Keyword dictionary used for counting.  Defaults to
            reading KEYWORDS_FILE lazily, only when vectors actually need
            to be (re)computed.

    Returns:
        Tuple ``(vectors, file_list)`` of numpy arrays.
    """
    if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY):
        # Lazy default: the original signature called
        # read_dictionary(KEYWORDS_FILE) as a default argument, which runs
        # file I/O once at import time and caches a possibly stale
        # dictionary for every subsequent call.
        if dictionary is None:
            dictionary = read_dictionary(KEYWORDS_FILE)

        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        vectors = []
        for file in file_list:
            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                                   file), dictionary)
            vectors.append(vector)
        vectors = np.array(vectors)
        np.save(VECTORS_NPY, vectors)
    else:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)

    return (vectors, file_list)
75
76
def main():
    """
    Smoke-test the module: download the helpfiles, list them, extract the
    keywords, build the lookup table, and check that the stored vector
    for one helpfile matches a freshly computed one.
    """
    fetch_tbz2_data()
    for htm_name in get_list_of_htm_files():
        print(htm_name)
    extract_keywords()
    vectors, files = create_table()
    dictionary = read_dictionary(KEYWORDS_FILE)
    sample_path = os.path.join(HELP_FILE_PATH, "html", files[1])
    fresh_vec = count_occurances(sample_path, dictionary)
    print(fresh_vec == vectors[1])
90
91
# Allow running this module directly as a smoke test.
if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.