1 | """ |
---|
2 | j |
---|
3 | A Module for fetching helpfiles, creating vectors for each and bundling |
---|
4 | these up in a lookup table. |
---|
5 | """ |
---|

# System imports
import os
import tarfile

# Third party imports
import numpy as np
from six.moves import urllib

# Local imports
from common.keyword_vector import count_occurances, read_dictionary, \
    normalise_vector
from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
    EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
    VECTORS_NPY
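
# NOTE (assumed from the usage below): count_occurances(path, dictionary,
# normalise=...) returns a keyword-occurrence vector for the file at path,
# and normalise_vector(vector) rescales a vector in place.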


def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download data from the given URL and extract it to the given path.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

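    # extractall() writes whatever member paths the archive contains; this
    # is acceptable here because the archive comes from the project's own
    # HELP_FILE_URL rather than an untrusted source.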
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)


def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return a sorted list of the .htm files in the given path.
    """
    files = [f for f in os.listdir(path) if f.endswith(".htm")]
    files.sort()
    return files


def extract_keywords():
    """
    Run the Singular script that extracts the current keywords and saves
    them to the file 'keywords.txt'.
    """
    os.system(SINGULAR_BIN + " " + EXTRACT_SCRIPT)


def create_table(dictionary=None):
56 | """ |
---|
57 | Get a list of helpfiles, and generate a word occurance vector for each. |
---|
58 | """ |
---|
59 | vectors = [] |
---|
60 | |
---|
    if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY):
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        for file in file_list:
            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                                   file),
                                      dictionary,
                                      normalise=False)
            vectors.append(vector)
        vectors = np.array(vectors)
        np.save(VECTORS_NPY, vectors)
    else:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)
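
    # The .npy cache stores raw counts; normalisation happens in place on
    # every call, so the file on disk stays count-valued.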
    for vector in vectors:
        normalise_vector(vector)

    return (vectors, file_list)
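

# Illustrative sketch only (not part of the original pipeline): one way to
# query the lookup table built by create_table(). Both the table vectors and
# the query vector are assumed to be normalised, so a plain dot product
# gives cosine similarity.
def closest_helpfile(query_vector, vectors, file_list):
    """
    Return the helpfile whose vector is most similar to query_vector.
    """
    scores = np.dot(vectors, query_vector)
    return file_list[int(np.argmax(scores))]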


def main():
    """
    Run some tests to check if the functions work.
    """
    fetch_tbz2_data()
    for file in get_list_of_htm_files():
        print(file)
    extract_keywords()
    vectors, files = create_table()
    dictionary = read_dictionary(KEYWORDS_FILE)
    test_vec = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                             files[1]), dictionary)
    # The vectors hold normalised floats, so compare with a tolerance rather
    # than exact element-wise equality.
    print(np.allclose(test_vec, vectors[1]))


if __name__ == '__main__':
    main()