source: git/machine_learning/ml_python/common/lookuptable.py @ cd552e

spielwiese
Last change on this file since cd552e was cd552e, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Create dir in home for cache files for ml
  • Property mode set to 100644
File size: 4.0 KB
Line 
1"""
2
3A Module for fetching helpfiles, creating vectors for each and bundling
4these up in a lookup table.
5"""
6
7# System imports
8import os
9
10# Third party imports
11import tarfile
12import numpy as np
13from six.moves import urllib
14
15# local imports
16from common.keyword_vector import get_vectors, read_dictionary
17from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
18                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
19                        VECTORS_NPY, HOME_DIR
20
21
def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download a tar archive from a url and unpack it into a directory.

    Parameters:
        tbz2_url: address the archive is downloaded from.
        data_path: directory that receives both the downloaded archive and
            its extracted contents; it is created if it does not exist.
        file_name: name under which the archive is stored inside data_path.

    Errors raised by the download or by tarfile propagate to the caller.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(data_path, exist_ok=True)
    tbz2_path = os.path.join(data_path, file_name)

    # Drop temporary files left behind by earlier urlretrieve() calls.
    urllib.request.urlcleanup()
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # The context manager closes the archive even when extraction fails.
    # NOTE(review): extractall() trusts the member paths stored in the
    # archive, so this should only be pointed at a trusted tbz2_url.
    with tarfile.open(tbz2_path) as tbz2_file:
        tbz2_file.extractall(path=data_path)
36
37
def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    List the entries of a directory whose names end in "htm".

    Returns the matching names (not full paths) as a sorted list.
    """
    return sorted(name for name in os.listdir(path) if name.endswith("htm"))
45
46
def extract_keywords():
    """
    Run the Singular extraction script, store its sorted, de-duplicated
    output in the keywords file, and return the dictionary that
    read_dictionary() loads afterwards.
    """
    # The home directory has to be present before Singular is run.
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    # Shell pipeline: Singular output -> sort -> uniq -> keywords file.
    command = "".join([SINGULAR_BIN, " -q ", EXTRACT_SCRIPT,
                       " | sort | uniq > ", KEYWORDS_FILE])
    os.system(command)

    # Hand back whatever read_dictionary() reads with its default argument.
    return read_dictionary()
64
65
def create_table(dictionary=None, attempt_cached=True):
    """
    Build the lookup table of help-file vectors, or load it from cache.

    Parameters:
        dictionary: keyword dictionary handed to get_vectors(); when None
            it is read from KEYWORDS_FILE.
        attempt_cached: when True and both cache files (VECTORS_NPY and
            HELPFILE_NPY) exist, the table is loaded from them; otherwise
            it is recomputed and the cache files are rewritten.

    Returns:
        A tuple (vectors, file_list). Each vector along the last axis is
        scaled to unit Euclidean length; an all-zero vector is left as
        zeros. file_list is the array of help file names.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)

    cache_usable = (attempt_cached
                    and os.path.isfile(VECTORS_NPY)
                    and os.path.isfile(HELPFILE_NPY))

    if cache_usable:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)
    else:
        os.makedirs(HOME_DIR, exist_ok=True)
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        filenames = np.array([os.path.join(HELP_FILE_PATH, "html", name)
                              for name in file_list])
        # The raw (un-normalised) vectors are what gets cached on disk.
        vectors = get_vectors(filenames, dictionary, normalise=False)
        np.save(VECTORS_NPY, vectors)

    # Normalise the vectors. A vector with no keyword hits has norm 0;
    # dividing by it would give NaN entries, so such norms are replaced
    # by 1, which leaves the zero vector unchanged.
    norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
    vectors = vectors / np.where(norms == 0, 1.0, norms)

    return (vectors, file_list)
97
def init_table_on_system():
    """
    Make sure every file the lookup table depends on is present, creating
    whichever ones are missing.
    """
    if not os.path.isdir(HOME_DIR):
        os.makedirs(HOME_DIR)

    # Download the help files unless the directory and the archive both exist.
    archive = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    if not (os.path.isdir(HELP_FILE_PATH) and os.path.isfile(archive)):
        fetch_tbz2_data()

    # A missing keywords file is regenerated with Singular; otherwise None
    # is passed on and create_table() reads the existing file itself.
    dictionary = None if os.path.isfile(KEYWORDS_FILE) else extract_keywords()

    have_cache = os.path.isfile(VECTORS_NPY) and os.path.isfile(HELPFILE_NPY)
    if not have_cache:
        create_table(dictionary=dictionary, attempt_cached=False)
119
def is_lookup_initialised():
    """
    Report whether all files needed by the lookup table are in place.

    Returns True only when the help file directory and its archive, the
    keywords file, the home directory and both cache files (VECTORS_NPY
    and HELPFILE_NPY) exist; otherwise False.
    """
    archive = os.path.join(HELP_FILE_PATH, "helpfiles.tbz2")
    return (os.path.isdir(HELP_FILE_PATH)
            and os.path.isfile(archive)
            and os.path.isfile(KEYWORDS_FILE)
            and os.path.isdir(HOME_DIR)
            and os.path.isfile(VECTORS_NPY)
            and os.path.isfile(HELPFILE_NPY))
Note: See TracBrowser for help on using the repository browser.