source: git/machine_learning/common/lookuptable.py @ fece13

spielwiese
Last change on this file since fece13 was fece13, checked in by Murray Heymann <heymann.murray@…>, 5 years ago: "Make python2 compatible"
1"""
2j
3A Module for fetching helpfiles, creating vectors for each and bundling
4these up in a lookup table.
5"""
6
7# System imports
8import os
9
10# Third party imports
11import tarfile
12import numpy as np
13from six.moves import urllib
14
15# local imports
16from common.keyword_vector import count_occurances, read_dictionary
17from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
18                        EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
19                        VECTORS_NPY


def fetch_tbz2_data(tbz2_url=HELP_FILE_URL, data_path=HELP_FILE_PATH,
                    file_name="helpfiles.tbz2"):
    """
    Download data from a given url and extract it to the path provided.
    """
    if not os.path.isdir(data_path):
        os.makedirs(data_path)
    tbz2_path = os.path.join(data_path, file_name)
    # discard any cached downloads before fetching a fresh copy
    urllib.request.urlcleanup()
    urllib.request.urlretrieve(tbz2_url, tbz2_path)

    # Note: extractall trusts the member paths in the archive, so this
    # should only be used on archives from a trusted source.
    tbz2_file = tarfile.open(tbz2_path)
    tbz2_file.extractall(path=data_path)
    tbz2_file.close()
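
# For example (a sketch using the module defaults):
#     fetch_tbz2_data()
# downloads helpfiles.tbz2 into HELP_FILE_PATH and unpacks it there, which
# should create the HELP_FILE_PATH/html directory used below.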


def get_list_of_htm_files(path=os.path.join(HELP_FILE_PATH, "html")):
    """
    Return a sorted list of htm files in the given path.
    """
    files = [f for f in os.listdir(path) if f.endswith("htm")]
    files.sort()
    return files


def extract_keywords():
    """
    Run the Singular script to extract the current keywords and save them
    to the file 'keywords.txt'.
    """
    # extract keywords using the singular script
    os.system(SINGULAR_BIN + " " + EXTRACT_SCRIPT)

    # read from the file created by singular
    dictionary = read_dictionary()
    print(dictionary)

    # sort alphabetically and remove duplicates
    dictionary = np.sort(np.unique(dictionary))
    print(dictionary)

    # write back to the same file, one keyword per line
    # (avoid naming the handle "file", which shadows a python2 builtin)
    with open(KEYWORDS_FILE, "w") as kw_file:
        for word in dictionary:
            kw_file.write(word + "\n")
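
# After a run, KEYWORDS_FILE should hold one keyword per line, sorted and
# de-duplicated, ready to be re-read with read_dictionary().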


def create_table(dictionary=None, attempt_cached=True):
    """
    Get a list of help files and generate a word-occurrence vector for each.
    """
    if dictionary is None:
        dictionary = read_dictionary(KEYWORDS_FILE)
    vectors = []

    # recompute the vectors unless caching is requested and both cache
    # files already exist
    if not os.path.isfile(VECTORS_NPY) or \
            not os.path.isfile(HELPFILE_NPY) or \
            not attempt_cached:
        file_list = np.array(get_list_of_htm_files())
        np.save(HELPFILE_NPY, file_list)

        for file_name in file_list:
            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
                                                   file_name),
                                      dictionary,
                                      normalise=False)
            vectors.append(vector)
        vectors = np.array(vectors)
        np.save(VECTORS_NPY, vectors)
    else:
        vectors = np.load(VECTORS_NPY)
        file_list = np.load(HELPFILE_NPY)

    # normalise each row to unit Euclidean length, so that dot products
    # between rows give cosine similarities
    vectors = vectors / np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]

    return (vectors, file_list)
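

# Minimal usage sketch: fetch the help files if they are not yet present,
# refresh the keyword list via Singular, then build the normalised lookup
# table.  This assumes SINGULAR_BIN points at a working Singular binary
# and that HELP_FILE_URL is reachable.
if __name__ == "__main__":
    if not os.path.isdir(os.path.join(HELP_FILE_PATH, "html")):
        fetch_tbz2_data()
    extract_keywords()
    vecs, files = create_table()
    print("indexed %d help files, vector dimension %d" % vecs.shape)
    print("first help file: %s" % files[0])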