Changeset 112c79 in git


Timestamp:
Aug 2, 2019, 7:35:10 PM (5 years ago)
Author:
Murray Heymann <heymann.murray@…>
Branches:
spielwiese (b324714bf5073469800caef737deba1366fbd81f)
Children:
58603fd7faac3d049f61ca0b65d89e4be0a664f9
Parents:
fece1392f8e9ff07b64d0ed4e5ec57bfa6dbf258
git-author:
Murray Heymann <heymann.murray@gmail.com> 2019-08-02 19:35:10+02:00
git-committer:
Murray Heymann <heymann.murray@gmail.com> 2019-08-02 19:35:13+02:00
Message:
Use sklearn to count keyword occurrences
Location:
machine_learning
Files:
6 edited

Legend:

  (no prefix)  unmodified
  +            added
  -            removed
  …            elided lines
  • machine_learning/common/keyword_vector.py

    rfece13 → r112c79

     import sys
     import numpy as np
    +from sklearn.feature_extraction.text import CountVectorizer

     from common.constants import KEYWORDS_FILE
    …
             print("Please provide a valid input file as argument to read "
                   "dictionary")
    -        if sys.version_info[0] == 3:
    +        if sys.version_info[0] == 3: # pylint: disable=no-else-raise
                 raise FileNotFoundError
             else:
    …
         return np.array(dictionary)

    +def get_vectors(filenames, dictionary, normalise=True):
    +    """
    +    Create vectors from a dictionary and populate the counts according to
    +    specified files
    +    """
    +    assert filenames is not None, \
    +            "Please provide a valid list of files as argument"
    +    assert not filenames.size == 0, \
    +            "Please provide a valid list of files as argument"
    +    for filename in filenames:
    +        if not os.path.isfile(filename):
    +            print("Please provide a valid input file as argument")
    +            if sys.version_info[0] == 3: # pylint: disable=no-else-raise
    +                raise FileNotFoundError
    +            else:
    +                print(filename)
    +                raise IOError
    +    assert dictionary is not None, \
    +            "Please provide a valid dictionary as argument"
    +    assert not dictionary.size == 0, \
    +            "Please provide a valid dictionary as argument"
    +
    +    doc_strings = []
    +    for filename in filenames:
    +        doc_string = ""
    +        with open(filename, "r+") as file:
    +            line = file.readline()
    +
    +            while not line == "":
    +                doc_string = doc_string + " " + line
    +                line = file.readline()
    +
    +        doc_string = re.sub('[^0-9a-zA-Z\-\_]', ' ', doc_string) # pylint: disable=anomalous-backslash-in-string
    +        doc_strings.append(doc_string)
    +    doc_strings = np.array(doc_strings)
    +
    +    vectorizer = CountVectorizer(vocabulary=dictionary)
    +    vectors = vectorizer.fit_transform(doc_strings)
    +    vectors = vectors.toarray()
    +    if normalise:
    +        vectors = vectors / np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
    +    return vectors
    +

     def count_occurances(filename, dictionary, normalise=True):
    …
         a specified file
         """
    -    if not os.path.isfile(filename):
    -        print("Please provide a valid input file as argument")
    -        if sys.version_info[0] == 3:
    -            raise FileNotFoundError
    -        else:
    -            raise IOError
    -    assert dictionary is not None, \
    -            "Please provide a valid dictionary as argument"
    -    assert not dictionary.size == 0, \
    -            "Please provide a valid dictionary as argument"
    -
    -    vector = create_vector_dictionary(dictionary)
    -    with open(filename, "r+") as file:
    -        line = file.readline()
    -
    -        while not line == "":
    -            words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string
    -            for word in words:
    -                if word in vector.keys():
    -                    vector[word] = vector[word] + 1
    -            line = file.readline()
    -    vector = np.array(list(vector.values()))
    -    if normalise:
    -        vector = normalise_vector(vector)
    -    return vector
    +    res = get_vectors(np.array([filename]),
    +                      dictionary,
    +                      normalise=normalise)
    +    return res[0]
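    The heart of the change: get_vectors builds one CountVectorizer over a fixed vocabulary, so every document maps to a count vector of the same length, replacing the hand-rolled per-line counting loop. A minimal sketch of that pattern, with a made-up vocabulary and documents (not data from this repository):

        import numpy as np
        from sklearn.feature_extraction.text import CountVectorizer

        # Fixed vocabulary: only these words are counted, in exactly this order.
        vocabulary = np.array(["ring", "ideal", "poly"])
        docs = np.array(["ring poly poly", "ideal ideal ring"])

        vectorizer = CountVectorizer(vocabulary=vocabulary)
        counts = vectorizer.fit_transform(docs).toarray()
        # counts == [[1, 0, 2],
        #            [1, 2, 0]]

        # Row-wise L2 normalisation, matching the normalise=True branch above.
        norms = np.sqrt((counts ** 2).sum(-1))[..., np.newaxis]
        unit_vectors = counts / norms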
  • machine_learning/common/lookuptable.py

    rfece13 → r112c79

     # local imports
    -from common.keyword_vector import count_occurances, read_dictionary
    +from common.keyword_vector import get_vectors, read_dictionary
     from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                             EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
    …
             np.save(HELPFILE_NPY, file_list)

    +        filenames = []
             for file in file_list:
    -            vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
    -                                                   file),
    -                                      dictionary,
    -                                      normalise=False)
    -            vectors.append(vector)
    -        vectors = np.array(vectors)
    +            filename = os.path.join(HELP_FILE_PATH, "html", file)
    +            filenames.append(filename)
    +        filenames = np.array(filenames)
    +        vectors = get_vectors(filenames, dictionary, normalise=False)
             np.save(VECTORS_NPY, vectors)
         else:
    …
         return (vectors, file_list)
    -
    -
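    The gain here is batching: instead of scanning and counting one help page per count_occurances call, all paths are collected first and handed to get_vectors in a single call, i.e. one CountVectorizer pass over the whole corpus. A sketch of the pattern; build_vectors is a hypothetical helper and "helpfiles" a stand-in for HELP_FILE_PATH:

        import os
        import numpy as np
        from common.keyword_vector import get_vectors

        def build_vectors(file_list, dictionary, help_file_path="helpfiles"):
            # Collect every path first ...
            filenames = np.array(
                [os.path.join(help_file_path, "html", name) for name in file_list])
            # ... then vectorise the whole batch in a single call.
            return get_vectors(filenames, dictionary, normalise=False)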
  • machine_learning/model/predictor.py

    rfece13 → r112c79

     # import cProfile
    -import os
    -import sys
    -import time

     # Third party imports
     import numpy as np
     from sklearn.base import BaseEstimator, ClassifierMixin
    -
    -# Local imports
    -from common.keyword_vector import vector_distance, count_occurances, \
    -        read_dictionary, normalise_vector
    -from common.lookuptable import create_table
    -from common.constants import KEYWORDS_FILE
  • machine_learning/predictor_runner.py

    rfece13 → r112c79

     import os
     import sys
    +import time
     import numpy as np
    -from model.predictor import *
    +
    +from model.predictor import HelpPagePredictor
    +from common.keyword_vector import read_dictionary, count_occurances
    +from common.lookuptable import create_table
    +from common.constants import KEYWORDS_FILE

     def find_prediction(filename):
    +    """
    +    Given a file name as string, get the predicted help page name
    +    """
         dictionary = read_dictionary(KEYWORDS_FILE)
    +
         start = time.time()
         vectors, file_list = create_table(dictionary=dictionary)
    …
         print(end - start, "seconds to create_table")

    +    return _find_prediction(filename, dictionary, vectors, file_list)
    +
    +
    +def _find_prediction(filename, dictionary, vectors, file_list):
    +    """
    +    Train a predictor, get the predicted help page name
    +    """
         predictor = HelpPagePredictor()
         predictor.fit(vectors, file_list)
    …
         print(end - start, "seconds to make prediction")
         return prediction
    -
    +

     def main():
    …
         predictor.fit(vectors, file_list)

    -    start = time.time()
    -    test_vec = count_occurances("extract.lib", dictionary)
    -    prediction = predictor.predict(np.array([test_vec]))
    -    end = time.time()
    -    print(end - start, "seconds to make prediction")
    -    print(prediction)
    -    print()
    -
         print("prediction for zero vector")
         start = time.time()
    …
         print()

    +    prediction = _find_prediction("extract.lib",
    +                                  dictionary,
    +                                  vectors,
    +                                  file_list)
    +    print(prediction)
    +    print()
    +
    +
         if len(sys.argv) >= 2:
             for i in range(len(sys.argv)):
    …
                 if not os.path.isfile(sys.argv[i]):
                     continue
    +
                 print("predicting for file", sys.argv[i])
    -            start = time.time()
    -            test_vec = count_occurances(sys.argv[i], dictionary)
    -            prediction = predictor.predict(np.array([test_vec]))
    -            end = time.time()
    -            print(end - start, "seconds to make prediction")
    +            prediction = _find_prediction(sys.argv[i],
    +                                          dictionary,
    +                                          vectors,
    +                                          file_list)
                 print(prediction)
                 print()
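    A usage sketch of the refactored entry points, assuming the machine_learning directory is on sys.path: find_prediction builds the lookup table itself, while the new _find_prediction (module-private by convention, imported here only for illustration) lets a caller that already holds the table reuse it across predictions, as main now does:

        from predictor_runner import find_prediction, _find_prediction
        from common.keyword_vector import read_dictionary
        from common.lookuptable import create_table
        from common.constants import KEYWORDS_FILE

        # One-off call: builds the lookup table internally.
        print(find_prediction("extract.lib"))

        # Repeated calls: build the table once, then reuse it per file.
        dictionary = read_dictionary(KEYWORDS_FILE)
        vectors, file_list = create_table(dictionary=dictionary)
        for name in ("extract.lib",):
            print(_find_prediction(name, dictionary, vectors, file_list))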
  • machine_learning/tests/common/test_lookuptable.py

    rfece13 → r112c79

     import unittest
     import numpy as np
    +import cProfile

     from common.lookuptable import *
    +from common.keyword_vector import count_occurances
     from common.constants import KEYWORDS_FILE

    …

     if __name__ == '__main__':
    +    #cProfile.run("unittest.main()")
         unittest.main()
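    When profiling the test run is wanted, the commented-out cProfile hook can be enabled; the sort argument below is an illustrative choice, not part of the commit:

        import cProfile
        import unittest

        if __name__ == '__main__':
            cProfile.run("unittest.main()", sort="cumtime")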
  • machine_learning/tests/model/test_predictor.py

    rfece13 → r112c79

     from model.predictor import *
     from common.constants import KEYWORDS_FILE
    +
    +from common.keyword_vector import normalise_vector, vector_distance

     class TestPredictionMethods(unittest.TestCase):