Changeset 55a5abb in git


Timestamp:
Jul 26, 2019, 7:15:06 PM
Author:
Murray Heymann <heymann.murray@gmail.com>
Branches:
spielwiese (828514cf6e480e4bafc26df99217bf2a1ed1ef45)
Children:
d93ae5668f2da84265ee686cd190828895b49cba
Parents:
8720894b787185ed212506a9a4492ddf6e5f014c
git-author:
Murray Heymann <heymann.murray@gmail.com> 2019-07-26 19:15:06+02:00
git-committer:
Murray Heymann <heymann.murray@gmail.com> 2019-07-26 19:15:13+02:00
Message:
Optimize by normalising vectors less often
Location:
machine_learning
Files:
3 edited
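
The change moves vector normalisation out of the distance computation: count_occurances gains a normalise flag (default True), vector_distance now assumes pre-normalised inputs, and lookuptable normalises each cached vector once after creating or loading the table. The diff calls normalise_vector but does not show its body; the following is a minimal sketch of what it presumably does (in-place Euclidean normalisation) — the function name comes from the diff, the implementation itself is an assumption:

    import math

    def normalise_vector(vector):
        """Scale a {word: count} vector to unit Euclidean length, in place.

        Sketch only: the real implementation lives in
        machine_learning/common/keyword_vector.py and is not shown in
        this changeset.
        """
        norm = math.sqrt(sum(count ** 2 for count in vector.values()))
        if norm == 0:
            return
        for key in vector:
            vector[key] = vector[key] / norm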

  • machine_learning/common/keyword_vector.py

    r872089 → r55a5abb

    -def count_occurances(filename, dictionary):
    +def count_occurances(filename, dictionary, normalise=True):
         """
         Create a vector from a dictionary and populate the counts according to
    …
                         vector[word] = vector[word] + 1
                 line = file.readline()
    +    if normalise:
    +        normalise_vector(vector)
         return vector
    …
             return -1

    -    nvec1 = copy_vector(vec1)
    -    nvec2 = copy_vector(vec2)
    -    normalise_vector(nvec1)
    -    normalise_vector(nvec2)

         dist = 0
    -    for key in nvec1:
    -        dist = dist + (nvec1[key] - nvec2[key]) ** 2
    +    for key in vec1:
    +        dist = dist + (vec1[key] - vec2[key]) ** 2

         dist = math.sqrt(dist)
    …
         vector1 = {"hello":3, "bye":4}
    +    normalise_vector(vector1)
         vector2 = {"hello":4, "bye":3}
    +    normalise_vector(vector2)
         print("distance same vector: " + str(vector_distance(vector1, vector1)))
         print("distance different vector: " + str(vector_distance(vector1, vector2)))
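
With this change, vector_distance no longer copies and normalises its arguments, so callers must pass vectors that are already normalised; count_occurances does this by default unless normalise=False is requested. A hedged usage sketch of the new calling convention (the file name, dictionary, and second vector are placeholders, not values from the repository):

    from common.keyword_vector import count_occurances, normalise_vector, \
            vector_distance

    # Build a raw count vector, then normalise exactly once.
    # "some_page.html" and dictionary are placeholders.
    vec = count_occurances("some_page.html", dictionary, normalise=False)
    normalise_vector(vec)
    # other_vec must also be pre-normalised, since vector_distance no
    # longer normalises internal copies.
    dist = vector_distance(vec, other_vec)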
  • machine_learning/common/lookuptable.py

    r872089 → r55a5abb

     # local imports
    -from common.keyword_vector import count_occurances, read_dictionary
    +from common.keyword_vector import count_occurances, read_dictionary, \
    +        normalise_vector
     from common.constants import HELP_FILE_URL, HELP_FILE_PATH, SINGULAR_BIN, \
                             EXTRACT_SCRIPT, KEYWORDS_FILE, HELPFILE_NPY, \
    …
             for file in file_list:
                 vector = count_occurances(os.path.join(HELP_FILE_PATH, "html",
    -                                                    file), dictionary)
    +                                                   file),
    +                                      dictionary,
    +                                      normalise=False)
                 vectors.append(vector)
             vectors = np.array(vectors)
    …
             vectors = np.load(VECTORS_NPY)
             file_list = np.load(HELPFILE_NPY)
    +    for vector in vectors:
    +        normalise_vector(vector)

         return (vectors, file_list)
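
The design choice in lookuptable.py: the cached VECTORS_NPY file keeps raw, unnormalised counts, and normalisation happens in memory after either branch (fresh creation or cache load), so each vector is normalised exactly once per run rather than once per distance call. A condensed sketch of the pattern — the cache check and file listing are assumptions, the rest mirrors the diff:

    import os
    import numpy as np

    if not os.path.exists(VECTORS_NPY):         # assumed cache condition
        vectors = np.array([count_occurances(path, dictionary,
                                             normalise=False)
                            for path in help_file_paths])
        np.save(VECTORS_NPY, vectors)           # cache the raw counts
    else:
        vectors = np.load(VECTORS_NPY)
    for vector in vectors:                      # entries are dict vectors
        normalise_vector(vector)                # normalise once, in place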
  • machine_learning/model/predictor.py

    r872089 → r55a5abb

     Define the predictor class for classifying according to help page.
     """
    +
    +import cProfile
    +import time

     # Third party imports
    …
         dictionary = read_dictionary(KEYWORDS_FILE)
    +    start = time.time()
         vectors, file_list = create_table(dictionary=dictionary)
    +    end = time.time()
    +    print(end - start, "seconds to create_table")
         test_vec = count_occurances("extract.lib", dictionary)
         predictor.fit(vectors, file_list)
    +    start = time.time()
         prediction = predictor.predict(np.array([test_vec]))
    +    end = time.time()
         print(prediction)
    +    print(end - start, "seconds to make prediction")

     if __name__ == '__main__':
    -    main()
    +    cProfile.run("main()")
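
The predictor changes add coarse wall-clock timing around the two expensive steps and wrap main() in cProfile for per-function statistics. A minimal standalone sketch of the same measurement technique, with a placeholder workload standing in for main():

    import cProfile
    import time

    def work():
        # Placeholder workload; stands in for main() from predictor.py.
        return sum(i * i for i in range(10 ** 6))

    start = time.time()
    work()
    end = time.time()
    print(end - start, "seconds (wall clock)")

    # Per-function call counts and cumulative times for the same workload.
    cProfile.run("work()")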