Changeset 91d4ae in git


Timestamp: Jul 29, 2019, 2:45:44 PM (5 years ago)
Author: Murray Heymann <heymann.murray@…>
Branches: spielwiese (17f1d200f27c5bd38f5dfc6e8a0879242279d1d8)
Children: eb2904af1f3fb2d653378807ed3b18780cfd4dbc
Parents: d93ae5668f2da84265ee686cd190828895b49cba
Message: Optimize methods
Location: machine_learning
Files: 3 edited
  • machine_learning/common/keyword_vector.py

    rd93ae5 r91d4ae

 import re
 import sys
+import numpy as np

 ### Read from file ########################################################
…
         print("Please provide a valid dictionary as argument")
         return {}
-    vector = create_vector(dictionary)
+    if dictionary is None:
+        print("Please provide a valid dictionary as argument")
+        return {}
+    vector = create_vector_dictionary(dictionary)
     with open(filename, "r+") as file:
         line = file.readline()

         while not line == "":
-            # DONE: replace all non-alphanumeric characters with space
-            # words = line.split()
             words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string
             for word in words:
…
                     vector[word] = vector[word] + 1
             line = file.readline()
+    vector = np.array(list(vector.values()))
     if normalise:
-        normalise_vector(vector)
-    return vector
+        vector = normalise_vector(vector)
+    return vector
…
     Return an identical copy of a vector
     """
-    new_vector = {}
-    for key in vector.keys():
-        new_vector[key] = vector[key]
-    return new_vector
+    return np.copy(np.array(vector))


 ### Vector specific logic #################################################

-def create_vector(dictionary):
+def create_vector_dictionary(dictionary):
     """
     Create a zero vector for a given dictionary
…
     Calculate the Euclidean distance between two vectors.
     """
-    if not set(vec1.keys()) == set(vec2.keys()):
-        print("Dictionaries don't have the same keys")
+    if not len(vec1) == len(vec2):
+        print("Vectors don't have the same sizes")
         return -1

-
-    dist = 0
-    for key in vec1:
-        dist = dist + (vec1[key] - vec2[key]) ** 2
-
-    dist = math.sqrt(dist)
+    diff_vec = vec1 - vec2
+    dist = np.linalg.norm(diff_vec)

     return dist
…
     distance of 1 between the zero vector and the vector itself.
     """
-    sum_vals = 0
-    for key in vec.keys():
-        sum_vals = sum_vals + (vec[key] * vec[key])
-
-    sum_vals = math.sqrt(sum_vals)
-
-    for key in vec.keys():
-        vec[key] = (vec[key] + 0.0) / sum_vals
+    vec = vec / np.linalg.norm(vec)
+    return vec

…
         sys.exit(1)

-    dic = read_dictionary(sys.argv[1])
-
-    testvector = {"hello":3, "bye":4}
+    testvector = np.array([3,4])
     normalise_vector(testvector)
     print("normalised vector: " + str(testvector))

-    vector1 = {"hello":3, "bye":4}
-    normalise_vector(vector1)
-    vector2 = {"hello":4, "bye":3}
+    vector1 = np.array([3,4])
+    vector1 = normalise_vector(vector1)
+    vector2 = np.array([4,3])
     normalise_vector(vector2)
+    vector2 = normalise_vector(vector2)
     print("distance same vector: " + str(vector_distance(vector1, vector1)))
     print("distance different vector: " + str(vector_distance(vector1, vector2)))
…
     print(vector2)

+    dic = read_dictionary(sys.argv[1])
     print(count_occurances("../Singular/table.h", dic))
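
For reference, here is a minimal standalone sketch (not taken verbatim from keyword_vector.py) of how the two rewritten helpers behave once the per-key dictionary loops are replaced by NumPy array operations. Note that normalise_vector now returns the scaled array instead of modifying its argument in place, which is why the callers in the diff above were changed to reassign its result.

    # Sketch only: mirrors the numpy-based vector_distance/normalise_vector
    # shown in the diff above, assuming 1-D numeric arrays as inputs.
    import numpy as np

    def vector_distance(vec1, vec2):
        # Euclidean distance as the 2-norm of the difference vector
        if not len(vec1) == len(vec2):
            print("Vectors don't have the same sizes")
            return -1
        return np.linalg.norm(vec1 - vec2)

    def normalise_vector(vec):
        # returns a new unit-length array; the input array is left untouched
        return vec / np.linalg.norm(vec)

    a = normalise_vector(np.array([3.0, 4.0]))   # [0.6, 0.8]
    b = normalise_vector(np.array([4.0, 3.0]))   # [0.8, 0.6]
    print(vector_distance(a, a))                 # 0.0
    print(vector_distance(a, b))                 # ~0.2828
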
  • machine_learning/common/lookuptable.py

    rd93ae5 r91d4ae

         vectors = np.load(VECTORS_NPY)
         file_list = np.load(HELPFILE_NPY)
-    for vector in vectors:
-        normalise_vector(vector)
+
+    # normalise the vectors
+    vectors = vectors / np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]

     return (vectors, file_list)
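
The change above normalises every row of the vectors matrix in a single broadcast expression instead of looping over rows. Below is a small self-contained sketch of the same pattern on a made-up 2x2 matrix; np.linalg.norm(vectors, axis=-1, keepdims=True) would compute identical per-row norms. Note that a row of all zeros would cause a division by zero in this expression.

    # Sketch only: row-wise normalisation via broadcasting, as in the
    # expression added to lookuptable.py above (example matrix is made up).
    import numpy as np

    vectors = np.array([[3.0, 4.0],
                        [1.0, 0.0]])

    # per-row Euclidean norms, kept as a column vector so the division
    # broadcasts across each row
    row_norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
    vectors = vectors / row_norms

    print(vectors)                           # [[0.6 0.8] [1.  0. ]]
    print(np.linalg.norm(vectors, axis=-1))  # [1. 1.]
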
  • machine_learning/model/predictor.py

    rd93ae5 r91d4ae

 # Local imports
 from common.keyword_vector import vector_distance, count_occurances, \
-                                    read_dictionary
+        read_dictionary, normalise_vector
 from common.lookuptable import create_table
 from common.constants import KEYWORDS_FILE
…
         for x in X: # pylint: disable=invalid-name
             # find the closest vector
-
             min_val = float("inf")
-            min_vec = None
+            index = -1
+            i = 0
             for vec in self.vectors:
                 dist = vector_distance(x, vec)
                 if dist < min_val:
                     min_val = dist
-                    min_vec = vec
+                    index = i
+                i = i + 1

             # find corresponding filename
-            index = list(self.vectors).index(min_vec)
             file = self.files[index]
             ret_list.append(file)
…
     print("Running some tests")
     predictor = HelpPagePredictor()
-    vector1 = {"hello":1, "bye":4, "pizza": 10}
-    vector2 = {"hello":2, "bye":3, "pizza": 1}
-    vector3 = {"hello":3, "bye":9, "pizza": 3}
+    vector1 = normalise_vector([1, 4, 10])
+    vector2 = normalise_vector([2, 3, 1])
+    vector3 = normalise_vector([3, 9, 3])

     vectors = np.array([vector1, vector2, vector3])
…
     print(vectors)
     print(files)
+    print()

-    testvec = {"hello":1, "bye":1, "pizza": 1}
+    testvec = normalise_vector([1, 1, 1])
+    print("test vector:")
+    print(testvec)
+    print()

     print("distance to 1")
…
     predictor.fit(vectors, files)
     prediction = predictor.predict(np.array([testvec]))
+    print("Prediction:")
     print(prediction)
+    print()

     dictionary = read_dictionary(KEYWORDS_FILE)
+
     start = time.time()
     vectors, file_list = create_table(dictionary=dictionary)
     end = time.time()
     print(end - start, "seconds to create_table")
+
     test_vec = count_occurances("extract.lib", dictionary)
     predictor.fit(vectors, file_list)
+
     start = time.time()
     prediction = predictor.predict(np.array([test_vec]))
     end = time.time()
+    print(end - start, "seconds to make prediction")
     print(prediction)
-    print(end - start, "seconds to make prediction")

 if __name__ == '__main__':
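
The predict() change above keeps a running index of the closest vector during the scan instead of recovering it afterwards with list(self.vectors).index(min_vec), which avoids a second pass over self.vectors. Here is a standalone sketch of that loop-with-index pattern, together with a vectorised np.argmin alternative that is not part of the changeset, assuming x and the rows of vectors have the same length.

    # Sketch only: the loop-with-index pattern used in predict() above, plus
    # a vectorised equivalent via np.argmin (not part of the changeset).
    import numpy as np

    def nearest_index_loop(x, vectors):
        # track the index of the closest row while scanning once
        min_val = float("inf")
        index = -1
        i = 0
        for vec in vectors:
            dist = np.linalg.norm(x - vec)
            if dist < min_val:
                min_val = dist
                index = i
            i = i + 1
        return index

    def nearest_index_argmin(x, vectors):
        # distances from x to every row at once, then the index of the smallest
        return int(np.argmin(np.linalg.norm(vectors - x, axis=1)))

    vectors = np.array([[1.0, 4.0, 10.0],
                        [2.0, 3.0, 1.0],
                        [3.0, 9.0, 3.0]])
    x = np.array([1.0, 1.0, 1.0])
    print(nearest_index_loop(x, vectors), nearest_index_argmin(x, vectors))  # 1 1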