Changeset 2a4516e in git


Ignore:
Timestamp:
Jul 25, 2019, 4:01:14 PM (4 years ago)
Author:
Murray Heymann <heymann.murray@…>
Branches:
spielwiese (828514cf6e480e4bafc26df99217bf2a1ed1ef45)
Children:
dd499fe3f7367342e4f0cf3495cdab72a21c2280
Parents:
1a034b64ea0141af657c8bd605ec28fcfe0d1d8b
Message:
Cleanup and test predictor.py
Location:
machine_learning
Files:
3 edited

Legend:

Unmodified
Added
Removed
  • machine_learning/common/keyword_vector.py

    r1a034b r2a4516e  
    11#!/usr/bin/python3
     2
     3"""Some vector logic"""
    24
    35import math
     
    911
    1012def read_dictionary(filename):
     13    """
     14    Read a dictionary saved as a textfile
     15    """
    1116    if not os.path.isfile(filename):
    1217        print("Please provide a valid input file as argument")
     
    2530
    2631def count_occurances(filename, dictionary):
     32    """
     33    Create a vector from a dictionary and populate the counts according to
     34    a specified file
     35    """
    2736    if not os.path.isfile(filename):
    2837        print("Please provide a valid input file as argument")
     
    3241        return {}
    3342    vector = create_vector(dictionary)
    34     with open(filename, "r+") as file: 
     43    with open(filename, "r+") as file:
    3544        line = file.readline()
    3645
     
    3847            # DONE: replace all non-alphanumeric characters with space
    3948            # words = line.split()
    40             words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split()
     49            words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string
    4150            for word in words:
    4251                if word in vector.keys():
     
    4958
    5059def copy_dictionary(dictionary):
     60    """
     61    Return an identical copy of a dictionary
     62    """
    5163    new_dic = []
    5264    for word in dictionary:
     
    5668
    5769def copy_vector(vector):
     70    """
     71    Return an identical copy of a vector
     72    """
    5873    new_vector = {}
    5974    for key in vector.keys():
     
    6580
    6681def create_vector(dictionary):
    67     vector={}
     82    """
     83    Create a zero vector for a given dictionary
     84    """
     85    vector = {}
    6886    for word in dictionary:
    6987        vector[word] = 0
     
    7290
    7391def vector_distance(vec1, vec2):
     92    """
     93    Calculate the Euclidean distance between two vectors.
     94    """
    7495    if not set(vec1.keys()) == set(vec2.keys()):
    7596        print("Dictionaries don't have the same keys")
     
    82103
    83104    dist = 0
    84     for key in nvec1.keys():
     105    for key in nvec1:
    85106        dist = dist + (nvec1[key] - nvec2[key]) ** 2
    86107
    87108    dist = math.sqrt(dist)
    88    
     109
    89110    return dist
    90111
    91112
    92113def normalise_vector(vec):
    93     sum = 0
     114    """
     115    Take a given vector and normalise each entry to get a Euclidean
     116    distance of 1 between the zero vector and the vector itself.
     117    """
     118    sum_vals = 0
    94119    for key in vec.keys():
    95         sum = sum + (vec[key] * vec[key])
    96    
    97     sum = math.sqrt(sum)
     120        sum_vals = sum_vals + (vec[key] * vec[key])
     121
     122    sum_vals = math.sqrt(sum_vals)
    98123
    99124    for key in vec.keys():
    100         vec[key] = (vec[key] + 0.0) / sum
     125        vec[key] = (vec[key] + 0.0) / sum_vals
    101126
    102127
     
    107132        sys.exit(1)
    108133
    109     dic = read_dictionary(sys.argv[1])
     134    dic = read_dictionary(sys.argv[1]) # pylint: disable=invalid-name
    110135
    111     vector = {"hello":3, "bye":4}
    112     normalise_vector(vector)
    113     print("normalised vector: " + str(vector))
     136    testvector = {"hello":3, "bye":4} # pylint: disable=invalid-name
     137    normalise_vector(testvector)
     138    print("normalised vector: " + str(testvector))
    114139
    115     vector1 = {"hello":3, "bye":4}
    116     vector2 = {"hello":4, "bye":3}
     140    vector1 = {"hello":3, "bye":4} # pylint: disable=invalid-name
     141    vector2 = {"hello":4, "bye":3} # pylint: disable=invalid-name
    117142    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    118143    print("distance different vector: " + str(vector_distance(vector1, vector2)))
     
    120145    print(vector2)
    121146
    122     print(count_occurances("Singular/table.h", dic))
     147    print(count_occurances("../Singular/table.h", dic))
  • machine_learning/predictor.py

    r1a034b r2a4516e  
    22Define the predictor class for classifying according to help page.
    33"""
     4
    45# Third party imports
     6import numpy as np
    57from sklearn.base import BaseEstimator, ClassifierMixin
    68
    79# Local imports
    810from common.keyword_vector import vector_distance
     11
    912
    1013class HelpPagePredictor(BaseEstimator, ClassifierMixin):
     
    2326        Setup the correspondence of vectors to help-files
    2427        """
     28        assert X is not None, "Please provide data for X"
     29        assert y is not None, "Please provide data for y"
    2530        self.vectors = X
    2631        self.files = y
     
    3136        Classify the input vectors
    3237        """
     38        assert X is not None, "Please provide data for X"
    3339        ret_list = []
    3440        for x in X: # pylint: disable=invalid-name
     
    4450
    4551            # find corresponding filename
    46             index = self.vectors.index(min_vec)
     52            index = list(self.vectors).index(min_vec)
    4753            file = self.files[index]
    4854            ret_list.append(file)
    49         return ret_list
     55        return np.array(ret_list)
     56
     57
     58if __name__ == '__main__':
     59    print("Running some tests")
     60    predictor = HelpPagePredictor() # pylint: disable=invalid-name
     61    vector1 = {"hello":1, "bye":4, "pizza": 10} # pylint: disable=invalid-name
     62    vector2 = {"hello":2, "bye":3, "pizza": 1} # pylint: disable=invalid-name
     63    vector3 = {"hello":3, "bye":9, "pizza": 3} # pylint: disable=invalid-name
     64
     65    vectors = np.array([vector1, vector2, vector3]) # pylint: disable=invalid-name
     66    files = np.array(["file1", "file2", "file3"]) # pylint: disable=invalid-name
     67    print(vectors)
     68    print(files)
     69
     70    testvec = {"hello":1, "bye":1, "pizza": 1} # pylint: disable=invalid-name
     71
     72    print("distance to 1")
     73    print(vector_distance(testvec, vector1))
     74    print()
     75    print("distance to 2")
     76    print(vector_distance(testvec, vector2))
     77    print()
     78    print("distance to 3")
     79    print(vector_distance(testvec, vector3))
     80    print()
     81
     82    predictor.fit(vectors, files)
     83    prediction = predictor.predict(np.array([testvec])) # pylint: disable=invalid-name
     84    print(prediction)
  • machine_learning/requirements.txt

    r1a034b r2a4516e  
    11numpy==1.16.1
    22#pandas==0.24.0
    3 pylint==2.2.2
     3pylint==2.3.0
    44scikit-learn==0.20.2
    55scipy==1.2.0
Note: See TracChangeset for help on using the changeset viewer.