Ignore:
Timestamp:
Jul 25, 2019, 4:01:14 PM (5 years ago)
Author:
Murray Heymann <heymann.murray@…>
Branches:
spielwiese (fe61d9c35bf7c61f2b6cbf1b56e25e2f08d536cc)
Children:
dd499fe3f7367342e4f0cf3495cdab72a21c2280
Parents:
1a034b64ea0141af657c8bd605ec28fcfe0d1d8b
Message:
Cleanup and test predictor.py
File:
1 edited

Legend:

Unmodified
Added
Removed
  • machine_learning/common/keyword_vector.py

    r1a034b r2a4516e  
    11#!/usr/bin/python3
     2
     3"""Some vector logic"""
    24
    35import math
     
    911
    1012def read_dictionary(filename):
     13    """
     14    Read a dictionary saved as a textfile
     15    """
    1116    if not os.path.isfile(filename):
    1217        print("Please provide a valid input file as argument")
     
    2530
    2631def count_occurances(filename, dictionary):
     32    """
     33    Create a vector from a dictionary and populate the counts according to
     34    a specified file
     35    """
    2736    if not os.path.isfile(filename):
    2837        print("Please provide a valid input file as argument")
     
    3241        return {}
    3342    vector = create_vector(dictionary)
    34     with open(filename, "r+") as file: 
     43    with open(filename, "r+") as file:
    3544        line = file.readline()
    3645
     
    3847            # DONE: replace all non-alphanumeric characters with space
    3948            # words = line.split()
    40             words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split()
     49            words = re.sub('[^0-9a-zA-Z\-\_]', ' ', line).split() # pylint: disable=anomalous-backslash-in-string
    4150            for word in words:
    4251                if word in vector.keys():
     
    4958
    5059def copy_dictionary(dictionary):
     60    """
     61    Return an identical copy of a dictionary
     62    """
    5163    new_dic = []
    5264    for word in dictionary:
     
    5668
    5769def copy_vector(vector):
     70    """
     71    Return an identical copy of a vector
     72    """
    5873    new_vector = {}
    5974    for key in vector.keys():
     
    6580
    6681def create_vector(dictionary):
    67     vector={}
     82    """
     83    Create a zero vector for a given dictionary
     84    """
     85    vector = {}
    6886    for word in dictionary:
    6987        vector[word] = 0
     
    7290
    7391def vector_distance(vec1, vec2):
     92    """
     93    Calculate the Euclidean distance between two vectors.
     94    """
    7495    if not set(vec1.keys()) == set(vec2.keys()):
    7596        print("Dictionaries don't have the same keys")
     
    82103
    83104    dist = 0
    84     for key in nvec1.keys():
     105    for key in nvec1:
    85106        dist = dist + (nvec1[key] - nvec2[key]) ** 2
    86107
    87108    dist = math.sqrt(dist)
    88    
     109
    89110    return dist
    90111
    91112
    92113def normalise_vector(vec):
    93     sum = 0
     114    """
     115    Take a given vector and normalise each entry to get a Euclidean
     116    distance of 1 between the zero vector and the vector itself.
     117    """
     118    sum_vals = 0
    94119    for key in vec.keys():
    95         sum = sum + (vec[key] * vec[key])
    96    
    97     sum = math.sqrt(sum)
     120        sum_vals = sum_vals + (vec[key] * vec[key])
     121
     122    sum_vals = math.sqrt(sum_vals)
    98123
    99124    for key in vec.keys():
    100         vec[key] = (vec[key] + 0.0) / sum
     125        vec[key] = (vec[key] + 0.0) / sum_vals
    101126
    102127
     
    107132        sys.exit(1)
    108133
    109     dic = read_dictionary(sys.argv[1])
     134    dic = read_dictionary(sys.argv[1]) # pylint: disable=invalid-name
    110135
    111     vector = {"hello":3, "bye":4}
    112     normalise_vector(vector)
    113     print("normalised vector: " + str(vector))
     136    testvector = {"hello":3, "bye":4} # pylint: disable=invalid-name
     137    normalise_vector(testvector)
     138    print("normalised vector: " + str(testvector))
    114139
    115     vector1 = {"hello":3, "bye":4}
    116     vector2 = {"hello":4, "bye":3}
     140    vector1 = {"hello":3, "bye":4} # pylint: disable=invalid-name
     141    vector2 = {"hello":4, "bye":3} # pylint: disable=invalid-name
    117142    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    118143    print("distance different vector: " + str(vector_distance(vector1, vector2)))
     
    120145    print(vector2)
    121146
    122     print(count_occurances("Singular/table.h", dic))
     147    print(count_occurances("../Singular/table.h", dic))
Note: See TracChangeset for help on using the changeset viewer.