source: git/machine_learning/common/keyword_vector.py @ 746d3d

spielwiese
Last change on this file since 746d3d was 746d3d, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Write lookup table of vectors and their helpfiles
  • Property mode set to 100755
File size: 3.7 KB
Line 
1#!/usr/bin/python3
2
3"""Some vector logic"""
4
5import math
6import os
7import re
8import sys
9
### Read from file ########################################################
11
def read_dictionary(filename):
    """
    Load a word list from a text file, one entry per line.

    Every line is stripped of surrounding whitespace before it is stored,
    so a blank line in the file becomes an empty string in the result.
    If filename does not name an existing regular file, a message is
    printed and an empty list is returned.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            # Iterating the handle yields the same lines as repeated
            # readline() calls until end of file.
            return [entry.strip() for entry in handle]

    print("Please provide a valid input file as argument")
    return []
29
30
def count_occurances(filename, dictionary):
    """
    Count how often each dictionary word occurs in a text file.

    The file is read line by line.  Every character that is not an ASCII
    letter, a digit, '-' or '_' is treated as a separator; the remaining
    tokens are compared (case-sensitively, whole token only) against the
    words of the dictionary.

    Returns a dict mapping every word of dictionary to its number of
    occurrences (0 if it never appears).  If filename is not an existing
    regular file, or dictionary is empty, a message is printed and an
    empty dict is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return {}
    if not dictionary:
        print("Please provide a valid dictionary as argument")
        return {}

    # Zero vector over the dictionary: one counter per word.
    vector = dict.fromkeys(dictionary, 0)

    # Raw string so that the backslash reaches the regex engine untouched;
    # compiled once because it is applied to every line.
    separators = re.compile(r"[^0-9a-zA-Z\-_]")

    # The file is only read, so open it read-only: "r+" would needlessly
    # fail on write-protected files.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1
    return vector
55
56
### Copying ###############################################################
58
def copy_dictionary(dictionary):
    """
    Build a new list holding the same words in the same order.

    The copy is shallow: the returned list is independent of the input,
    but its entries are the same objects.
    """
    return list(dictionary)
67
68
def copy_vector(vector):
    """
    Build a new dict with the same keys and values as the given vector.

    The copy is shallow: changing entries of the result does not affect
    the original vector.
    """
    return dict(vector)
77
78
### Vector specific logic #################################################
80
def create_vector(dictionary):
    """
    Build the zero vector for a dictionary.

    The result maps every word of the dictionary to the count 0; a word
    that is listed more than once yields a single entry.
    """
    return dict.fromkeys(dictionary, 0)
89
90
def vector_distance(vec1, vec2):
    """
    Compute the Euclidean distance between two normalised vectors.

    Both arguments are copied and the copies are normalised, so the
    vectors passed in are left untouched.  If the two vectors do not have
    exactly the same set of keys, a message is printed and -1 is returned.
    """
    if set(vec1.keys()) != set(vec2.keys()):
        print("Dictionaries don't have the same keys")
        return -1

    # Work on copies so that normalising does not alter the caller's data.
    unit1, unit2 = copy_vector(vec1), copy_vector(vec2)
    for unit in (unit1, unit2):
        normalise_vector(unit)

    squared = 0
    for key, component in unit1.items():
        squared += (component - unit2[key]) ** 2

    return math.sqrt(squared)
111
112
def normalise_vector(vec):
    """
    Scale a vector in place to Euclidean length 1.

    Every entry is divided by the Euclidean norm of the vector, so that
    afterwards the distance between the vector and the zero vector is 1.
    The dict is modified in place and nothing is returned.

    A vector of length zero (empty, or all entries 0) has no direction and
    cannot be scaled to length 1; it is left unchanged instead of raising
    ZeroDivisionError.
    """
    norm = math.sqrt(sum(value * value for value in vec.values()))

    if norm == 0:
        # E.g. the count vector of a file that contains none of the
        # keywords: nothing to scale.
        return

    # Only values are reassigned, never keys added or removed, so updating
    # the dict while iterating over it is safe.
    for key in vec:
        vec[key] = vec[key] / norm
126
127
def main():
    """
    Exercise the vector helpers on small sample data.

    Expects exactly one command line argument, the path of a dictionary
    file; otherwise a usage message is printed and the process exits with
    status 1.  Prints a normalised sample vector, two sample distances and
    the keyword counts for "../Singular/table.h".
    """
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print("Usage: ")
        print(sys.argv[0] + " <dict_name>")
        sys.exit(1)

    dic = read_dictionary(arguments[0])

    # Normalising (3, 4) should give (0.6, 0.8).
    testvector = dict(hello=3, bye=4)
    normalise_vector(testvector)
    print("normalised vector: " + str(testvector))

    vector1 = dict(hello=3, bye=4)
    vector2 = dict(hello=4, bye=3)
    same_distance = vector_distance(vector1, vector1)
    print("distance same vector: " + str(same_distance))
    other_distance = vector_distance(vector1, vector2)
    print("distance different vector: " + str(other_distance))
    # The inputs are printed afterwards to show they were not modified.
    for sample in (vector1, vector2):
        print(sample)

    print(count_occurances("../Singular/table.h", dic))
151
# Run the self-test only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Note: See TracBrowser for help on using the repository browser.