source: git/machine_learning/common/keyword_vector.py @ 05dbb13

spielwiese
Last change on this file since 05dbb13 was 05dbb13, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Fix some errors, alphabetise keyword list
  • Property mode set to 100755
File size: 4.4 KB
Line 
1#!/usr/bin/python3
2
3"""Some vector logic"""
4
5import numpy as np
6import os
7import re
8import sys
9import traceback
10
11from common.constants import KEYWORDS_FILE
12
13### Read from file ########################################################
14
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Read a dictionary saved as a textfile.

    Every line of the file becomes one entry, stripped of surrounding
    whitespace.  Blank lines are kept as empty-string entries.

    Args:
        filename: path of the keyword file; defaults to KEYWORDS_FILE.

    Returns:
        A numpy array with one entry per line of the file.  When
        ``filename`` is not an existing file, a message is printed and an
        empty numpy array is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        # Same value an empty file produces, so callers can always rely on
        # ndarray attributes such as ``.size`` (count_occurances does).
        return np.array([])

    with open(filename, "r") as file:
        return np.array([line.strip() for line in file])
32
33
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each keyword of a dictionary occurs in a file.

    The file is split into tokens consisting of ASCII letters, digits,
    "-" and "_"; every other character separates tokens.  A token is
    counted only when it equals a keyword exactly.

    Args:
        filename: path of the text file to scan.
        dictionary: the keywords, as a numpy array or any sequence numpy
            can convert to one.
        normalise: when true (the default) the counts are passed through
            normalise_vector before being returned.

    Returns:
        A numpy array with one count per distinct keyword, in order of
        first appearance in ``dictionary``.  If the file does not exist,
        or the dictionary is None or empty, a message is printed and an
        empty list is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return []
    # None must be tested before the size: asking None for its size raises
    # AttributeError, which made the dedicated None check unreachable.
    if dictionary is None or np.asarray(dictionary).size == 0:
        print("Please provide a valid dictionary as argument")
        return []

    # One zero-initialised counter per keyword (dicts keep insertion order).
    counts = dict.fromkeys(np.asarray(dictionary), 0)

    # Raw string avoids invalid-escape warnings; compiled once, not per line.
    token_pattern = re.compile(r"[0-9a-zA-Z_-]+")

    # Read-only mode: counting must not require write permission.
    with open(filename, "r") as file:
        for line in file:
            for word in token_pattern.findall(line):
                if word in counts:
                    counts[word] += 1

    vector = np.array(list(counts.values()))
    if normalise:
        vector = normalise_vector(vector)
    return vector
62
63
64### Copying ###############################################################
65
def copy_dictionary(dictionary):
    """
    Build a new list holding the entries of a dictionary, in the same order
    """
    return list(dictionary)
74
75
def copy_vector(vector):
    """
    Produce an independent numpy array holding the values of a vector
    """
    as_array = np.array(vector)
    return np.copy(as_array)
81
82
83### Vector specific logic #################################################
84
def create_vector_dictionary(dictionary):
    """
    Map every word of a given dictionary to an initial count of zero
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return dict.fromkeys(dictionary, 0)
95
96
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Args:
        vec1: first vector, a numpy array or a plain sequence of numbers.
        vec2: second vector, of the same length as ``vec1``.

    Returns:
        The Euclidean norm of ``vec1 - vec2``.  If the two vectors differ
        in length a message is printed and -1 is returned instead.
    """
    # Convert up front so that lists and tuples can be subtracted too;
    # arrays are passed through unchanged.
    first = np.asarray(vec1)
    second = np.asarray(vec2)

    if len(first) != len(second):
        print("Vectors don't have the same sizes")
        return -1

    return np.linalg.norm(first - second)
108
109
def normalise_vector(vec):
    """
    Scale a vector so that its Euclidean norm is 1.

    The input is never modified; callers must use the returned value.

    Args:
        vec: the vector to normalise.  A numpy array is expected; any other
            value triggers a warning and is converted to a numpy array.

    Returns:
        A numpy array holding ``vec`` divided by its Euclidean norm.  A
        vector whose norm is zero (including the empty vector) is returned
        unscaled.  For None, a message is printed and an empty numpy array
        is returned.
    """

    if vec is None:
        print("Please provide a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")
        # Convert here so the result is an array on every path, including
        # the zero-norm path that returns the vector unscaled.
        vec = np.asarray(vec)

    norm = np.linalg.norm(vec)
    if norm != 0:
        vec = vec / norm
    return vec
127
128
def main():
    """
    Run some basic tests

    Prints the results of normalising and comparing a few sample vectors,
    then reads a keyword dictionary (from the file named by the single
    command line argument, or the default file otherwise) and prints the
    keyword vector of ../Singular/table.h.
    """

    testvector = np.array([3, 4])
    # normalise_vector returns a new array instead of changing its argument,
    # so the result has to be kept for the printout to show it.
    testvector = normalise_vector(testvector)
    print("normalised vector: " + str(testvector))

    vector1 = normalise_vector(np.array([3, 4]))
    vector2 = normalise_vector(np.array([4, 3]))
    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    print("distance different vector: " + str(vector_distance(vector1, vector2)))
    print(vector1)
    print(vector2)
    print()

    print("Attempt to normalise the zero vector")
    print(normalise_vector(np.array([0, 0, 0, 0, 0])))
    print()

    print("Attempt to normalise list")
    print(normalise_vector([3, 4, 0, 0, 0]))
    print()

    print("Attempt to normalise empty vector")
    print(normalise_vector(np.array([])))
    print()

    print("Attempt to normalise None")
    print(normalise_vector(None))
    print()

    if len(sys.argv) == 2:
        dic = read_dictionary(filename=sys.argv[1])
    else:
        dic = read_dictionary()
    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", dic))


if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.