source: git/machine_learning/common/keyword_vector.py @ feed60b

spielwiese
Last change on this file since feed60b was feed60b, checked in by Murray Heymann <heymann.murray@…>, 4 years ago
Fix some minor errors
  • Property mode set to 100755
File size: 4.4 KB
Line 
1#!/usr/bin/python3
2
3"""Some vector logic"""
4
5import os
6import re
7import sys
8import numpy as np
9
10from common.constants import KEYWORDS_FILE
11
12### Read from file ########################################################
13
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Load a keyword dictionary stored as a text file, one entry per line.

    Every line of the file becomes one entry with its surrounding
    whitespace stripped, so blank lines are kept as empty-string entries.
    If ``filename`` is not an existing file, a message is printed and an
    empty numpy array is returned instead.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            entries = [raw_line.strip() for raw_line in handle]
        return np.array(entries)

    print("Please provide a valid input file as argument")
    return np.array([])
31
32
def count_occurances(filename, dictionary, normalise=True):
    """
    Count how often each dictionary keyword occurs in a file.

    The file is split into words made of letters, digits, ``-`` and ``_``;
    every word that is a dictionary entry increments that entry's count.

    Parameters:
        filename:   path of the text file to scan.
        dictionary: keywords to count, as a numpy array or any sequence
                    of strings.
        normalise:  when true, the count vector is passed through
                    ``normalise_vector`` before being returned.

    Returns:
        A numpy array with one count per distinct dictionary entry, in
        dictionary order.  If the file does not exist, or the dictionary
        is ``None`` or empty, a message is printed and ``[]`` is returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return []
    # Test for None before touching any attribute: reading ``.size`` first
    # raised AttributeError and made the None guard unreachable.
    if dictionary is None:
        print("Please provide a valid dictionary as argument")
        return []
    # np.asarray lets plain lists/tuples through the emptiness check too.
    if np.asarray(dictionary).size == 0:
        print("Please provide a valid dictionary as argument")
        return []

    vector = create_vector_dictionary(dictionary)
    # Anything that is not a letter, digit, '-' or '_' separates words.
    separator = re.compile(r"[^0-9a-zA-Z_-]")
    # Read-only mode: the file is never written, and "r+" would fail on
    # files the user may read but not modify.
    with open(filename, "r") as file:
        for line in file:
            for word in separator.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1
    vector = np.array(list(vector.values()))
    if normalise:
        vector = normalise_vector(vector)
    return vector
61
62
63### Copying ###############################################################
64
def copy_dictionary(dictionary):
    """
    Build a new list holding the entries of a dictionary in the same order.

    The copy is shallow and is always a plain list, whatever kind of
    iterable was passed in.
    """
    return list(dictionary)
73
74
def copy_vector(vector):
    """
    Return an independent numpy copy of a vector.

    ``vector`` may be a numpy array or anything ``np.array`` accepts (for
    example a list); the result is always a new ``np.ndarray`` that shares
    no memory with the input.
    """
    # np.array copies its input by default, so wrapping it in np.copy only
    # produced a second, redundant copy.
    return np.array(vector, copy=True)
80
81
82### Vector specific logic #################################################
83
def create_vector_dictionary(dictionary):
    """
    Map every entry of a dictionary to a count of zero.

    Returns a ``dict`` keyed by the dictionary entries, in order, with all
    values set to 0.  Asserts that the dictionary is neither ``None`` nor
    empty.
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return dict.fromkeys(dictionary, 0)
94
95
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Both arguments may be numpy arrays or plain sequences such as lists
    and tuples.  If the two vectors differ in length, a message is printed
    and -1 is returned.
    """
    if not len(vec1) == len(vec2):
        print("Vectors don't have the same sizes")
        return -1

    # Coerce to arrays first: subtracting two plain lists raises TypeError.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
107
108
def normalise_vector(vec):
    """
    Scale a vector to Euclidean length 1.

    Parameters:
        vec: the vector to normalise.  A numpy array is expected; any
             other sequence triggers a printed warning and is converted.

    Returns:
        A numpy array holding ``vec`` divided by its Euclidean norm.  A
        vector whose norm is 0 (including an empty one) is returned
        unscaled.  For ``None`` a message is printed and an empty array
        is returned.  The input itself is never modified.
    """

    if vec is None:
        print("Please provide a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")
        # A plain list cannot be divided by a scalar, so the division
        # below used to raise TypeError right after this warning.
        vec = np.asarray(vec)
    norm = np.linalg.norm(vec)
    if norm != 0:
        vec = vec / norm
    return vec
126
127
def main():
    """
    Run some basic tests

    Exercises normalise_vector and vector_distance on small hand-made
    vectors, then prints the keyword count vector of
    ``../Singular/table.h``.  The dictionary is read from the file named
    by the single command-line argument if one is given, otherwise from
    the default keywords file.
    """

    # normalise_vector returns a new array rather than working in place,
    # so the result has to be kept for the printout to show it.
    testvector = np.array([3, 4])
    testvector = normalise_vector(testvector)
    print("normalised vector: " + str(testvector))

    vector1 = np.array([3, 4])
    vector1 = normalise_vector(vector1)
    vector2 = np.array([4, 3])
    vector2 = normalise_vector(vector2)
    print("distance same vector: " + str(vector_distance(vector1, vector1)))
    print("distance different vector: " + str(vector_distance(vector1, vector2)))
    print(vector1)
    print(vector2)
    print()

    print("Attempt to normalise the zero vector")
    print(normalise_vector(np.array([0, 0, 0, 0, 0])))
    print()

    print("Attempt to normalise list")
    print(normalise_vector([3, 4, 0, 0, 0]))
    print()

    print("Attempt to normalise empty vector")
    print(normalise_vector(np.array([])))
    print()

    print("Attempt to normalise None")
    print(normalise_vector(None))
    print()

    if len(sys.argv) == 2:
        dic = read_dictionary(filename=sys.argv[1])
    else:
        dic = read_dictionary()
    print("vector of ../Singular/table.h")
    print(count_occurances("../Singular/table.h", dic))


if __name__ == '__main__':
    main()
Note: See TracBrowser for help on using the repository browser.