1 | #!/usr/bin/python3 |
---|
2 | |
---|
3 | """Some vector logic""" |
---|
4 | |
---|
5 | import os |
---|
6 | import re |
---|
7 | import sys |
---|
8 | import numpy as np |
---|
9 | from sklearn.feature_extraction.text import CountVectorizer |
---|
10 | |
---|
11 | from common.constants import KEYWORDS_FILE |
---|
12 | |
---|
13 | ### Read from file ######################################################## |
---|
14 | |
---|
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Read a dictionary saved as a textfile.

    Every line of the file becomes one entry, with leading and trailing
    whitespace stripped (a blank line therefore yields an empty string).

    :param filename: path of the text file to read; defaults to
        KEYWORDS_FILE
    :return: numpy array holding the stripped lines in file order
    :raises FileNotFoundError: (IOError on Python 2) when filename is not
        an existing regular file
    """
    if not os.path.isfile(filename):
        message = ("Please provide a valid input file as argument to read "
                   "dictionary")
        print(message)
        details = "{}: {!r}".format(message, filename)
        # FileNotFoundError does not exist on Python 2. On Python 3 it is a
        # subclass of IOError, so callers catching IOError work on both.
        if sys.version_info[0] >= 3:  # pylint: disable=no-else-raise
            raise FileNotFoundError(details)
        else:
            raise IOError(details)

    with open(filename, "r") as handle:
        # Iterating the handle yields the same lines as repeated readline()
        # calls and stops at end of file.
        dictionary = [line.strip() for line in handle]
    return np.array(dictionary)
---|
37 | |
---|
def get_vectors(filenames, dictionary, normalise=True):
    """
    Create vectors from a dictionary and populate the counts according to
    specified files.

    Each file is read as text, every character other than ASCII letters,
    digits, '-' and '_' is replaced by a space, and the result is handed to
    a CountVectorizer that uses `dictionary` as its fixed vocabulary.

    :param filenames: non-empty array-like of paths to existing files
    :param dictionary: non-empty array-like of vocabulary terms
    :param normalise: when True, scale every row to Euclidean length 1;
        rows without any vocabulary hit stay all-zero
    :return: 2-D numpy array with one row per file and one column per
        dictionary entry
    :raises AssertionError: if filenames or dictionary is None or empty
    :raises FileNotFoundError: (IOError on Python 2) if one of the paths is
        not an existing regular file
    """
    # Explicit raises instead of assert statements: the checks survive
    # ``python -O`` while callers still see an AssertionError. np.asarray
    # lets plain lists pass the emptiness check as well as numpy arrays.
    if filenames is None or np.asarray(filenames).size == 0:
        raise AssertionError(
            "Please provide a valid list of files as argument")

    for filename in filenames:
        if not os.path.isfile(filename):
            message = "Please provide a valid input file as argument"
            print(message)
            details = "{}: {!r}".format(message, filename)
            # FileNotFoundError does not exist on Python 2.
            if sys.version_info[0] >= 3:  # pylint: disable=no-else-raise
                raise FileNotFoundError(details)
            else:
                print(filename)
                raise IOError(details)

    if dictionary is None or np.asarray(dictionary).size == 0:
        raise AssertionError(
            "Please provide a valid dictionary as argument")

    # Raw string: same pattern as before, without invalid string escapes.
    non_word = re.compile(r'[^0-9a-zA-Z\-\_]')

    doc_strings = []
    for filename in filenames:
        # Read-only mode: nothing is written, so "r+" only added a needless
        # requirement for write permission on the input files.
        with open(filename, "r") as handle:
            doc_string = "".join(" " + line for line in handle)
        doc_strings.append(non_word.sub(' ', doc_string))
    doc_strings = np.array(doc_strings)

    vectorizer = CountVectorizer(vocabulary=dictionary)
    vectors = vectorizer.fit_transform(doc_strings).toarray()

    if normalise:
        norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
        # A document without any dictionary word has norm 0; dividing by 1
        # keeps that row at zero instead of turning it into NaN.
        norms[norms == 0] = 1
        vectors = vectors / norms
    return vectors
---|
80 | |
---|
81 | |
---|
def count_occurances(filename, dictionary, normalise=True):
    """
    Build the count vector of a single file for the given dictionary.

    The file name is wrapped in a one-element numpy array, passed to
    get_vectors together with dictionary and normalise, and the only row
    of the result is returned.
    """
    single_file = np.array([filename])
    vectors = get_vectors(single_file, dictionary, normalise=normalise)
    return vectors[0]
---|
91 | |
---|
92 | |
---|
93 | ### Copying ############################################################### |
---|
94 | |
---|
def copy_dictionary(dictionary):
    """
    Return an identical copy of a dictionary.

    :param dictionary: array-like of dictionary entries
    :return: new numpy array with the same contents; changing it does not
        affect the input
    """
    # np.array already copies its input, so wrapping it in np.copy only
    # duplicated the data a second time.
    return np.array(dictionary, copy=True)
---|
100 | |
---|
101 | |
---|
def copy_vector(vector):
    """
    Return an identical copy of a vector.

    :param vector: array-like of vector entries
    :return: new numpy array with the same contents; changing it does not
        affect the input
    """
    # np.array already copies its input, so wrapping it in np.copy only
    # duplicated the data a second time.
    return np.array(vector, copy=True)
---|
107 | |
---|
108 | |
---|
109 | ### Vector specific logic ################################################# |
---|
110 | |
---|
def create_vector_dictionary(dictionary):
    """
    Create a zero lookup dictionary for a given dictionary.

    :param dictionary: non-empty iterable of hashable entries
    :return: dict mapping every entry of dictionary to 0
    :raises AssertionError: if dictionary is None or empty
    """
    # Raised explicitly rather than with an assert statement so the check
    # is not stripped under ``python -O``; the exception type is unchanged.
    if dictionary is None or np.array(dictionary).size == 0:
        raise AssertionError("Please give a dictionary")
    return {word: 0 for word in dictionary}
---|
121 | |
---|
122 | |
---|
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    :param vec1: first vector (numpy array, list or tuple of numbers)
    :param vec2: second vector, of the same length as vec1
    :return: norm of the difference vec1 - vec2
    :raises AssertionError: if the two vectors differ in length
    """
    # Raised explicitly rather than with an assert statement so the check
    # is not stripped under ``python -O``; the exception type is unchanged.
    if len(vec1) != len(vec2):
        raise AssertionError("Vectors don't have the same sizes")

    # np.asarray makes the subtraction work for plain lists and tuples,
    # which do not support the ``-`` operator themselves.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
---|
133 | |
---|
134 | |
---|
def normalise_vector(vec):
    """
    Take a given vector and normalise each entry to get a Euclidean
    distance of 1 between the zero vector and the vector itself.

    :param vec: vector to normalise; a numpy array is expected, other
        sequences trigger a printed warning but are still processed
    :return: numpy array with vec divided by its norm; vec itself,
        unchanged, when the norm is 0 (this covers empty input); an empty
        numpy array when vec is None
    """

    if vec is None:
        print("Warning, None is not a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")

    if np.array(vec).size == 0:
        print("Warning, vector being normalised is empty")

    norm = np.linalg.norm(vec)
    if norm != 0:
        # np.asarray lets list/tuple input be divided; a plain list would
        # otherwise raise TypeError right after the warning above.
        vec = np.asarray(vec) / norm
    return vec
---|