source: git/machine_learning/common/keyword_vector.py @ 112c79

spielwiese
Last change on this file since 112c79 was 112c79, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Use sklearn to count keyword occurrences
  • Property mode set to 100755
File size: 4.3 KB
Line 
1#!/usr/bin/python3
2
3"""Some vector logic"""
4
5import os
6import re
7import sys
8import numpy as np
9from sklearn.feature_extraction.text import CountVectorizer
10
11from common.constants import KEYWORDS_FILE
12
13### Read from file ########################################################
14
def read_dictionary(filename=KEYWORDS_FILE):
    """
    Read a dictionary saved as a textfile

    Every line of the file becomes one entry, with leading and trailing
    whitespace stripped.  Blank lines are kept as empty-string entries.

    Args:
        filename: path of the text file to read; defaults to KEYWORDS_FILE.

    Returns:
        A numpy array holding one entry per line of the file.

    Raises:
        FileNotFoundError: if filename does not refer to an existing file.
    """
    if not os.path.isfile(filename):
        # Carry the diagnostic (and the offending path) in the exception
        # itself instead of printing it to stdout.
        raise FileNotFoundError(
            "Please provide a valid input file as argument to read "
            "dictionary: " + str(filename))

    with open(filename, "r") as file:
        dictionary = [line.strip() for line in file]
    return np.array(dictionary)
37
def get_vectors(filenames, dictionary, normalise=True):
    """
    Create vectors from a dictionary and populate the counts according to
    specified files

    Each file is read, every character other than ASCII letters, digits,
    '-' and '_' is replaced by a space, and the result is handed to a
    CountVectorizer whose vocabulary is fixed to the given dictionary.

    Args:
        filenames: array (or other sequence) of paths of the files to count.
        dictionary: array (or other sequence) of keywords used as vocabulary.
        normalise: if True, scale every row to Euclidean length 1.  Rows
            without any keyword hit are left as all zeros.

    Returns:
        A 2-d numpy array with one row per file and one column per
        dictionary entry.

    Raises:
        AssertionError: if filenames or dictionary is None or empty.
        FileNotFoundError: if one of the given paths is not an existing file.
    """
    assert filenames is not None, \
            "Please provide a valid list of files as argument"
    # Coerce once so that plain lists work as well as numpy arrays.
    file_list = np.atleast_1d(filenames)
    assert not file_list.size == 0, \
            "Please provide a valid list of files as argument"
    for filename in file_list:
        if not os.path.isfile(filename):
            raise FileNotFoundError(
                "Please provide a valid input file as argument: "
                + str(filename))
    assert dictionary is not None, \
            "Please provide a valid dictionary as argument"
    assert not np.asarray(dictionary).size == 0, \
            "Please provide a valid dictionary as argument"

    doc_strings = []
    for filename in file_list:
        # Read-only access is sufficient; "r+" would reject read-only files.
        with open(filename, "r") as file:
            doc_string = "".join(" " + line for line in file)

        doc_string = re.sub(r'[^0-9a-zA-Z\-\_]', ' ', doc_string)
        doc_strings.append(doc_string)
    doc_strings = np.array(doc_strings)

    vectorizer = CountVectorizer(vocabulary=dictionary)
    vectors = vectorizer.fit_transform(doc_strings).toarray()
    if normalise:
        norms = np.sqrt((vectors ** 2).sum(-1))[..., np.newaxis]
        # A file without any keyword has norm 0; dividing by 1 instead keeps
        # its row at zero rather than turning it into NaN.
        norms[norms == 0] = 1
        vectors = vectors / norms
    return vectors
80
81
def count_occurances(filename, dictionary, normalise=True):
    """
    Build the keyword count vector of a single file.

    Wraps the filename in a one-element array, delegates to get_vectors
    and returns the first (and only) row of its result.
    """
    single_file = np.array([filename])
    vectors = get_vectors(single_file, dictionary, normalise=normalise)
    return vectors[0]
91
92
93### Copying ###############################################################
94
def copy_dictionary(dictionary):
    """
    Return an identical copy of a dictionary

    Args:
        dictionary: numpy array (or other sequence) of dictionary entries.

    Returns:
        A new numpy array with the same contents; changing it does not
        affect the argument.
    """
    # np.array already copies its input, so one call is enough; wrapping it
    # in np.copy as well would only duplicate the data a second time.
    return np.array(dictionary, copy=True)
100
101
def copy_vector(vector):
    """
    Return an identical copy of a vector

    Args:
        vector: numpy array (or other sequence) of numbers.

    Returns:
        A new numpy array with the same contents; changing it does not
        affect the argument.
    """
    # np.array already copies its input, so one call is enough; wrapping it
    # in np.copy as well would only duplicate the data a second time.
    return np.array(vector, copy=True)
107
108
109### Vector specific logic #################################################
110
def create_vector_dictionary(dictionary):
    """
    Build a lookup table that maps every dictionary entry to a count of 0.

    Raises an AssertionError when the dictionary is None or empty.
    """
    assert dictionary is not None, "Please give a dictionary"
    assert np.array(dictionary).size != 0, "Please give a dictionary"
    return dict.fromkeys(dictionary, 0)
121
122
def vector_distance(vec1, vec2):
    """
    Calculate the Euclidean distance between two vectors.

    Args:
        vec1: numpy array (or other sequence) of numbers.
        vec2: numpy array (or other sequence) of numbers, as long as vec1.

    Returns:
        The norm of the difference vec1 - vec2 as computed by
        np.linalg.norm.

    Raises:
        AssertionError: if the two vectors differ in length.
    """
    assert len(vec1) == len(vec2), \
            "Vectors don't have the same sizes"

    # Coerce to arrays so plain lists and tuples can be subtracted as well.
    return np.linalg.norm(np.asarray(vec1) - np.asarray(vec2))
133
134
def normalise_vector(vec):
    """
    Scale a vector so that its Euclidean norm becomes 1.

    None yields an empty array (after printing a warning).  Inputs that are
    not numpy arrays, or that are empty, only trigger a printed warning.  A
    vector whose norm is 0 is handed back unchanged.
    """
    if vec is None:
        print("Warning, None is not a valid vector")
        print("Returning empty vector by default")
        return np.array([])

    if not isinstance(vec, np.ndarray):
        print("Warning, vector should be a numpy array")

    if np.size(vec) == 0:
        print("Warning, vector being normalised is empty")

    length = np.linalg.norm(vec)
    # Dividing by a zero norm is skipped; the input is returned as it came.
    return vec if length == 0 else vec / length
Note: See TracBrowser for help on using the repository browser.