source: git/machine_learning/common/keyword_vector.py @ 1a034b

spielwiese
Last change on this file since 1a034b was 1a034b, checked in by Murray Heymann <heymann.murray@…>, 5 years ago
Write basic predictor
  • Property mode set to 100755
File size: 3.0 KB
Line 
1#!/usr/bin/python3
2
3import math
4import os
5import re
6import sys
7
8### Read from file ########################################################
9
def read_dictionary(filename):
    """Load a keyword list from a text file, one entry per line.

    Each line of the file becomes one list entry with leading and
    trailing whitespace stripped; blank lines are kept as empty strings.

    If *filename* is not an existing regular file, a message is printed
    and an empty list is returned.
    """
    if os.path.isfile(filename):
        with open(filename, "r") as handle:
            return [entry.strip() for entry in handle]

    print("Please provide a valid input file as argument")
    return []
24
25
def count_occurances(filename, dictionary):
    """Count how often each dictionary keyword occurs in a file.

    Every line of *filename* is split into words at any character that
    is not an ASCII letter, a digit, '-' or '_'.  A word is counted only
    when it is exactly equal to one of the keywords.

    Returns a dict mapping every keyword in *dictionary* to its number of
    occurrences.  If *filename* is not an existing regular file, or
    *dictionary* is empty, a message is printed and an empty dict is
    returned.
    """
    if not os.path.isfile(filename):
        print("Please provide a valid input file as argument")
        return {}
    if not dictionary:
        print("Please provide a valid dictionary as argument")
        return {}

    vector = create_vector(dictionary)
    # Raw string: '\-' and '\_' are invalid escapes in a normal literal.
    separators = re.compile(r"[^0-9a-zA-Z_-]")

    # The file is only read, so open it read-only; "r+" would needlessly
    # require write permission and fail on write-protected inputs.
    with open(filename, "r") as file:
        for line in file:
            for word in separators.sub(" ", line).split():
                if word in vector:
                    vector[word] += 1
    return vector
46
47
48### Copying ###############################################################
49
def copy_dictionary(dictionary):
    """Return a new list holding the entries of *dictionary* in order."""
    return list(dictionary)
55
56
def copy_vector(vector):
    """Return a new dict with the same keys and values as *vector*."""
    return {word: vector[word] for word in vector.keys()}
62
63
64### Vector specific logic #################################################
65
def create_vector(dictionary):
    """Return a dict mapping every word in *dictionary* to a count of 0."""
    return dict.fromkeys(dictionary, 0)
71
72
def vector_distance(vec1, vec2):
    """Return the Euclidean distance between two normalised vectors.

    Both arguments are copied and the copies are normalised, so the
    caller's vectors are left untouched.  If the two vectors do not have
    exactly the same set of keys, a message is printed and -1 is
    returned.
    """
    if set(vec1.keys()) != set(vec2.keys()):
        print("Dictionaries don't have the same keys")
        return -1

    # Work on copies: normalise_vector modifies its argument in place.
    unit1 = copy_vector(vec1)
    unit2 = copy_vector(vec2)
    normalise_vector(unit1)
    normalise_vector(unit2)

    squared = 0
    for key, component in unit1.items():
        squared += (component - unit2[key]) ** 2

    return math.sqrt(squared)
90
91
def normalise_vector(vec):
    """Scale *vec* in place so that it has Euclidean length 1.

    *vec* is a dict of numeric values; each value is divided by the
    square root of the sum of all squared values.  A vector whose values
    are all zero has no direction and cannot be scaled, so its entries
    are set to 0.0 instead of raising ZeroDivisionError.

    The dict is modified in place and nothing is returned.
    """
    norm = math.sqrt(sum(value * value for value in vec.values()))

    if norm == 0:
        # All-zero (or empty) vector: keep it zero, but as floats so the
        # value type matches that of a normalised vector.
        for key in vec:
            vec[key] = 0.0
        return

    # Only values are reassigned, so iterating the dict here is safe.
    for key in vec:
        vec[key] = vec[key] / norm
101
102
if __name__ == '__main__':
    # Manual smoke test: expects the path of a dictionary file as the
    # only command-line argument.
    if len(sys.argv) != 2:
        print("Usage: ")
        print(f"{sys.argv[0]} <dict_name>")
        sys.exit(1)

    keywords = read_dictionary(sys.argv[1])

    # Normalising happens in place, so print the dict afterwards.
    sample = {"hello": 3, "bye": 4}
    normalise_vector(sample)
    print(f"normalised vector: {sample}")

    first = {"hello": 3, "bye": 4}
    second = {"hello": 4, "bye": 3}
    print(f"distance same vector: {vector_distance(first, first)}")
    print(f"distance different vector: {vector_distance(first, second)}")
    # Print the inputs again after the distance calls.
    print(first)
    print(second)

    print(count_occurances("Singular/table.h", keywords))
Note: See TracBrowser for help on using the repository browser.