Changeset 05dbb13 in git


Ignore:
Timestamp:
Jul 29, 2019, 5:43:56 PM (5 years ago)
Author:
Murray Heymann <heymann.murray@…>
Branches:
spielwiese (17f1d200f27c5bd38f5dfc6e8a0879242279d1d8)
Children:
c1b9ababa50e8059dd847da0957f50aea901e6a8
Parents:
9f0f63787f503527775273cff2d354bd558c9cda
Message:
Fix some errors, alphabetise keyword list
Location:
machine_learning
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • machine_learning/common/keyword_vector.py

    r9f0f63 r05dbb13  
    33"""Some vector logic"""
    44
     5import numpy as np
    56import os
    67import re
    78import sys
    8 import numpy as np
     9import traceback
     10
     11from common.constants import KEYWORDS_FILE
    912
    1013### Read from file ########################################################
    1114
    12 def read_dictionary(filename):
     15def read_dictionary(filename=KEYWORDS_FILE):
    1316    """
    1417    Read a dictionary saved as a textfile
     
    2629            dictionary.append(line.strip())
    2730            line = file.readline()
    28     return dictionary
     31    return np.array(dictionary)
    2932
    3033
     
    3740        print("Please provide a valid input file as argument")
    3841        return []
    39     if dictionary == []:
     42    if dictionary.size == 0:
    4043        print("Please provide a valid dictionary as argument")
    4144        return []
     
    110113    distance of 1 between the zero vector and the vector itself.
    111114    """
    112     vec = vec / np.linalg.norm(vec)
     115
     116    if vec is None:
     117        print("Please provide a valid vector")
     118        print("Returning empty vector by default")
     119        return np.array([])
     120
     121    if not isinstance(vec, np.ndarray):
     122        print("Warning, vector should be a numpy array")
     123    norm = np.linalg.norm(vec)
     124    if not norm == 0:
     125        vec = vec / norm
    113126    return vec
    114127
     
    118131    Run some basic tests
    119132    """
    120     if len(sys.argv) != 2:
    121         print("Usage: ")
    122         print(sys.argv[0] + " <dict_name>")
    123         sys.exit(1)
    124133
    125134    testvector = np.array([3, 4])
     
    136145    print(vector1)
    137146    print(vector2)
     147    print()
    138148
    139     dic = read_dictionary(sys.argv[1])
     149    print("Attempt to normalise the zero vector")
     150    print(normalise_vector(np.array([0,0,0,0,0])))
     151    print()
     152
     153    print("Attempt to normalise list")
     154    print(normalise_vector([3,4,0,0,0]))
     155    print()
     156
     157    print("Attempt to normalise empty vector")
     158    print(normalise_vector(np.array([])))
     159    print()
     160
     161    print("Attempt to normalise None")
     162    print(normalise_vector(None))
     163    print()
     164
     165    if len(sys.argv) == 2:
     166        dic = read_dictionary(filename=sys.argv[1])
     167    else:
     168        dic = read_dictionary()
     169    print("vector of ../Singular/table.h")
    140170    print(count_occurances("../Singular/table.h", dic))
    141171
  • machine_learning/common/lookuptable.py

    r9f0f63 r05dbb13  
    4949    'keywords.txt'
    5050    """
     51    # extract keywords using the singular script
    5152    os.system(SINGULAR_BIN + " " + EXTRACT_SCRIPT)
    5253
     54    # read from the file created by singular
     55    dictionary = read_dictionary()
     56    print(dictionary)
    5357
    54 def create_table(dictionary=None):
     58    # sort alphabetically
     59    dictionary = np.sort(dictionary)
     60    print(dictionary)
     61
     62    # write back to the same file
     63    with open(KEYWORDS_FILE, "w") as file:
     64        for word in dictionary:
     65            file.write(word + "\n")
     66
     67
     68
     69def create_table(dictionary=None, attempt_cached=True):
    5570    """
    5671    Get a list of helpfiles, and generate a word occurance vector for each.
     
    6075    vectors = []
    6176
    62     if not os.path.isfile(VECTORS_NPY) or not os.path.isfile(HELPFILE_NPY):
     77    if not os.path.isfile(VECTORS_NPY) or \
     78            not os.path.isfile(HELPFILE_NPY) or \
     79            not attempt_cached:
    6380        file_list = np.array(get_list_of_htm_files())
    6481        np.save(HELPFILE_NPY, file_list)
     
    90107        print(file)
    91108    extract_keywords()
    92     vectors, files = create_table()
     109    vectors, files = create_table(attempt_cached=False)
     110    vectors1, files1 = create_table()
     111
     112    if not (vectors == vectors1).all():
     113        print("Cached version differs from original version")
     114    elif not (files == files1).all():
     115        print("Cached version differs from original version")
     116    else:
     117        print("Cached version corresponds with original")
     118
    93119    dictionary = read_dictionary(KEYWORDS_FILE)
    94120    test_vec = count_occurances(os.path.join(HELP_FILE_PATH, "html",
    95121                                             files[1]), dictionary)
    96     print((test_vec == vectors[1]).all())
     122    print((test_vec==vectors[1]).all())
    97123
    98124
  • machine_learning/extract.lib

    r9f0f63 r05dbb13  
    55int i;
    66
     7// combine lists, leaving out the first entry of l, which is "l"
     8list combined = delete(l, 1) + k;
     9
    710// create file, overwrite if exists
    8 write(":w keywords.txt", l[1]);
     11write(":w keywords.txt", combined[1]);
    912
    10 // write entries from "names" procedure
    11 for (i = 1; i < size(l); i++) {
    12         write(":a keywords.txt", l[i+1]);
    13 }
    1413
    15 // write entries from "reservedNameList" procedure
    16 for (i = 1; i < size(k) + 1; i++) {
    17         write(":a keywords.txt", l[i]);
     14// write remaining entries to file
     15for (i = 2; i < size(combined) + 1; i++) {
     16        write(":a keywords.txt", combined[i]);
    1817}
    1918
  • machine_learning/model/predictor.py

    r9f0f63 r05dbb13  
    117117    print(prediction)
    118118
     119
     120    test_vec = count_occurances("test.txt", dictionary)
     121    start = time.time()
     122    prediction = predictor.predict(np.array([test_vec]))
     123    end = time.time()
     124    print(end - start, "seconds to make prediction")
     125    print(prediction)
     126
    119127if __name__ == '__main__':
    120128    cProfile.run("main()")
Note: See TracChangeset for help on using the changeset viewer.