-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvocab.py
109 lines (68 loc) · 2.86 KB
/
vocab.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 20:18:30 2014
@author: nicolai
An object to act as the "linear projection" layer in an unsupervised neural network
For use in semi-supervised representation learning
"""
import numpy as np
from random import choice
# Construct a look-up table containing the vector representations of the vocab
# It's a matrix of dimension p x |V|, where p is an arbitrary choice of
# dimensionality (should be subject to cross validation)
class vocabulary:
    """
    Look-up table of vector representations for a vocabulary of triplet tokens.

    Init this class with a list of triplet tokens and a dimensionality p.
    Each token gets a numerical ID (its position in the list) and a column of
    the p x |V| matrix W. The object can be queried with a list of triplets,
    which returns a concatenated vector of their representations together
    with a corrupted copy in which the middle triplet was swapped for a
    randomly chosen different one.

    Attributes:
        p:      number of rows of W, i.e. the length of one representation
        W:      numpy array of shape (p, len(vocab)); column i represents
                the token with ID i
        lookup: dict mapping token -> ID
        IDs:    IDs of the triplets passed to the most recent retrieve()
                call; consumed by update()
    """

    def __init__(self, vocab, p):
        """
        Build the representation matrix and the token -> ID table.

        vocab: indexable sequence of hashable tokens. If a token occurs more
               than once, the last position wins as its ID.
               NOTE(review): retrieve() upper-cases its queries before the
               look-up, so tokens are presumably expected to be upper-case
               already - confirm against callers.
        p:     dimensionality of each representation vector
        """
        self.p = p

        # Matrix of triplet vectors
        # Each column is a representation of a triplet,
        # initialised uniformly at random in [0, 1)
        self.W = np.random.random((p, len(vocab)))

        # Set up vocabulary look-up table
        self.lookup = {token: i for i, token in enumerate(vocab)}

        # List of IDs of previously retrieved triplet vectors
        self.IDs = []

    def _concatenate(self, ids):
        """Return the bias term 1 followed by the W columns for ids, in order."""
        # W[:, ids] has shape (p, len(ids)); transposing before flattening
        # lays the columns out one after another.
        return np.concatenate(([1.0], self.W[:, ids].T.ravel()))

    def retrieve(self, list_of_triplets):
        """
        Give a list of triplets, get a concatenated vector of representations.

        list_of_triplets: iterable of strings; each one is upper-cased and
                          looked up in the vocabulary.

        Returns a tuple (x, x_hat) of 1-D numpy arrays of length
        1 + p * len(list_of_triplets). Both start with a bias term of 1.
        x holds the representations of the given triplets; x_hat is the same
        except that the triplet at position len // 2 has been replaced by a
        randomly drawn vocabulary entry with a different ID.

        Side effect: stores the looked-up IDs in self.IDs for update().

        Raises:
            KeyError:   a triplet is not in the vocabulary
            IndexError: list_of_triplets is empty, or the vocabulary holds
                        no other ID to substitute for the middle triplet
        """
        # Reset first so that a failed look-up never leaves stale IDs
        # behind for a later update() call.
        self.IDs = []
        ids = [self.lookup[trip.upper()] for trip in list_of_triplets]
        if not ids:
            raise IndexError("retrieve() needs at least one triplet")
        self.IDs = ids

        # Randomly sample the middle triplet to make x_hat.
        # Floor division keeps the index an int on Python 3 as well.
        middle = len(ids) // 2
        proposals = [i for i in self.lookup.values() if i != ids[middle]]
        if not proposals:
            raise IndexError(
                "vocabulary has no alternative token to sample for x_hat")
        ids_hat = list(ids)
        ids_hat[middle] = choice(proposals)

        # Concatenate the output vectors, bias prepended
        return (self._concatenate(ids), self._concatenate(ids_hat))

    def update(self, updated_vector):
        """
        After adjusting the vector representations with backpropagation
        update the representation matrix W.

        updated_vector: 1-D sequence laid out like the x returned by
                        retrieve(): a bias term followed by one chunk of
                        length p per ID in self.IDs. Elements beyond that
                        are ignored.

        If the same ID occurs several times in self.IDs, the chunk of its
        last occurrence is the one that ends up in W.

        Raises:
            ValueError: updated_vector is too short to hold a chunk for
                        every ID in self.IDs (W is left untouched)
        """
        # Remove bias
        new_vec = updated_vector[1:]

        # Check the length up front so W is never left half-updated.
        needed = self.p * len(self.IDs)
        if len(new_vec) < needed:
            raise ValueError(
                "updated_vector holds %d values after the bias, expected "
                "at least %d" % (len(new_vec), needed))

        # Each triplet is a substring of length p in this vector
        for position, ID in enumerate(self.IDs):
            start = position * self.p
            # Update the W column corresponding to this ID
            self.W[:, ID] = np.asarray(new_vec[start:start + self.p])