I tried writing an LSTM
Source: Python 機械学習プログラミング (Python Machine Learning), https://www.amazon.co.jp/dp/4295003379/
Extracting the tar archive

```python
import tarfile

# Extract the IMDb review archive into the current directory
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()
```
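The archive is the Large Movie Review Dataset. If it is not on disk yet, a minimal download sketch (my own addition, assuming the Stanford mirror at ai.stanford.edu is still available):

```python
import os
import urllib.request

# Fetch the archive only if it is not already present
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
if not os.path.exists('aclImdb_v1.tar.gz'):
    urllib.request.urlretrieve(url, 'aclImdb_v1.tar.gz')
```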
Building the dataset

```python
import os
import pandas as pd
from tqdm import tqdm_notebook

base_path = 'aclImdb'
labels = {'pos': 1, 'neg': 0}

df = pd.DataFrame()
for s in ('test', 'train'):
    for l in labels.keys():
        path = os.path.join(base_path, s, l)
        for file in tqdm_notebook(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
            df = df.append([[txt, labels[l]]], ignore_index=True)
# the later steps rely on these column names
df.columns = ['review', 'sentiment']
```
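As a quick sanity check (my own addition, not the book's code), the finished frame should hold all 50,000 reviews, evenly split between the two labels:

```python
print(df.shape)                        # expected: (50000, 2)
print(df['sentiment'].value_counts())  # expected: 25000 each for 1 and 0
```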
```python
import numpy as np
from string import punctuation
from collections import Counter
from sklearn.utils import shuffle

# Shuffle the data
df = shuffle(df).reset_index(drop=True)

# Build the word -> int mapping
counts = Counter()
for i, review in tqdm_notebook(enumerate(df['review'])):
    # Put spaces around punctuation and lowercase the text
    text = ''.join([c if c not in punctuation else ' ' + c + ' '
                    for c in review]).lower()
    df.loc[i, 'review'] = text
    counts.update(text.split())

word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}

mapped_reviews = []
for review in tqdm_notebook(df['review']):
    mapped_reviews.append([word_to_int[word] for word in review.split()])

# Left-pad with zeros (or truncate) every review to 200 words
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in tqdm_notebook(enumerate(mapped_reviews)):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]
```
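To make the padding/truncation behaviour concrete, here is a toy run of the same logic with made-up word IDs (values invented for illustration):

```python
import numpy as np

toy = [[3, 1, 4], [1, 5, 9, 2, 6, 5, 3, 5]]
seq_len = 5
padded = np.zeros((len(toy), seq_len), dtype=int)
for i, row in enumerate(toy):
    arr = np.array(row)
    padded[i, -len(row):] = arr[-seq_len:]
print(padded)
# [[0 0 3 1 4]     <- short review: zeros on the left
#  [2 6 5 3 5]]    <- long review: only the last seq_len IDs survive
```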
LSTM
```python
X_train = sequences[:25000, :]
y_train = df.loc[:25000 - 1, 'sentiment'].values  # .loc is inclusive, so stop at 24999
X_test = sequences[25000:, :]
y_test = df.loc[25000:, 'sentiment'].values

np.random.seed(123)

# Batch generator: trims the data to a whole number of batches,
# then yields (x, y) pairs, or x alone when y is None
def create_batch_generator(x, y=None, batch_size=64):
    n_batch = len(x) // batch_size
    x = x[:n_batch * batch_size]
    if y is not None:
        y = y[:n_batch * batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii + batch_size], y[ii:ii + batch_size]
        else:
            yield x[ii:ii + batch_size]
```
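A quick way to exercise the generator on its own (my own sanity check; the shapes assume the arrays built above):

```python
# First batch of 100 reviews, each padded to 200 word IDs
batch_x, batch_y = next(create_batch_generator(X_train, y_train, batch_size=100))
print(batch_x.shape, batch_y.shape)  # (100, 200) (100,)
```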
```python
import tensorflow as tf


class SentimentRNN(object):
    def __init__(self, n_words, seq_len=200, lstm_size=256, num_layers=1,
                 batch_size=64, learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size

        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()

    def build(self):
        # Placeholders
        tf_x = tf.placeholder(tf.int32, shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32, shape=(self.batch_size), name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32, name='tf_keepprob')

        # Embedding layer
        embedding = tf.Variable(
            tf.random_uniform((self.n_words, self.embed_size),
                              minval=-1, maxval=1))
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embeded_x')

        # Stack of LSTM cells with dropout on the outputs
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)])

        # Define the initial state
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print(' << initial state >> ', self.initial_state)

        lstm_outputs, self.final_state = \
            tf.nn.dynamic_rnn(cells, embed_x,
                              initial_state=self.initial_state)
        print('\n << lstm_output >> ', lstm_outputs)
        print('\n << final_state >> ', self.final_state)

        # Classify from the last output of the sequence only
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1], units=1,
                                 activation=None, name='logits')
        logits = tf.squeeze(logits, name='logits_squeeze')
        print('\n << logits >>', logits)

        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32, name='labels')
        }
        print('\n << predictions >>', predictions)

        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y,
                                                    logits=logits),
            name='cost')

        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')

    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', self.final_state],
                        feed_dict=feed)
                    if iteration % 20 == 0:
                        print("Epoch: %d / %d Iteration: %d | Train loss: %.5f"
                              % (epoch + 1, num_epochs, iteration, loss))
                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)

    def predict(self, X_data, return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(sess, tf.train.latest_checkpoint('./model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(create_batch_generator(
                    X_data, None, batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state], feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state], feed_dict=feed)
                preds.append(pred)
        return np.concatenate(preds)
```
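This class relies on the TF 1.x tf.contrib API, which was removed in TensorFlow 2. Purely for orientation, here is a rough Keras sketch of the same layer stack (my own approximation, not code from the book):

```python
import tensorflow as tf

# Rough TF2/Keras equivalent of the graph above. Assumption: Dropout after
# the LSTM mirrors DropoutWrapper's output_keep_prob=0.5 during training.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=n_words, output_dim=256),
    tf.keras.layers.LSTM(128),   # returns only the last output of the sequence
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer=tf.keras.optimizers.Adam(1e-3),
              loss='binary_crossentropy', metrics=['accuracy'])
```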
train+test
```python
n_words = max(list(word_to_int.values())) + 1  # +1 for the zero padding index

rnn = SentimentRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=256,
                   lstm_size=128,
                   num_layers=1,
                   batch_size=100,
                   learning_rate=0.001)
```
```
 << initial state >>  (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

 << lstm_output >>  Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

 << final_state >>  (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

 << logits >> Tensor("logits_squeeze:0", shape=(100,), dtype=float32)

 << predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
```
```python
rnn.train(X_train, y_train, num_epochs=40)
```
```
Epoch: 1 / 40 Iteration: 20 | Train loss: 0.68427
Epoch: 1 / 40 Iteration: 40 | Train loss: 0.64362
Epoch: 1 / 40 Iteration: 60 | Train loss: 0.67976
Epoch: 1 / 40 Iteration: 80 | Train loss: 0.60497
...
```
```python
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print(np.sum(preds == y_true) / len(y_true))  # test accuracy
```
```
INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt
0.854
```
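So the model ends up at roughly 85% test accuracy. Note that predict can also return the raw sigmoid outputs instead of hard 0/1 labels, which is handy for thresholding or ranking (a small usage sketch of the class above):

```python
# Class probabilities instead of 0/1 labels
proba = rnn.predict(X_test, return_proba=True)
print(proba[:5])  # values in [0, 1]; the exact numbers depend on training
```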