I tried writing an LSTM

Source: Python 機械学習プログラミング (Python Machine Learning) https://www.amazon.co.jp/dp/4295003379/

Extracting the tar archive

import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()
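
Once extracted, aclImdb/ should contain train/ and test/ directories, each with pos/ and neg/ subfolders holding one text file per review. A quick sanity check of the layout (my own addition, not from the book):

import os

# the IMDb archive extracts to aclImdb/{train,test}/{pos,neg}
for split in ('train', 'test'):
    for label in ('pos', 'neg'):
        path = os.path.join('aclImdb', split, label)
        print(path, len(os.listdir(path)), 'files')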

The dataset (aclImdb_v1.tar.gz) is the Large Movie Review Dataset from ai.stanford.edu.

Building the dataset

import pandas as pd
import numpy as np
import os
from tqdm import tqdm_notebook

base_path = 'aclImdb'
labels = {'pos':1, 'neg':0}

df = pd.DataFrame()
for s in ('test', 'train'):
    for l in labels.keys():
        path = os.path.join(base_path, s, l)
        for file in tqdm_notebook(os.listdir(path)):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)
df.columns = ['review', 'sentiment']
# shuffle the data
from sklearn.utils import shuffle
df = shuffle(df).reset_index(drop=True)
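
With the column names set, it is worth confirming that all 50,000 reviews were read and that the labels are balanced (a quick check I added; 25,000 per class is expected):

print(df.shape)                        # expect (50000, 2)
print(df['sentiment'].value_counts())  # expect 25000 each for labels 0 and 1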

# word_to_int (build the vocabulary)
from string import punctuation
from collections import Counter

counts = Counter()
for i, review in tqdm_notebook(enumerate(df['review'])):
    text = ''.join([c if c not in punctuation else ' '+c+' ' for c in review]).lower()
    df.loc[i, 'review'] = text
    counts.update(text.split())

word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}
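
Note that enumerate(word_counts, 1) starts the indices at 1 so that 0 stays free for padding later. A toy illustration of the same mapping logic (hypothetical words, not the real vocabulary):

from collections import Counter

toy_counts = Counter('the movie was the best movie'.split())
toy_sorted = sorted(toy_counts, key=toy_counts.get, reverse=True)
toy_word_to_int = {word: ii for ii, word in enumerate(toy_sorted, 1)}
print(toy_word_to_int)  # e.g. {'the': 1, 'movie': 2, 'was': 3, 'best': 4}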

mapped_reviews = []
for review in tqdm_notebook(df['review']):
    mapped_reviews.append([word_to_int[word] for word in review.split()])

# left-pad each sequence with zeros to sequence_length = 200 tokens
sequence_length = 200
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)

for i, row in tqdm_notebook(enumerate(mapped_reviews)):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]
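
Because the assignment writes into sequences[i, -len(row):], short reviews end up left-padded with zeros and longer reviews keep only their last 200 tokens. A small worked example (toy values, not from the dataset):

import numpy as np

toy = np.zeros((1, 8), dtype=int)   # use sequence_length = 8 for illustration
row = [5, 3, 7]                     # a "review" of three token ids
toy[0, -len(row):] = np.array(row)[-8:]
print(toy)  # [[0 0 0 0 0 5 3 7]]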

LSTM

# first 25,000 reviews for training, the rest for testing
X_train = sequences[:25000, :]
y_train = df.iloc[:25000]['sentiment'].values
X_test = sequences[25000:, :]
y_test = df.iloc[25000:]['sentiment'].values

np.random.seed(123)

# batch generator
def create_batch_generator(x, y=None, batch_size=64):
    n_batch = len(x) // batch_size
    x = x[:n_batch*batch_size]
    if y is not None:
        y = y[:n_batch*batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        else:
            yield x[ii:ii+batch_size]
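
A quick check on toy arrays shows that the generator drops any remainder smaller than a full batch (illustrative values only):

import numpy as np

x_demo = np.arange(10).reshape(-1, 1)
y_demo = np.arange(10)
for bx, by in create_batch_generator(x_demo, y_demo, batch_size=4):
    print(bx.ravel(), by)
# -> [0 1 2 3] [0 1 2 3]
# -> [4 5 6 7] [4 5 6 7]
# (the last 2 samples are dropped because they do not fill a batch)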
import tensorflow as tf

class SentimentRNN(object):
    def __init__(self, n_words, seq_len=200,
                lstm_size=256, num_layers=1, batch_size=64,
                learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  # number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
            
    def build(self):
        # placeholders
        tf_x = tf.placeholder(tf.int32,
                             shape=(self.batch_size, self.seq_len),
                             name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                             shape=(self.batch_size),
                             name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                             name='tf_keepprob')
        
        # embedding matrix: maps each word id to a trainable embed_size-dimensional vector
        embedding = tf.Variable(tf.random_uniform((self.n_words, self.embed_size), minval=-1, maxval=1))
        
        embed_x = tf.nn.embedding_lookup(embedding, tf_x, name='embeded_x')
        
        # stack num_layers LSTM cells, each with dropout applied to its outputs
        cells = tf.contrib.rnn.MultiRNNCell(
            [tf.contrib.rnn.DropoutWrapper(
                tf.contrib.rnn.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)
            ]
        )
        
        # define the initial state
        
        self.initial_state = cells.zero_state(self.batch_size, tf.float32)
        print('   << initial state >>  ', self.initial_state)
        
        lstm_outputs, self.final_state = \
            tf.nn.dynamic_rnn(cells, embed_x, initial_state=self.initial_state)
            
        print('\n  << lstm_output  >>  ', lstm_outputs)
        print('\n  << final_state  >>  ', self.final_state)
        
        # dense layer on the last time step's output: one logit per review
        logits = tf.layers.dense(inputs=lstm_outputs[:, -1],
                                units=1, activation=None, name='logits')
        logits = tf.squeeze(logits, name='logits_squeeze')
        
        print('\n  <<  logits          >>', logits)
        
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32, name='labels')
        }
        
        print('\n  <<  predictions      >>', predictions)
        
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=tf_y,
                                                   logits=logits),
            name='cost'
        )
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')
    
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                
                for batch_x, batch_y in create_batch_generator(X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                           'tf_y:0': batch_y,
                           'tf_keepprob:0': 0.5,
                           self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op', self.final_state],
                        feed_dict=feed
                    )
                    
                    if iteration % 20 == 0:
                        print("Epoch: %d / %d Iteration: %d | Train loss: %.5f"
                             % (epoch + 1, num_epochs, iteration, loss))
                    iteration += 1
                
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess, "model/sentiment-%d.ckpt" % epoch)
    
    def predict(self, X_data,  return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            self.saver.restore(sess, tf.train.latest_checkpoint('./model/'))
            test_state = sess.run(self.initial_state)

            for ii, batch_x in enumerate(create_batch_generator(X_data, None, batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                           'tf_keepprob:0': 1.0,
                           self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(['probabilities:0', self.final_state],
                                               feed_dict=feed)
                else:
                    pred, test_state = sess.run(['labels:0', self.final_state],
                                               feed_dict=feed)
                preds.append(pred)

            return np.concatenate(preds)

train+test

n_words = max(list(word_to_int.values())) + 1
rnn = SentimentRNN(
    n_words=n_words,
    seq_len=sequence_length,
    embed_size=256,
    lstm_size=128,
    num_layers=1,
    batch_size=100,
    learning_rate=0.001
)
   << initial state >>   (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)

  << lstm_output  >>   Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)

  << final_state  >>   (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)

  <<  logits          >> Tensor("logits_squeeze:0", shape=(100,), dtype=float32)

  <<  predictions      >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
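
One practical caveat before calling train(): saver.save() writes checkpoints to model/sentiment-<epoch>.ckpt, and this may fail if the model/ directory does not already exist, so it is safest to create it first (a precaution I added, not part of the book's code):

import os
os.makedirs('model', exist_ok=True)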
rnn.train(X_train, y_train, num_epochs=40)
Epoch: 1 / 40 Iteration: 20 | Train loss: 0.68427
Epoch: 1 / 40 Iteration: 40 | Train loss: 0.64362
Epoch: 1 / 40 Iteration: 60 | Train loss: 0.67976
Epoch: 1 / 40 Iteration: 80 | Train loss: 0.60497
...
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print(np.sum(preds == y_true) / len(y_true))
INFO:tensorflow:Restoring parameters from ./model/sentiment-39.ckpt
0.854