LDA(Latent Dirichlet Allocation) でのトピック抽出

以下の形式の sample.csv からデータを取得し、scikit-learn (sklearn) の LDA でトピックを抽出する。

| id | text |
|----|------|
| 1 | 今日は晴れ。明日は雨 |
| 2 | 今日はカープが優勝した。 |
| ... | ... |
  • text2topic.py
#!/usr/bin/env python
# coding:utf-8

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import pickle
import os
import pandas as pd

import MeCab

# Sizing knobs for the corpus and the topic models.
n_samples = 2000     # number of CSV rows sliced off in the __main__ section
n_features = 1000    # passed as max_features to both vectorizers
n_components = 10    # number of topics requested from NMF and LDA
n_top_words = 20     # number of terms print_top_words handles per topic

# The next three names are not referenced anywhere in the visible code.
FETCHED_PAGES_DIR_NAME = 'fetched_pages'
QUERIES = [
    '胃もたれ',
    '虫歯',
    '花粉症対策',
    '鬱',
    '機械',
    '骨折',
    '肩こり',
    '書類',
]
NB_PKL_FILENAME = 'naive_bayes_classifier.pkl'

# Threshold compared against tf-idf weights in is_bigger_than_min_tfidf.
MIN_TFIDF = 0.001

# Pickle file names written by the __main__ section.
TFIDF_RESULT_PKL_FILENAME = 'tfidf_result.pkl'
TFIDF_VECTORIZER_PKL_FILENAME = 'tfidf_vectorizer.pkl'

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic of a fitted model.

    One line is printed per topic ("Topic #<index>: term term ..."),
    followed by a single blank line after the last topic.

    Args:
        model: Object exposing ``components_``; it is iterated row by row and
            each row must provide ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        n_top_words: Number of terms to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so slicing it backwards yields the indices
        # of the n_top_words largest weights, heaviest first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "Topic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

def is_bigger_than_min_tfidf(term, terms, tfidfs, min_tfidf=None):
    """Tell whether the tf-idf weight of ``term`` exceeds the minimum threshold.

    Intended for filters such as
    ``[term for term in terms if is_bigger_than_min_tfidf(term, terms, tfidfs)]``.

    Args:
        term: Term to look up.
        terms: Sequence of terms providing ``index()``; positions are expected
            to line up with ``tfidfs``.
        tfidfs: tf-idf weights, indexable by position.
        min_tfidf: Threshold the weight must strictly exceed. When omitted,
            the module-level ``MIN_TFIDF`` is used.

    Returns:
        True when the weight is strictly greater than the threshold,
        otherwise False.

    Raises:
        ValueError: If ``terms`` is a list and ``term`` is not in it
            (propagated from ``list.index``).
    """
    threshold = MIN_TFIDF if min_tfidf is None else min_tfidf
    # terms.index() is a linear scan, so calling this once per term of a
    # large vocabulary is quadratic overall.
    return bool(tfidfs[terms.index(term)] > threshold)

def tfidf(values):
    """Fit a TfidfVectorizer on ``values`` and return its output.

    Args:
        values: Iterable of documents; each one is tokenized by ``stems``.

    Returns:
        A ``(matrix, vectorizer)`` pair: the result of ``fit_transform`` and
        the fitted vectorizer itself.
    """
    documents = list(values)
    # The analyzer is a callable that takes a string and returns a list of
    # strings.
    vec = TfidfVectorizer(
        analyzer=stems,
        min_df=1,
        max_df=50,
        max_features=n_features,
    )
    matrix = vec.fit_transform(documents)
    # The __main__ section receives the matrix as tfidf_result.
    return matrix, vec

def countvec(values):
    """Fit a CountVectorizer on ``values`` and return its output.

    Args:
        values: Iterable of documents; each one is tokenized by ``stems``.

    Returns:
        A ``(matrix, vectorizer)`` pair: the result of ``fit_transform`` and
        the fitted vectorizer itself.
    """
    documents = list(values)
    # The analyzer is a callable that takes a string and returns a list of
    # strings.
    vec = CountVectorizer(
        analyzer=stems,
        min_df=1,
        max_df=50,
        max_features=n_features,
    )
    matrix = vec.fit_transform(documents)
    # The __main__ section receives the matrix as tf_result.
    return matrix, vec

def _split_to_words(text, to_stem=False):
    """Split Japanese text into words with MeCab.

    Example:
        input:  'すべて自分のほうへ'
        output: ['すべて', '自分', 'の', 'ほう', 'へ']

    Args:
        text: String handed to ``MeCab.Tagger.parse``.
        to_stem: When True, each word is replaced by the 7th feature field
            of its MeCab line (the uninflected form) unless that field is
            '*'; when False, the surface form is always used.

    Returns:
        List of words in the order MeCab emitted them.
    """
    tagger = MeCab.Tagger('mecabrc')  # any other Tagger would work as well
    mecab_result = tagger.parse(text)
    words = []
    for info in mecab_result.split('\n'):
        # MeCab output ends with an 'EOS' line followed by an empty string.
        if info == 'EOS' or info == '':
            continue
        # info => 'な\t助詞,終助詞,*,*,*,*,な,ナ,ナ'
        # Separate the surface form at the tab first, so that a surface that
        # itself contains a comma cannot shift the feature fields.
        surface, _, feature = info.partition('\t')
        info_elems = feature.split(',')
        # Field 6 holds the uninflected form; '*' (or a missing field) means
        # there is none, in which case the surface form is used.
        # NOTE(review): assumes an IPAdic-style feature layout - confirm for
        # the dictionary in use.
        base_form = info_elems[6] if len(info_elems) > 6 else '*'
        if to_stem and base_form != '*':
            # Convert to the stem.
            words.append(base_form)
        else:
            # Keep the word as it appeared.
            words.append(surface)
    return words

def words(text):
    """Return the words of ``text`` from ``_split_to_words`` without stemming."""
    return _split_to_words(text, to_stem=False)

def stems(text):
    """Return the words of ``text`` from ``_split_to_words`` with stemming on."""
    return _split_to_words(text, to_stem=True)

if __name__ == '__main__':
    # Script entry point: load the CSV, build tf-idf and raw-count features,
    # then fit two NMF variants and one LDA model and print their topics.
    #
    # NOTE(review): newer scikit-learn releases replaced NMF's `alpha` with
    # `alpha_W`/`alpha_H` and `get_feature_names()` with
    # `get_feature_names_out()`; adjust the calls below when running against
    # such a release.
    print("Loading dataset...")
    t0 = time()
    # NOTE(review): header=None makes pandas treat the first CSV row as data;
    # confirm that /tmp/sample.csv really has no header row.
    df = pd.read_csv('/tmp/sample.csv', header=None)
    y = df.values[:n_samples, 0]  # first column; not used further below
    texts = df.values[:n_samples, 1]  # second column: the documents
    print("done in %0.3fs." % (time() - t0))

    print("Extracting tf features for tfidf...")
    t0 = time()
    tfidf_result, tfidf_vectorizer = tfidf(texts)

    # Persist the tf-idf matrix and its vectorizer in the current directory.
    pkl_tfidf_result_path = os.path.join('.', TFIDF_RESULT_PKL_FILENAME)
    pkl_tfidf_vectorizer_path = os.path.join('.', TFIDF_VECTORIZER_PKL_FILENAME)

    with open(pkl_tfidf_result_path, 'wb') as f:
        pickle.dump(tfidf_result, f)
    with open(pkl_tfidf_vectorizer_path, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for countvec...")
    # Start the timer before the work; resetting it afterwards would always
    # report an elapsed time of roughly zero.
    t0 = time()
    tf_result, tf_vectorizer = countvec(texts)
    print("done in %0.3fs." % (time() - t0))

    # Fit the NMF model (Frobenius norm).
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model (generalized Kullback-Leibler divergence).
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
          "tf-idf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
              l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    # Same vectorizer as above, so the feature names can be reused.
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

