Topic Extraction with LDA (Latent Dirichlet Allocation)
Read data from a sample.csv in the following format and extract topics with sklearn's LDA.
| id | text |
|----|------|
| 1 | 今日は晴れ。明日は雨 |
| 2 | 今日はカープが優勝した。 |
| ... | ... |
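For reference, the raw `/tmp/sample.csv` that the script reads is assumed to look like this: comma-separated, one document per row, with no header line (matching the `header=None` in the script below; if your file does start with an `id,text` header row, pass `header=0` to `read_csv` instead):

```text
1,今日は晴れ。明日は雨
2,今日はカープが優勝した。
```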
- text2topic.py
```python
#!/usr/bin/env python
# coding:utf-8
from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import pickle
import os
import pandas as pd
import MeCab

n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

MIN_TFIDF = 0.001
TFIDF_RESULT_PKL_FILENAME = 'tfidf_result.pkl'
TFIDF_VECTORIZER_PKL_FILENAME = 'tfidf_vectorizer.pkl'


def print_top_words(model, feature_names, n_top_words):
    # Print the n_top_words highest-weighted terms for each topic.
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


def is_bigger_than_min_tfidf(term, terms, tfidfs):
    '''
    Intended for use as
    [term for term in terms if is_bigger_than_min_tfidf(term, terms, tfidfs)].
    Looks up the term's tf-idf value in the parallel lists and returns True
    if it exceeds MIN_TFIDF. (Not called in this script.)
    '''
    return tfidfs[terms.index(term)] > MIN_TFIDF


def tfidf(values):
    # The analyzer is a callable that takes a string and returns a list of tokens.
    vectorizer = TfidfVectorizer(analyzer=stems, min_df=1, max_df=50,
                                 max_features=n_features)
    corpus = list(values)
    x = vectorizer.fit_transform(corpus)
    return x, vectorizer  # x is received in main as tfidf_result


def countvec(values):
    # The analyzer is a callable that takes a string and returns a list of tokens.
    vectorizer = CountVectorizer(analyzer=stems, min_df=1, max_df=50,
                                 max_features=n_features)
    corpus = list(values)
    x = vectorizer.fit_transform(corpus)
    return x, vectorizer  # x is received in main as tf_result


def _split_to_words(text, to_stem=False):
    """
    Input:  'すべて自分のほうへ'
    Output: ['すべて', '自分', 'の', 'ほう', 'へ']
    """
    tagger = MeCab.Tagger('mecabrc')  # any other Tagger configuration works too
    mecab_result = tagger.parse(text)
    info_of_words = mecab_result.split('\n')
    words = []
    for info in info_of_words:
        # MeCab output ends with 'EOS' followed by an empty line.
        if info == 'EOS' or info == '':
            break
        # info => 'な\t助詞,終助詞,*,*,*,*,な,ナ,ナ'
        surface, _, feature = info.partition('\t')
        info_elems = feature.split(',')
        # The 7th field holds the base (uninflected) form; if it is '*',
        # fall back to the surface form.
        if info_elems[6] == '*':
            words.append(surface)
            continue
        if to_stem:
            # Convert to the base form.
            words.append(info_elems[6])
            continue
        # Use the word as it appears.
        words.append(surface)
    return words


def words(text):
    return _split_to_words(text=text, to_stem=False)


def stems(text):
    return _split_to_words(text=text, to_stem=True)


if __name__ == '__main__':
    print("Loading dataset...")
    t0 = time()
    # sample.csv: id in column 0, text in column 1.
    # Use header=0 instead if the file starts with an 'id,text' header row.
    df = pd.read_csv('/tmp/sample.csv', header=None)
    ids = df.values[:n_samples, 0]
    print("done in %0.3fs." % (time() - t0))

    print("Extracting tf-idf features...")
    t0 = time()
    tfidf_result, tfidf_vectorizer = tfidf(df.values[:n_samples, 1])
    # Pickle the tf-idf matrix and the fitted vectorizer for later reuse.
    pkl_tfidf_result_path = os.path.join('.', TFIDF_RESULT_PKL_FILENAME)
    pkl_tfidf_vectorizer_path = os.path.join('.', TFIDF_VECTORIZER_PKL_FILENAME)
    with open(pkl_tfidf_result_path, 'wb') as f:
        pickle.dump(tfidf_result, f)
    with open(pkl_tfidf_vectorizer_path, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for countvec...")
    t0 = time()
    tf_result, tf_vectorizer = countvec(df.values[:n_samples, 1])
    print("done in %0.3fs." % (time() - t0))
    print()

    # Fit the NMF model (Frobenius norm).
    # Note: alpha/l1_ratio and get_feature_names() are the older scikit-learn
    # API; recent versions use alpha_W/alpha_H and get_feature_names_out().
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model (generalized Kullback-Leibler divergence).
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) "
          "with tf-idf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000,
              alpha=.1, l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the LDA model on the raw term counts.
    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50., random_state=0)
    t0 = time()
    lda.fit(tf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
```
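Beyond printing the top words per topic, the fitted vectorizer/model pair can also assign a topic mixture to individual documents. Below is a minimal sketch, assuming it runs in the same session as text2topic.py (so `tf_vectorizer` and `lda` are still in scope); the input sentence is a made-up example:

```python
# Infer the topic distribution of an unseen document with the fitted models.
new_docs = ['明日は雨のち晴れ']              # hypothetical example sentence
tf_new = tf_vectorizer.transform(new_docs)  # same MeCab analyzer, same vocabulary
doc_topics = lda.transform(tf_new)          # shape (1, n_components); rows sum to 1
print(doc_topics.round(3))
```

Reusing `transform` on the already-fitted CountVectorizer keeps the vocabulary fixed, so the new document is mapped onto the same feature space the LDA model was trained on.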