janusgraph+cassandra で graph database を構築

Cassandra の yum install

  • /etc/yum.repos.d/cassandra.repo
name=Apache Cassandra
yum -y install cassandra

vim /etc/cassandra/conf/cassandra.yaml

### : /etc/cassandra/conf/cassandra.yaml
<start_rpc: false
>start_rpc: true

service cassandra start
chkconfig cassandra on


wget https://github.com/JanusGraph/janusgraph/releases/download/v0.2.0/janusgraph-0.2.0-hadoop2.zip

unzip janusgraph-0.2.0-hadoop2.zip

cd janusgraph-0.2.0-hadoop2

gremlin> graph = JanusGraphFactory.open('conf/janusgraph-cassandra.properties')
gremlin> GraphOfTheGodsFactory.loadWithoutMixedIndex(graph, true)
gremlin> g = graph.traversal()
==>graphtraversalsource[standardjanusgraph[cassandrathrift:[]], standard]

Python での S3 からファイル取得(boto3)

boto3というモジュールが存在して、それを使ってS3 のファイルが取得できる。


In [1]: import boto3

In [7]: import botocore

In [21]: s3 = boto3.resource('s3', aws_access_key_id=S3_ACCESS_KEY, aws_secret_access_key=S3_SECRET_KEY,)

In [22]: try:
    ...:     s3.meta.client.head_bucket(Bucket=S3_BUCKET)
    ...: except Exception as e:
    ...:     print(e)

In [23]: for bucket in s3.buckets.all():
    ...:     for key in bucket.objects.all():
    ...:          print(key.key)


In [26]: try:
   ...:     s3.Bucket(S3_BUCKET).download_file(KEY, FILE)
   ...: except botocore.exceptions.ClientError as e:
   ...:     if e.response['Error']['Code'] == "404":
   ...:         raise S3FileNotFound
   ...:     else:
   ...:         raise

Boto 3 Documentation — Boto 3 Docs 1.7.19 documentation


private static DataType parseDataType(Config fieldsConfig) {
  String type = fieldsConfig.getString(FIELD_TYPE_CONFIG);
  switch (type) {
    case "string":
      return DataTypes.StringType;
    case "byte":
      return DataTypes.ByteType;
    case "short":
      return DataTypes.ShortType;
    case "int":
      return DataTypes.IntegerType;
    case "long":
      return DataTypes.LongType;
    case "float":
      return DataTypes.FloatType;
    case "double":
      return DataTypes.DoubleType;
    case "decimal":
      ConfigUtils.assertConfig(fieldsConfig, DECIMAL_SCALE_CONFIG);
      ConfigUtils.assertConfig(fieldsConfig, DECIMAL_PRECISION_CONFIG);
      return DataTypes.createDecimalType(
    case "boolean":
      return DataTypes.BooleanType;
    case "binary":
      return DataTypes.BinaryType;
    case "date":
      return DataTypes.DateType;
    case "timestamp":
      return DataTypes.TimestampType;
    case "array":
    case "map":
    case "struct":
      throw new RuntimeException("Schema check does not currently support complex types");
      throw new RuntimeException("Unknown type: " + type);

LDA(Latent Dirichlet Allocation) でのトピック抽出

以下の形式のsample.csvからデータを取得し、sklearn の LDA でトピック抽出する。

id text
1 今日は晴れ。明日は雨
2 今日はカープが優勝した。
... ...
  • text2topic.py
#!/usr/bin/env python
# coding:utf-8

from __future__ import print_function
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

import pickle
import os
import pandas as pd

import MeCab

# Number of leading CSV rows that the main script reads.
n_samples = 2000
# Passed as max_features to both vectorizers.
n_features = 1000
# Number of topics requested from the NMF and LDA models.
n_components = 10
# Number of terms printed per topic.
n_top_words = 20

# NOTE(review): the next three constants are not referenced by the code shown
# in this script; presumably left over from another script - confirm.
FETCHED_PAGES_DIR_NAME = 'fetched_pages'
QUERIES = '胃もたれ 虫歯 花粉症対策 鬱 機械 骨折 肩こり 書類'.split(' ')
NB_PKL_FILENAME = 'naive_bayes_classifier.pkl'
# Threshold that is_bigger_than_min_tfidf() compares scores against.
MIN_TFIDF = 0.001
# File names the main script pickles the tf-idf matrix and vectorizer to.
TFIDF_RESULT_PKL_FILENAME = 'tfidf_result.pkl'
TFIDF_VECTORIZER_PKL_FILENAME = 'tfidf_vectorizer.pkl'

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Fitted model exposing ``components_``; each row holds the
            per-feature weights of one topic.
        feature_names: Sequence mapping a feature index to its term.
        n_top_words: Number of terms to print per topic.
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort() is ascending, so walk the last n_top_words indices
        # backwards to list the heaviest terms first.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        message = "Topic #%d: " % topic_idx
        message += " ".join(feature_names[i] for i in top_indices)
        print(message)
    print()

def is_bigger_than_min_tfidf(term, terms, tfidfs):
    """Return True when the tf-idf score of ``term`` exceeds ``MIN_TFIDF``.

    Meant to be used as a filter predicate, e.g.
    ``[term for term in terms if is_bigger_than_min_tfidf(term, terms, tfidfs)]``.

    Args:
        term: Term to look up; it must be present in ``terms``.
        terms: List of terms, index-aligned with ``tfidfs``.
        tfidfs: Scores, where ``tfidfs[i]`` belongs to ``terms[i]``.

    Returns:
        bool: whether the score of ``term`` is strictly greater than
        ``MIN_TFIDF``.

    Raises:
        ValueError: If ``term`` is not in ``terms`` (raised by ``list.index``).
    """
    return bool(tfidfs[terms.index(term)] > MIN_TFIDF)

def tfidf(values):
    """Fit a ``TfidfVectorizer`` on ``values`` and return the result with it.

    Args:
        values: Iterable of documents (strings).

    Returns:
        tuple: ``(matrix, vectorizer)`` where ``matrix`` is the output of
        ``fit_transform`` and ``vectorizer`` is the fitted vectorizer. The
        main script receives ``matrix`` as ``tfidf_result``.
    """
    # ``analyzer`` is a callable that turns one string into a list of strings.
    fitted = TfidfVectorizer(
        analyzer=stems,
        min_df=1,
        max_df=50,
        max_features=n_features,
    )
    matrix = fitted.fit_transform(list(values))
    return matrix, fitted

def countvec(values):
    """Fit a ``CountVectorizer`` on ``values`` and return the result with it.

    Args:
        values: Iterable of documents (strings).

    Returns:
        tuple: ``(matrix, vectorizer)`` where ``matrix`` is the output of
        ``fit_transform`` and ``vectorizer`` is the fitted vectorizer. The
        main script receives ``matrix`` as ``tf_result``.
    """
    # ``analyzer`` is a callable that turns one string into a list of strings.
    fitted = CountVectorizer(
        analyzer=stems,
        min_df=1,
        max_df=50,
        max_features=n_features,
    )
    matrix = fitted.fit_transform(list(values))
    return matrix, fitted

def _split_to_words(text, to_stem=False):
    """Split a text into words with MeCab.

    Example:
        input:  'すべて自分のほうへ'
        output: ['すべて', '自分', 'の', 'ほう', 'へ']

    Args:
        text: Text to tokenize.
        to_stem: When True, return feature #6 of each token (its uninflected
            form) whenever the dictionary provides one; otherwise return the
            word exactly as it appears in ``text``.

    Returns:
        list of str: one entry per token, in order of appearance.
    """
    tagger = MeCab.Tagger('mecabrc')  # any other Tagger may be used instead
    mecab_result = tagger.parse(text)
    words = []
    for info in mecab_result.split('\n'):
        # MeCab ends its output with an 'EOS' line followed by an empty
        # string, so either one marks the end of the tokens.
        if info == 'EOS' or info == '':
            break
        # info => 'な\t助詞,終助詞,*,*,*,*,な,ナ,ナ'
        # Split on the tab first so a surface form that itself contains a
        # comma does not shift the feature columns.
        surface, _, features = info.partition('\t')
        info_elems = features.split(',')
        # Feature #6 holds the uninflected form; '*' (or a missing column)
        # means there is none, so fall back to the surface form.
        # NOTE(review): assumes an IPADIC-style feature layout - confirm.
        base_form = info_elems[6] if len(info_elems) > 6 else '*'
        if to_stem and base_form != '*':
            words.append(base_form)
        else:
            words.append(surface)
    return words

def words(text):
    """Tokenize ``text`` via ``_split_to_words`` with ``to_stem=False``."""
    return _split_to_words(text=text, to_stem=False)

def stems(text):
    """Tokenize ``text`` via ``_split_to_words`` with ``to_stem=True``."""
    return _split_to_words(text=text, to_stem=True)

if __name__ == '__main__':
    # Script flow: load the CSV, build tf-idf and raw-count features, then
    # fit two NMF models and one LDA model and print each model's top words.
    # NOTE(review): the calls below follow the scikit-learn 0.19 API
    # (NMF(alpha=...), get_feature_names()); check them before running on a
    # newer release.
    print("Loading dataset...")
    t0 = time()
    df = pd.read_csv('/tmp/sample.csv', header=None)
    y = df.values[:n_samples, 0]  # first CSV column; not used below
    texts = df.values[:n_samples, 1]  # second CSV column: the text to analyse
    print("done in %0.3fs." % (time() - t0))

    # Use tf-idf features for NMF.
    print("Extracting tf features for tfidf...")
    t0 = time()
    tfidf_result, tfidf_vectorizer = tfidf(texts)

    # Persist the tf-idf matrix and the fitted vectorizer for later reuse.
    pkl_tfidf_result_path = os.path.join('.', TFIDF_RESULT_PKL_FILENAME)
    pkl_tfidf_vectorizer_path = os.path.join('.', TFIDF_VECTORIZER_PKL_FILENAME)

    with open(pkl_tfidf_result_path, 'wb') as f:
        pickle.dump(tfidf_result, f)
    with open(pkl_tfidf_vectorizer_path, 'wb') as f:
        pickle.dump(tfidf_vectorizer, f)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for countvec...")
    # Start the timer before the work so the elapsed time is meaningful.
    t0 = time()
    tf_result, tf_vectorizer = countvec(texts)
    print("done in %0.3fs." % (time() - t0))

    # Fit the NMF model
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              alpha=.1, l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model
    print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
          "tf-idf features, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1,
              beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,
              l1_ratio=.5).fit(tfidf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tf_result)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)

Topic extraction with Non-negative Matrix Factorization and Latent Dirichlet Allocation — scikit-learn 0.19.1 documentation




The Term Vector Component | Apache Solr Reference Guide 7.1

#!/usr/bin/env python
# coding:utf-8

import requests
import json
from heapq import heappush, heappop

# Term vector request handler (/tvrh) of the local Solr "project" core/collection.
URL = "http://localhost:8983/solr/project/tvrh"

def get_tvlist(url, _id, start, rows):
    """Fetch the term vectors of one document from a Solr ``tvrh`` handler.

    Args:
        url: URL of the term vector request handler to query.
        _id: Value of the ``id`` field to search for.
        start: Offset of the first result row.
        rows: Maximum number of rows to return.

    Returns:
        The ``termVectors`` entry of the JSON response, or ``[]`` when the
        response has no such entry.
    """
    params = {
        'q': 'id:{}'.format(_id),
        'rows': rows,
        'start': start,
        'indent': 'true',
        'tv.tf_idf': 'true',
        'tv.fl': 'includes',
        'fl': 'id',
    }
    # Query the endpoint the caller passed in rather than the module-level URL.
    r = requests.get(url, params=params)
    dic = json.loads(r.text)
    return dic.get('termVectors', [])

def analisys_dic(tv_list, top_n=10):
    """Pick the highest-scoring terms per document from a term vector list.

    ``tv_list`` is read as a flat sequence of alternating entries
    ``[doc_id, doc_info, doc_id, doc_info, ...]``. ``doc_info[3]`` is in turn
    a flat list ``[term, term_info, term, term_info, ...]`` and
    ``term_info[1]`` is the tf-idf score of the term.

    Args:
        tv_list: Term vector list, e.g. the return value of ``get_tvlist``.
        top_n: Maximum number of terms to keep per document (default 10).

    Returns:
        dict: document id -> list of ``(term, tfidf)`` tuples, ordered from
        the highest score to the lowest, at most ``top_n`` entries long.
    """
    res = {}
    # Walk the (id, info) pairs; a trailing unpaired element is ignored.
    for _id, doc_info in zip(tv_list[::2], tv_list[1::2]):
        term_list = doc_info[3]
        _heap = []
        for string, term_info in zip(term_list[::2], term_list[1::2]):
            score = term_info[1]
            # NOTE(review): only terms scoring BELOW 0.1 are kept, exactly as
            # in the original code; confirm the comparison is not inverted.
            if score < 0.1:
                # Negate the score so the min-heap pops the largest first.
                heappush(_heap, (-score, string))
        tfidf_list = []
        for _ in range(top_n):
            if not _heap:
                break
            neg_score, string = heappop(_heap)
            tfidf_list.append((string, -neg_score))

        res[_id] = tfidf_list
    return res



AWS にて、 dockerの立ち上げがうまく行かなかった



$ git clone https://github.com/efkbook/blog-sample

$ cd blog-sample/

$ docker-compose up -d
$ free -m
             total       used       free     shared    buffers     cached
Mem:          2001       1938         62          0          0         27
-/+ buffers/cache:       1910         90
Swap:            0          0          0

$ free -m
             total       used       free     shared    buffers     cached
Mem:          2001       1084        917          0         12         81
-/+ buffers/cache:        990       1010
Swap:            0          0          0

## CPU
$ sar -u
12:00:01 AM     CPU     %user     %nice   %system   %iowait    %steal     %idle
05:20:01 PM     all      0.04      0.00      0.03      0.01      0.02     99.90
05:30:01 PM     all      0.04      0.00      0.03      0.01      0.02     99.91
05:40:01 PM     all      0.04      0.00      0.03      0.01      0.02     99.90
05:50:01 PM     all     13.82      0.00      3.01      1.03      0.08     82.06
06:00:36 PM     all     14.26      0.00      5.28     78.84      1.12      0.50
06:10:01 PM     all     34.88      0.00      6.14     56.52      0.89      1.58

## ロードアベレージ
$ sar -q
12:00:01 AM   runq-sz  plist-sz   ldavg-1   ldavg-5  ldavg-15
05:20:01 PM         0       147      0.00      0.00      0.00
05:30:01 PM         1       147      0.00      0.00      0.00
05:40:01 PM         0       147      0.00      0.00      0.00
05:50:01 PM         1       157      0.94      0.39      0.15
06:00:36 PM         0       277      9.69      7.82      4.20
06:10:01 PM         0       201      1.51      6.54      5.89

## メモリ
$ sar -r
12:00:01 AM kbmemfree kbmemused  %memused kbbuffers  kbcached  kbcommit   %commit

05:20:01 PM    909852   1139400     55.60    166756    410508    971816     47.42
05:30:01 PM    909884   1139368     55.60    166756    410508    971816     47.42
05:40:01 PM    909764   1139488     55.61    166756    410524    971816     47.42
05:50:01 PM     74796   1974456     96.35    234604   1085612   1069228     52.18
06:00:36 PM     65172   1984080     96.82       116     25968   3084236    150.51
06:10:01 PM    336440   1712812     83.58      4668     46448   2591224    126.45

## Disk I/O
$ sar -b

12:00:01 AM       tps      rtps      wtps   bread/s   bwrtn/s
05:20:01 PM      0.16      0.00      0.16      0.00      1.90
05:30:01 PM      0.15      0.00      0.15      0.00      1.70
05:40:01 PM      0.16      0.00      0.16      0.00      1.82
05:50:01 PM     89.59      6.19     83.40    330.35  12059.86
06:00:36 PM   1179.01   1157.77     21.24 110175.33   3395.56
06:10:01 PM    791.31    786.20      5.11  81238.48    647.16