SolrのTermVectorsComponentでキーワード抽出してみる
tfidfを用いてなにかできないかと模索。
結果、検索で引っかかるキーワードが出てきただけ。
The Term Vector Component | Apache Solr Reference Guide 7.1
#!/usr/bin/env python
# coding:utf-8
"""Pull per-document tf-idf scores from Solr's Term Vector Component.

``get_tvlist`` queries the ``/tvrh`` handler and ``analisys_dic`` reduces the
returned term vectors to a short list of terms per document.
"""
import json
from heapq import heappush, heappop, nsmallest

import requests

URL = "http://localhost:8983/solr/project/tvrh"


def get_tvlist(url, _id, start, rows):
    """Fetch the ``termVectors`` section for the documents matching ``id:_id``.

    Args:
        url: Address of the Solr term-vector request handler to query.
        _id: Value substituted into the ``id:<value>`` query.
        start: Value sent as the ``start`` request parameter.
        rows: Value sent as the ``rows`` request parameter.

    Returns:
        The value stored under ``termVectors`` in the JSON response, or an
        empty list when the response has no such key.
    """
    params = {
        'q': 'id:{}'.format(_id),
        'rows': rows,
        'start': start,
        'indent': 'true',
        'tv.tf_idf': 'true',
        'tv.fl': 'includes',
        'fl': 'id',
    }
    # Query the handler the caller asked for; the module-level URL is only
    # a convenient default for callers to pass in.
    r = requests.get(url, params=params)
    dic = json.loads(r.text)
    return dic.get('termVectors', [])


def analisys_dic(tv_list, max_tfidf=0.1, top_n=10):
    """Pick the highest-scoring terms below a tf-idf threshold per document.

    ``tv_list`` is read as a flat sequence of alternating entries
    ``[doc_key, doc_entry, doc_key, doc_entry, ...]``. For each ``doc_entry``
    the term list is taken from index 3 and is itself read as alternating
    ``[term, stats, term, stats, ...]`` where ``stats[1]`` is the score.
    (Presumably this mirrors Solr's flat JSON layout
    ``["uniqueKey", id, field_name, terms]`` with ``stats`` being
    ``["tf-idf", value]`` -- confirm against the actual response.)

    Args:
        tv_list: Flat term-vector list as returned by ``get_tvlist``.
        max_tfidf: Exclusive upper bound; only terms whose score is strictly
            below this value are considered.
        top_n: Maximum number of terms kept per document.

    Returns:
        A dict mapping each document key to a list of ``(term, score)``
        tuples, ordered by descending score (ties broken by ascending term).
    """
    res = {}
    # zip() drops a trailing unpaired element, like the len // 2 pairing would.
    for doc_key, doc_entry in zip(tv_list[0::2], tv_list[1::2]):
        # An entry without a term list at index 3 contributes no terms
        # instead of raising IndexError.
        term_list = doc_entry[3] if len(doc_entry) > 3 else []

        # NOTE(review): this keeps only terms scoring *below* the threshold.
        # If the intent was to discard low-scoring terms, the comparison is
        # inverted -- confirm before changing, since callers see this output.
        candidates = [
            (-stats[1], term)
            for term, stats in zip(term_list[0::2], term_list[1::2])
            if stats[1] < max_tfidf
        ]

        # Scores are negated so that the smallest tuples are the highest
        # scores; nsmallest yields them in the same order as repeated
        # heappop calls on a heap of these tuples would.
        best = nsmallest(top_n, candidates)
        res[doc_key] = [(term, -neg_score) for neg_score, term in best]
    return res