SolrのTermVectorsComponentでキーワード抽出してみる

tfidfを用いてなにかできないかと模索。
結果、検索で引っかかるキーワードが出てきただけ。

The Term Vector Component | Apache Solr Reference Guide 7.1

#!/usr/bin/env python
# coding:utf-8

import requests
import json
from heapq import heappush, heappop

URL = "http://localhost:8983/solr/project/tvrh"


def get_tvlist(url, _id, start, rows):
    """Fetch the ``termVectors`` section of a Solr term-vector response.

    Sends a GET request to ``url`` asking for the tf-idf values of the
    ``includes`` field for documents matching ``id:<_id>``.

    Args:
        url: Endpoint of the Solr term-vector request handler.
        _id: Value substituted into the ``id:{}`` query.
        start: Offset of the first matching document to return.
        rows: Maximum number of documents to return.

    Returns:
        The value stored under ``termVectors`` in the JSON response, or an
        empty list when that key is absent.
    """
    params = {
        'q': 'id:{}'.format(_id),
        'rows': rows,
        'start': start,
        'indent': 'true',
        'tv.tf_idf': 'true',
        'tv.fl': 'includes',
        'fl': 'id',
    }
    # Query the endpoint the caller passed in; the previous code ignored
    # ``url`` and always hit the module-level URL constant.
    r = requests.get(url, params=params)
    dic = json.loads(r.text)
    tv_list = dic.get('termVectors', [])
    return tv_list


def analisys_dic(tv_list, max_tfidf=0.1, top_n=10):
    """Pick the highest-scoring terms per document from a term-vector list.

    ``tv_list`` is read as a flat sequence of alternating
    ``key, entry`` items. For each entry, index 3 is read as a flat
    sequence of alternating ``term, stats`` items, and ``stats[1]`` is used
    as the term's tf-idf value.
    NOTE(review): this presumably matches Solr's flat JSON layout
    (``["uniqueKey", id, field, [term, ["tf-idf", value], ...]]``) for a
    single requested field -- confirm against the actual response.

    Args:
        tv_list: Flat sequence as described above. A trailing unpaired
            element is ignored.
        max_tfidf: Only terms whose tf-idf is strictly below this value
            are considered.
        top_n: Maximum number of terms kept per document.

    Returns:
        A dict mapping each key to a list of ``(term, tfidf)`` tuples,
        ordered by descending tf-idf (ties broken by ascending term).

    Raises:
        IndexError: If an entry has fewer than four elements or a stats
            item has fewer than two.
    """
    res = {}
    # zip() over the even/odd slices pairs each key with its entry and
    # silently drops a trailing unpaired element, like the old ``// 2`` loop.
    for _id, doc_entry in zip(tv_list[::2], tv_list[1::2]):
        term_list = doc_entry[3]
        _heap = []
        for string, stats in zip(term_list[::2], term_list[1::2]):
            tfidf = stats[1]
            if tfidf < max_tfidf:
                # Negate the score so the min-heap pops the largest first.
                heappush(_heap, (-tfidf, string))
        tfidf_list = []
        for _ in range(top_n):
            if not _heap:
                break
            neg_tfidf, string = heappop(_heap)
            tfidf_list.append((string, -neg_tfidf))
        res[_id] = tfidf_list
    return res