Solrのelevate.xmlを確認する方法
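elevate.xmlへの変更内容が実際の検索結果にどう反映されているかを確認し、その結果をTSVファイル（result.tsv）に出力するスクリプト。XMLの解析には xml.etree.ElementTree（import xml.etree.ElementTree as ET）を使用している。当初はurllib2で検索結果を取得していたが、socketのエラーなどで取得データが欠落することがあったため、requestsに変更した。クエリ文字列から検索語を取り出す正規表現にも、意外と苦労した。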
# -*- coding: utf-8 -*-
# vim:tabstop=4:shiftwidth=4:expandtab
"""Check how the entries of a Solr elevate.xml behave in real search results.

For every ``<query>`` element of the elevate file the script

1. extracts the search word from the query's ``text`` attribute,
2. collects the ids of the elevated and the excluded ``<doc>`` children,
3. asks the search API for that word and records at which position each of
   those documents shows up,
4. asks Solr whether each document that showed up exists in the index, and
5. writes everything to a cp932-encoded TSV file.

The bracketed constants below are placeholders and must be replaced with the
real values of the target environment before the script can run.
"""
import xml.etree.ElementTree as ET
import re
import urllib
import codecs
from socket import error as SocketError
import errno

import requests

try:  # Python 2
    from urllib import quote as _quote
    _text = unicode  # noqa: F821 (only defined on Python 2)
except (ImportError, NameError):  # Python 3
    from urllib.parse import quote as _quote
    _text = str

# --- Environment-specific placeholders: replace the bracketed values. ---
SEARCH_API_URL = u"[url名]"        # endpoint of the search API
SOLR_BASE_URL = u"[Solrのurl名]"   # base URL of the Solr server
SOLR_CORE = u"[core名]"            # Solr core that is queried
ITEM_ID_KEY = u"[商品id]"          # key holding the id in a search API item
SOLR_ID_FIELD = u"[商品id]"        # Solr field holding the id

# Added to the number of elevated documents to form the ``l`` query parameter
# (named ``count`` in the code; presumably the number of results requested).
EXTRA_RESULTS = 20
# Excluded documents that still appear in the results get their 1-based
# position shifted by this amount so that they stand out in the report.
EXCLUDED_POSITION_OFFSET = 100000
# Seconds to wait for an HTTP response; without a timeout requests can block
# forever on an unresponsive server.
REQUEST_TIMEOUT = 30

# Captures the word from the first token of a query text that looks like
# "(((text_ja:<word>)...".  Backslashes are doubled so that the string
# literal contains no invalid escape sequences.
QUERY_WORD_RE = re.compile(u'\\(\\(\\(text_ja:(.*)\\)')


def _extract_word(query_text):
    """Return the search word of a ``<query text="...">`` value, or None.

    None is returned when the attribute is missing/empty or when its first
    whitespace-separated token does not match ``QUERY_WORD_RE``.
    """
    tokens = (query_text or u'').split()
    if not tokens:
        return None
    match = QUERY_WORD_RE.search(tokens[0])
    if match is None:
        return None
    return match.group(1)


def _is_excluded(doc):
    """Return True when a ``<doc>`` element is marked ``exclude="true"``.

    Any other value (for example ``exclude="false"``) is treated as a normal
    elevated document instead of being counted as an exclusion.
    """
    return (doc.get('exclude') or '').strip().lower() == 'true'


def _write_report(result, path):
    """Write ``result`` (word -> {doc id -> [rank, position, exist]}) as TSV.

    Each word is written on its own line, followed by one tab-indented line
    per document, sorted by (rank, position, exist, doc id).
    """
    with codecs.open(path, mode='w', encoding='cp932') as fd:
        for word, docs in result.items():
            fd.write(word + u'\n')
            rows = sorted(
                (values[0], values[1], values[2], doc_id)
                for doc_id, values in docs.items()
            )
            for rank, position, exist, doc_id in rows:
                fd.write(u'\t' + _text(rank) +
                         u'\t' + _text(position) +
                         u'\t' + _text(exist) +
                         u'\t' + doc_id + u'\n')


def run(elevate_path='elevate_next.xml', output_path='result.tsv'):
    """Check every query of the elevate file and write the TSV report.

    Args:
        elevate_path: Path of the elevate XML file to parse.
        output_path: Path of the TSV report to create (cp932 encoded).
    """
    tree = ET.parse(elevate_path)
    result = dict()
    for query in tree.findall(".//query"):
        text = query.get('text')
        word = _extract_word(text)
        if word is None:
            # Report instead of crashing on a query without a usable word.
            print('skipping query without text_ja term: %r' % (text,))
            continue
        doc_list = []
        doc_exclude_list = []
        for doc in query.findall(".//doc"):
            if _is_excluded(doc):
                doc_exclude_list.append(doc.get('id'))
            else:
                doc_list.append(doc.get('id'))
        result[word] = order_search(word, doc_list, doc_exclude_list)
    _write_report(result, output_path)


def order_search(word, doc_id_list, doc_exclude_list):
    """Look up where the given documents appear in the search results.

    Args:
        word: Search word sent to the search API.
        doc_id_list: Ids of the elevated documents, in elevation order.
        doc_exclude_list: Ids of the excluded documents.

    Returns:
        dict mapping each document id to ``[rank, position, exist]``:
        ``rank`` is the 1-based index in ``doc_id_list`` (-1 for excluded
        documents), ``position`` is the 1-based position in the search
        results (-1 when not seen, plus ``EXCLUDED_POSITION_OFFSET`` for
        excluded documents), and ``exist`` is the result of
        ``solr_index_exist`` for documents seen in the results (0 otherwise).

    Raises:
        socket.error: for socket errors other than ECONNRESET.  A connection
            reset is tolerated: the URL is printed and whatever was collected
            so far is returned.
    """
    url = u"{base}?q={word}&l={count}".format(
        base=SEARCH_API_URL,
        word=_quote(word.encode('utf-8')),
        count=len(doc_id_list) + EXTRA_RESULTS)

    # Seed the table before any network access so that the documents still
    # appear in the report (as "not seen") when the request fails.
    res = dict()
    for rank, doc_id in enumerate(doc_id_list, 1):
        res[doc_id] = [rank, -1, 0]
    for doc_id in doc_exclude_list:
        res[doc_id] = [-1, -1, 0]
    elevated = set(doc_id_list)
    excluded = set(doc_exclude_list)

    try:
        parsed = requests.get(url, timeout=REQUEST_TIMEOUT).json()
        # Tolerate a missing/empty result section instead of crashing.
        items = (parsed.get(u'result') or {}).get(u'items') or []
        for position, item in enumerate(items, 1):
            doc_id = _text(item.get(ITEM_ID_KEY))
            if doc_id not in res:
                # Ordinary search hit that elevate.xml does not mention.
                continue
            # NOTE(review): the published listing lost its indentation, so
            # the nesting of the debug print and of the index check below is
            # an assumption -- confirm against the original script.
            if doc_id in elevated:
                res[doc_id][1] = position
            if doc_id in excluded:
                res[doc_id][1] = position + EXCLUDED_POSITION_OFFSET
                print(doc_id)
            res[doc_id][2] = solr_index_exist(doc_id)
    except SocketError as e:
        print(url)
        if e.errno != errno.ECONNRESET:
            raise  # Not error we are looking for
    # Deliberately not in a ``finally`` block: returning from ``finally``
    # would silently discard the exception re-raised above.
    return res


def solr_index_exist(doc_id):
    """Return 1 if Solr reports exactly one document for ``doc_id``, else 0.

    Args:
        doc_id: Document id to look up in the ``SOLR_ID_FIELD`` field.
    """
    url = (u"{base}/solr/{core}/select?q=({field}:{doc_id})"
           u"&fl=&wt=json&indent=true").format(
        base=SOLR_BASE_URL,
        core=SOLR_CORE,
        field=SOLR_ID_FIELD,
        # Percent-encode the id so characters such as '&' or '+' cannot
        # change the meaning of the query string.
        doc_id=_quote(_text(doc_id).encode('utf-8')))
    parsed = requests.get(url, timeout=REQUEST_TIMEOUT).json()
    response = parsed.get(u'response')
    if response and response.get(u'numFound') == 1:
        return 1
    return 0


def main():
    """Script entry point: run the check with the default file names."""
    run()


if __name__ == '__main__':
    main()