日本語のURLのスクレイピング方法(Python2.7)
日本語を含むURLをPython2.7でスクレイピングする方法があったので、使ってみた。
引数には読み込み用のcsvファイルのパスを指定する。(1列目はURL)
openで開いたファイルは、with文などを使って確実にcloseする必要がある。
# coding:utf-8
"""Scrape one element from every URL listed in a CSV file (Python 2.7).

Usage::

    python <this script> <input.csv>

The first column of each input row is a URL whose query string may contain
non-ASCII (e.g. Japanese) bytes.  For every row the query string is
percent-encoded, the page is downloaded, and one row is appended to
``OUTPUT_CSV_PATH``:

* ``row + [result text]`` when the result element is found,
* ``row + [error message, exception]`` when only the page's error list is
  found,
* ``row + [exception]`` when the download or both lookups fail,
* the row unchanged when it is blank or its first column is empty.

In every non-blank case the first column of the written row holds the
percent-encoded URL that was actually requested.
"""
import csv
import sys
import urllib
from urlparse import urlparse

import requests
from bs4 import BeautifulSoup

# Placeholder path of the CSV file that result rows are appended to.
OUTPUT_CSV_PATH = '[書き込み用csvファイル]'
# id of the element whose text is recorded on success.
RESULT_ELEMENT_ID = 'resultsGoodsPagingResultsShow'
# id of the element whose first <li> is recorded when the result is missing.
ERROR_ELEMENT_ID = 'errors'
# Seconds to wait for the server; without a timeout requests.get can block
# forever and stall the whole batch.
REQUEST_TIMEOUT = 30


def _quote_query(url):
    """Return *url* with its query string percent-encoded.

    Only the query component is encoded ('=' and '&' are kept so the
    key/value structure survives); every other component is copied as is.
    This must be applied exactly once per URL: encoding an already encoded
    query would turn '%' into '%25'.
    """
    parts = urlparse(url)
    query = urllib.quote_plus(parts.query, safe='=&')
    return '{}://{}{}{}{}{}{}{}{}'.format(
        parts.scheme, parts.netloc, parts.path,
        ';' if parts.params else '', parts.params,
        '?' if parts.query else '', query,
        '#' if parts.fragment else '', parts.fragment)


def _fetch_soup(url):
    """Download *url* and return the parsed document."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Pass the raw bytes: re-encoding response.text with response.encoding
    # fails when the encoding is None or the text contains replacement
    # characters.  The parser is named explicitly so the result does not
    # depend on which optional parsers happen to be installed.
    return BeautifulSoup(response.content, 'html.parser')


def _build_output_row(row):
    """Return the CSV row to write for one input *row*.

    Failures are recorded in the returned row instead of being raised, so a
    single bad URL never aborts the batch.
    """
    if not row or row[0] == "":
        # Blank line or empty URL: nothing to fetch, pass the row through.
        return list(row)

    out = list(row)
    try:
        out[0] = _quote_query(row[0])
        soup = _fetch_soup(out[0])
    except Exception as fetch_error:
        # Best-effort batch job: log the failure in the row and move on.
        return out + [str(fetch_error)]

    try:
        result = soup.find(id=RESULT_ELEMENT_ID)
        return out + [result.text.encode('utf-8').strip()]
    except Exception as result_error:
        # Result element missing: look for the error list in the document
        # that was already downloaded instead of requesting the page again.
        try:
            message = soup.find(id=ERROR_ELEMENT_ID).find('li')
            return out + [message.text.encode('utf-8'), str(result_error)]
        except Exception as message_error:
            return out + [str(message_error)]


def main():
    """Read the CSV named by ``sys.argv[1]`` and append one result row each."""
    if len(sys.argv) < 2:
        sys.exit('usage: {} <input.csv>'.format(sys.argv[0]))

    # Both files are managed by ``with`` so they are closed even on error.
    with open(sys.argv[1], 'rb') as source, \
            open(OUTPUT_CSV_PATH, 'ab') as destination:
        reader = csv.reader(source, delimiter=",", quotechar='"')
        writer = csv.writer(destination)
        for row in reader:
            writer.writerow(_build_output_row(row))


if __name__ == '__main__':
    main()