From a684f28f51eed7031ea48fa42af3ef12546108f1 Mon Sep 17 00:00:00 2001 From: Asoul Yang Date: Fri, 14 Oct 2016 01:11:17 +0800 Subject: [PATCH] Fix api change bug, refact crawl.py --- crawl.py | 139 ++++++++++++++++++++++++++----------------------------- 1 file changed, 65 insertions(+), 74 deletions(-) diff --git a/crawl.py b/crawl.py index 0da3d3616..99600db0a 100644 --- a/crawl.py +++ b/crawl.py @@ -1,101 +1,92 @@ #!/usr/bin/python # -*- coding: utf-8 -*- -import requests +import os import json import csv -from os import mkdir -from os.path import isdir -from datetime import date - -class CrawlerController(): - '''Split targetList into several Crawler''' - - def __init__(self, targetList, maxN = 50): - self.crawlerList = [] - - for i in range(len(targetList) / maxN + 1): - crawler = Crawler(targetList[maxN * i: maxN * (i+1)]) - self.crawlerList.append(crawler) - - def getStockData(self): - dataList = [] +import time - for crawler in self.crawlerList: - dataList.extend(crawler.getStockData()) +from datetime import date - return dataList +import requests -class Crawler(): - '''request to Market Information System''' +class CrawlerController(object): + '''Split targets into several Crawler, avoid request url too long''' - def __init__(self, targetList): - self.queryURL = self._getQueryURL(targetList) + def __init__(self, targets, max_stock_per_crawler=50): + self.crawlers = [] - def _getQueryURL(self, targetList): + for index in range(0, len(targets), max_stock_per_crawler): + crawler = Crawler(targets[index:index + max_stock_per_crawler]) + self.crawlers.append(crawler) - query = 'http://mis.twse.com.tw/stock/api/getStockInfo.jsp?ex_ch=' - for target in targetList: - query += ('tse_{}.tw|'.format(target)) + def run(self): + data = [] + for crawler in self.crawlers: + data.extend(crawler.get_data()) + return data - return query[:-1] +class Crawler(object): + '''Request to Market Information System''' + def __init__(self, targets): + endpoint = 'http://mis.twse.com.tw/stock/api/getStockInfo.jsp' + # Add 1000 seconds for prevent time inaccuracy + timestamp = int(time.time() * 1000 + 1000000) + channels = '|'.join('tse_{}.tw'.format(target) for target in targets) + self.query_url = '{}?_={}&ex_ch={}'.format(endpoint, timestamp, channels) - def _handleResponse(self, response): + def get_data(self): try: - content = json.loads(response.content) - except Exception, e: - print e - data = {} + # Get original page to get session + req = requests.session() + req.get('http://mis.twse.com.tw/stock/index.jsp', + headers={'Accept-Language': 'zh-TW'}) + + response = req.get(self.query_url) + content = json.loads(response.text) + except Exception as err: + print(err) + data = [] else: data = content['msgArray'] - finally: - return data - - def getStockData(self): - # 先戳原頁面,再拿資料 - req = requests.session() - req.get('http://mis.twse.com.tw/stock/index.jsp', - headers = {'Accept-Language':'zh-TW'} - ) - - response = req.get(self.queryURL) - dataList = self._handleResponse(response) - - return dataList - -class Recorder(): - '''record data to csv''' - def __init__(self, path='data'): - self.folderPath = '{}/{}'.format(path, date.today().strftime('%Y%m%d')) - self._checkTodayFolder() - def _checkTodayFolder(self): - if not isdir(self.folderPath): - mkdir(self.folderPath) + return data - def recordCSV(self, dataList): +class Recorder(object): + '''Record data to csv''' + def __init__(self, path='data'): + self.folder_path = '{}/{}'.format(path, date.today().strftime('%Y%m%d')) + if not os.path.isdir(self.folder_path): + os.mkdir(self.folder_path) - for data in dataList: + def record_to_csv(self, data): + for row in data: try: - fo = open('{}/{}.csv'.format(self.folderPath, data['c']), 'ab') - cw = csv.writer(fo, delimiter=',') - cw.writerow([data['t'], data['z'], data['tv'], data['v'], - data['a'], data['f'], data['b'], data['g']] - ) - - except Exception as e: - print e - continue + file_path = '{}/{}.csv'.format(self.folder_path, row['c']) + with open(file_path, 'a') as output_file: + writer = csv.writer(output_file, delimiter=',') + writer.writerow([ + row['t'],# 資料時間 + row['z'],# 最近成交價 + row['tv'],# 當盤成交量 + row['v'],# 當日累計成交量 + row['a'],# 最佳五檔賣出價格 + row['f'],# 最價五檔賣出數量 + row['b'],# 最佳五檔買入價格 + row['g']# 最佳五檔買入數量 + ]) + + except Exception as err: + print(err) def main(): - - targetList = [_.strip() for _ in open('stocknumber.csv', 'rb')] + targets = [_.strip().decode() for _ in open('stocknumber.csv', 'rb')] + + controller = CrawlerController(targets) + data = controller.run() - controller = CrawlerController(targetList) recorder = Recorder() + recorder.record_to_csv(data) - dataList = controller.getStockData() - recorder.recordCSV(dataList) - if __name__ == '__main__': main()