diff --git "a/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/link_extraction.py" "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/link_extraction.py" new file mode 100644 index 0000000..5ba1fd3 --- /dev/null +++ "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/link_extraction.py" @@ -0,0 +1,36 @@ +from bs4 import BeautifulSoup +import requests + +start_url = 'http://bj.ganji.com/wu/' + +def get_link_urls(url): + wb_data = requests.get(start_url) + soup = BeautifulSoup(wb_data.text,'lxml') + links = soup.select('.fenlei dt a') + for link in links: + page_url = 'http://bj.ganji.com' + link.get('href') + print(page_url) + +#get_link_urls(start_url) + +link_list = ''' + http://bj.ganji.com/jiaju/ + http://bj.ganji.com/rirongbaihuo/ + http://bj.ganji.com/shoujihaoma/ + http://bj.ganji.com/bangong/ + http://bj.ganji.com/nongyongpin/ + http://bj.ganji.com/jiadian/ + http://bj.ganji.com/ershoubijibendiannao/ + http://bj.ganji.com/ruanjiantushu/ + http://bj.ganji.com/yingyouyunfu/ + http://bj.ganji.com/diannao/ + http://bj.ganji.com/xianzhilipin/ + http://bj.ganji.com/fushixiaobaxuemao/ + http://bj.ganji.com/meironghuazhuang/ + http://bj.ganji.com/shuma/ + http://bj.ganji.com/laonianyongpin/ + http://bj.ganji.com/xuniwupin/ + http://bj.ganji.com/qitawupin/ + http://bj.ganji.com/ershoufree/ + http://bj.ganji.com/wupinjiaohuan/ +''' \ No newline at end of file diff --git "a/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/page_parsing.py" "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/page_parsing.py" new file mode 100644 index 0000000..3b9ea4a --- /dev/null +++ "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/page_parsing.py" @@ -0,0 +1,51 @@ +from bs4 import BeautifulSoup +import requests +import time +import pymongo + +client = pymongo.MongoClient('localhost', 27017) +ceshi = client['ceshi'] +url_list = ceshi['url_list'] +item_info = ceshi['item_info'] +# 在最左边是在python 中对象的名称,后面的是在数据库中的名称 + +# spider 1 +def get_links_from(channel, pages, who_sells=0): + # td.t 没有这个就终止 + list_view = '{}{}/pn{}/'.format(channel, str(who_sells), str(pages)) + wb_data = requests.get(list_view) + time.sleep(1) + soup = BeautifulSoup(wb_data.text, 'lxml') + if soup.find('td', 't'): + for link in soup.select('td.t a.t'): + item_link = link.get('href').split('?')[0] + url_list.insert_one({'url': item_link}) + print(item_link) + # return urls + else: + # It's the last page ! 
+        pass
+
+# spider 2
+def get_item_info(url):
+    wb_data = requests.get(url)
+    soup = BeautifulSoup(wb_data.text, 'lxml')
+    no_longer_exist = '404' in soup.find('script', type="text/javascript").get('src').split('/')
+    if no_longer_exist:
+        pass
+    else:
+        title = soup.title.text
+        price = soup.select('span.price.c_f50')[0].text
+        date = soup.select('.time')[0].text
+        area = list(soup.select('.c_25d a')[0].stripped_strings) if soup.find_all('span', 'c_25d') else None
+        data = {
+            'title': title,
+            'price': price,
+            'date': date,
+            'area': area,
+            'url': url
+        }
+        item_info.insert_one(data)
+
+if __name__ == '__main__':
+    get_item_info('http://bj.58.com/pingbandiannao/24493393391935x.shtml')
\ No newline at end of file
diff --git "a/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/web_page_parsing.py" "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/web_page_parsing.py"
new file mode 100644
index 0000000..da7f136
--- /dev/null
+++ "b/week2\345\244\247\344\275\234\344\270\232\346\217\220\344\272\244/guozifeng/web_page_parsing.py"
@@ -0,0 +1,50 @@
+# coding: utf-8
+from bs4 import BeautifulSoup
+import requests
+import time
+import pymongo
+
+client = pymongo.MongoClient('localhost', 27017)
+project_market = client['project_market']
+url_list = project_market['url_list']
+item_info = project_market['item_info']
+
+def get_links_from(channel, pages, who_type=1):
+    url = '{}a{}o{}/'.format(channel, str(who_type), str(pages))
+    # url = 'http://bj.ganji.com/jiaju/a1o1/'
+    wb_data = requests.get(url)
+    soup = BeautifulSoup(wb_data.text, 'lxml')
+    if soup.find('div', 'pageBox'):
+        for link in soup.select('.ft-tit'):
+            item_link = link.get('href')
+            url_list.insert_one({'url': item_link})
+            print(item_link)
+    else:
+        return
+
+def get_item_info(url):
+    wb_data = requests.get(url)
+    soup = BeautifulSoup(wb_data.text, 'lxml')
+    if soup.find('h1', 'title-name'):
+        title = soup.select('.title-name')[0].text.split('-')[0]
+        post_time = soup.select('.pr-5')[0].text.strip().split(' ')[0]
+        type_info = soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(1) > span > a')[0].text
+        price = soup.select('.f22')[0].text
+        area = list(map(lambda x: x.text, soup.select('div.leftBox > div:nth-of-type(3) > div > ul > li:nth-of-type(3) > a')))
+        data = {
+            '商品标题': title,      # item title
+            '发帖时间': post_time,  # post time
+            '类型': type_info,      # category
+            '价格': price,          # price
+            '交易地点': area,       # trade location
+            'url': url
+        }
+        print(data)
+        item_info.insert_one(data)
+
+    else:
+        pass
+
+# url = 'http://biz.click.ganji.com/bizClick?url=pZwY0jCfsvFJshI6UhGGshPfUiqJpy7JIitQPHEYP1nOrH9vXaOCIAd0njTQPDDzn1cvwDndnHn3ENRjnbFAnHwanNDknH03rH0zwNP0njTQPjEdPWT3nWTdn1DdnjE3rHnzPHbvndkknjDVgjTknHELnjbQPHEQn7kknjDQP7kknjDQPHEYP1nOrH9vgjTknHDQn7kknjDvndkknjDQPj60njTQnHF0njTQnHEdPHD3rHcvnWmkPdkknjDQgjTknHD1nRkknj7BpywbpyOMgjTknH70njTQuv78ph-WUvdx0AI0njTQn7kknjDYPjNvnj9znjN1nHNkPj9On1cdrHm1gjTknHK0njTQnWc1sW01sWE3sW0OgjTknNdfXh-_UADfPi3kca6gpyObULI1cDONcjDksWTec7I5R1mY2iKK0ZK_uRI-mbVGIatdn108n1m92DVcRDdnsaK_pyV-cDI-mvVf2iKjpZFfUyNfPj98na3zPHmYsWbLc7P6uh7zpitdn108n1u0njTQPH9YgjTkniYQgjTkniYQgjTknNPC0hqVuE&v=2'
+# url = 'http://bj.ganji.com/jiaju/1956529909x.htm'
+# get_item_info(url)
\ No newline at end of file
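Note: the submission does not include a driver that wires the two spiders together. Below is a minimal sketch of how the pieces could be combined, using the Ganji channel URLs from link_extraction.py with the Ganji-specific functions in web_page_parsing.py. The main.py file name, the 1-100 page range, and the use of multiprocessing.Pool are illustrative assumptions, not part of the submitted code.

# main.py -- illustrative driver sketch; the page range and the
# multiprocessing.Pool usage below are assumptions, not part of
# this submission
from multiprocessing import Pool

from link_extraction import link_list
from web_page_parsing import get_links_from, get_item_info, url_list

# link_list is a whitespace-separated string of Ganji channel URLs
channel_list = link_list.split()

def crawl_channel(channel):
    # assumption: no channel has more than 100 listing pages;
    # get_links_from returns early once no div.pageBox is found
    for page in range(1, 101):
        get_links_from(channel, page)

if __name__ == '__main__':
    # stage 1: collect item URLs from every channel in parallel
    with Pool() as pool:
        pool.map(crawl_channel, channel_list)
    # stage 2: parse each collected item page into item_info
    for record in url_list.find():
        get_item_info(record['url'])

Keeping URL collection (spider 1) and detail parsing (spider 2) as separate stages means the url_list collection doubles as a persistent work queue: if stage 2 crashes partway through, the collected URLs survive in MongoDB and the crawl can resume without re-fetching the listing pages.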