# Crawlers
I first tinkered with crawlers back in school, on a toy project: crawling campus information to build a small on-campus search engine, integrated into a WeChat official account.
Once the core worked, I lost interest in polishing the details and rushed off to the next rabbit hole, like the joke about the programmer who finally assembles brush, ink, paper, and inkstone, writes "hello world", and walks away content.
Last year I answered a crawler question on Zhihu that picked up around 1000 upvotes, and private messages trickled in asking whether I fancied a job change. Baidu and Meituan still looked attractive back then; who knew the landscape would shift so much in a year or so, so turning them down seems to have shown some foresight. As pyspider's author binux put it, crawling is a small topic without many interesting parts, so I never seriously considered that line of work.
In spare moments I've written a few simple toy crawlers. I once scraped 10k torrents for a friend, reportedly of remarkable quality, the art films of their genre. I've also crawled Douban and Zhihu for data to train a chatbot.
To those asking for the torrents: no, you can't have them. The code is cursed; health first, folks.
For writing crawlers my tool of choice is pyspider: the selectors use jQuery syntax (via pyquery), debugging is very convenient, and the web UI makes it a pleasure to work with.
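As a taste of that selector style, here is a minimal standalone pyquery snippet (independent of pyspider; the HTML and URL are made up for illustration):

```python
from pyquery import PyQuery as pq

# A toy document, just to show the jQuery-style selector syntax.
doc = pq('<div class="book"><a href="http://book.douban.com/subject/1/">Title</a></div>')

for link in doc('div.book > a').items():  # CSS selectors, as in jQuery
    print link.attr.href  # -> http://book.douban.com/subject/1/
    print link.text()     # -> Title
```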
The demos below all grew out of friends' requests.
# About pyspider
pyspider's author writes in his blog:

"pyspider came out of the crawler backend of a vertical search engine I once built. We had to collect data from 200 sites (not all at once, since sites kept dying; around 100+ ran concurrently) and get each site's updates into our database within 5 minutes."

Like most open-source projects, pyspider began as the author's own real need; he then realized the solution would serve others too and open-sourced it. That grounding in a genuine need is why the project is so thoughtful in its details.
I highly recommend reading through the handful of posts on the author's blog, Binuxの杂货铺. They are the clearest, most approachable articles on crawlers I have read.
# Installation
## Installing and running on a Mac
```bash
virtualenv env          # create an isolated virtualenv
source env/bin/activate
pip install pyspider
brew install phantomjs  # JS runtime, needed for ajax-rendered pages
pyspider phantomjs      # start the phantomjs fetcher on port 25555
pyspider                # start all components
```
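That last command brings up every component in one process; the web UI then lives at http://localhost:5000, and scripts are written and debugged right in the browser.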
## Installing with Docker
```bash
docker pull binux/pyspider:master
sudo docker run -d -p 5000:5000 -v /opt/pyspider:/opt/pyspider binux/pyspider:master
```
After that you can write your crawler logic on port 5000; the web UI doubles as the control panel.
See Running-pyspider-with-Docker for details.
# Demos
### Demo 1
Crawl Douban books rated 8.0 or above.
```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-09-17
# Project: douban_book

from pyspider.libs.base_handler import *
import re


class Handler(BaseHandler):
    crawl_config = {
    }

    def on_start(self):
        self.crawl('http://book.douban.com/tag/', callback=self.index_page)

    def index_page(self, response):
        for each in response.doc('a[href^="http"]').items():
            if re.match(r"http://www.douban.com/tag/\w+", each.attr.href, re.U):
                # e.g. the romance tag: http://www.douban.com/tag/%E8%A8%80%E6%83%85/?focus=book
                self.crawl(each.attr.href, callback=self.list_page)

    def list_page(self, response):
        # normalize the tag URL to its book listing, i.e. .../tag/<tag>/book
        self.crawl(re.search(r'http://www.douban.com/tag/[^/]*', response.url).group() + '/book',
                   callback=self.list_page_detail)

    def list_page_detail(self, response):
        for each in response.doc('a[href^="http"]').items():
            if re.match(r'http://book.douban.com/subject/\d+', each.attr.href):
                self.crawl(each.attr.href, callback=self.page_detail)
        for each in response.doc('.next > a').items():  # follow the pagination
            self.crawl(each.attr.href, callback=self.list_page_detail)

    def page_detail(self, response):
        vote_average = response.doc('strong[property="v:average"]').text()
        if float(vote_average) >= 8:
            return {
                "url": re.search(r'http://book.douban.com/subject/\d+', response.url).group(),
                "title": response.doc('h1 > span').text(),
                "vote_average": vote_average,
                "vote_num": response.doc('span[property="v:votes"]').text(),
            }
```
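Each demo just `return`s a dict from its detail callback; pyspider picks that up and stores it in its built-in resultdb, viewable from the web UI. If you want to persist results somewhere else, overriding `on_result` on the handler is the usual hook. A minimal sketch, assuming a hypothetical output path (`/tmp/douban_books.jsonl`):

```python
import json

class Handler(BaseHandler):
    # ... the crawl callbacks from the demo above ...

    def on_result(self, result):
        # Called once for every value a callback returns; None means no result.
        if not result:
            return
        with open('/tmp/douban_books.jsonl', 'a') as f:  # hypothetical path
            f.write(json.dumps(result) + '\n')
        super(Handler, self).on_result(result)  # keep the default resultdb write
```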
### Demo 2
Crawl JD crowdfunding (京东众筹) project data.
```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-11-30
# Project: jdzc

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    def on_start(self):
        # The page count is built by ajax, which is a pain to parse here,
        # so hard-code it: 218 at the time of writing. With phantomjs it
        # could be fetched automatically; see the sketch after this block.
        max_page = 218
        for page_num in range(1, max_page + 1):
            self.crawl('http://z.jd.com/bigger/search.html?' + str(page_num),
                       callback=self.enter_item, method='POST',
                       data={'sort': 'zhtj', 'page': str(page_num)})

    def enter_item(self, response):
        for each in response.doc(".lr-lists .link-pic").items():
            self.crawl(each.attr.href, fetch_type='js', callback=self.item_detail)

    def item_detail(self, response):
        item_name = response.doc('.project-tilte > h1').text()  # project name (note the 'tilte' spelling in the selector)
        item_owner = response.doc('.font18 > a').text()  # project owner
        item_status = response.doc('.icon_pag').text()  # project status
        # Only projects still raising funds show a start date; finished ones
        # don't, so just the deadline is collected here.
        item_deadline = response.doc('div.font14 > .lh24 > span').eq(0).text()  # deadline
        item_target_fundraising = response.doc('div.font14 > .lh24 > span').eq(1).text()  # funding goal
        item_real_fundraising = response.doc('.font45').text()  # amount raised so far
        supporter_num = response.doc('.pr > p > strong').text()  # number of backers
        topic_num = response.doc('#topicBtn > span').text()  # topic count; rendered by js, hence phantomjs
        return {
            "item_name": item_name,
            "item_owner": item_owner,
            "item_status": item_status,
            "item_deadline": item_deadline,
            "item_target_fundraising": item_target_fundraising,
            "item_real_fundraising": item_real_fundraising,
            "supporter_num": supporter_num,
            "topic_num": topic_num,
        }
```
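As for the hard-coded `max_page`: now that phantomjs is around, one could crawl the first listing page with `fetch_type='js'` so the ajax pagination gets rendered, then read the count from it. A rough sketch to drop into the handler above; `find_max_page` is my own name, and the pagination selector is a guess that would need checking against the rendered page:

```python
def on_start(self):
    # Render the first listing page with phantomjs so the ajax-built
    # pagination exists before we try to read it.
    self.crawl('http://z.jd.com/bigger/search.html', fetch_type='js',
               callback=self.find_max_page)

def find_max_page(self, response):
    # Guessed selector: the link just before the "next" arrow usually
    # carries the last page number. Verify against the real markup.
    last = response.doc('a.next').prev().text()
    max_page = int(last) if last.isdigit() else 218  # fall back to the known value
    for page_num in range(1, max_page + 1):
        self.crawl('http://z.jd.com/bigger/search.html?' + str(page_num),
                   callback=self.enter_item, method='POST',
                   data={'sort': 'zhtj', 'page': str(page_num)})
```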
### Demo 3
Crawl a given Zhihu user's collections (收藏夹).
```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-24 21:56:49
# Project: zhihu_collection

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        start_url = "http://www.zhihu.com/people/liu-shi-jiu-84/collections"
        self.crawl(start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        for each in response.doc(".zm-profile-fav-item-title").items():
            collection_name = each.text()
            self.crawl(each.attr.href, callback=self.all_answer_pages,
                       save={"collection_name": collection_name})

    def all_answer_pages(self, response):
        pages = response.doc(".border-pager a")
        max_pages = int(pages[-2].text)  # second-to-last link holds the last page number
        all_collections_pages = [str(response.url) + "?page=" + str(page)
                                 for page in range(1, max_pages + 1)]
        for url in all_collections_pages:
            self.crawl(url, callback=self.collection_page,
                       save={"collection_name": response.save["collection_name"]})

    def collection_page(self, response):
        for each in response.doc(".toggle-expand").items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       save={"collection_name": response.save["collection_name"]})

    @config(priority=2)
    def detail_page(self, response):
        return {
            "collection_name": response.save["collection_name"],
            "url": response.url,
            "title": response.doc("title").text(),
            "body": response.doc('.autohide-false .zm-item-rich-text > div').text(),
        }
```
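A note on the decorators: `@every(minutes=24 * 60)` re-runs `on_start` once a day, `@config(age=10 * 24 * 60 * 60)` tells pyspider to treat a page fetched within the last 10 days as still fresh and skip re-crawling it, and `@config(priority=2)` moves detail pages ahead of listing pages in the queue. Together they let the spider poll for new answers daily without re-fetching everything it has already seen.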
# Appendix
### Installing PhantomJS on Ubuntu 12.04 64-bit
```bash
sudo wget https://phantomjs.googlecode.com/files/phantomjs-1.9.0-linux-x86_64.tar.bz2
sudo tar xjf phantomjs-1.9.0-linux-x86_64.tar.bz2
sudo mv phantomjs-1.9.0-linux-x86_64 /usr/local/share/  # the symlinks below expect it here
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/local/share/phantomjs
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/local/bin/phantomjs
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/bin/phantomjs
phantomjs --version
```