# Crawlers

I first started tinkering with crawlers back in school, as a toy project: scrape campus information, build a small on-campus search engine, and integrate it into a WeChat official account.

Once the basic idea worked, I had no interest in polishing the details and hurried off to the next rabbit hole. Like the joke about the programmer who finally gathers brush, ink, paper, and inkstone, writes "hello world", and walks away content.

Last year I answered a crawler question on Zhihu that picked up around 1,000 upvotes, and private messages trickled in asking whether I was interested in changing jobs. Baidu and Meituan still looked quite attractive at the time; who knew the landscape would shift so much in a year or so, so turning them down showed some foresight after all. As binux, the author of pyspider, put it, crawling amounts to just a handful of things and the interesting parts are few, so I never considered that kind of job.

In my spare time I have written a few simple crawler toys on and off. I once scraped 10,000 torrents for a friend; the quality was reportedly remarkably high, the art-house end of that particular genre. I have also crawled Douban and Zhihu and used the data to train a chatbot.

To those asking for the torrents: no, I will not send them. That code is toxic; look after your health, folks.

For writing crawlers I am partial to pyspider: selectors use jQuery syntax (via pyquery), debugging is very convenient, and the web UI makes the whole thing comfortable.
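
To give a feel for those jQuery-style selectors, here is a tiny standalone pyquery sketch with a made-up HTML snippet (inside pyspider, `response.doc` is the same kind of PyQuery object, so the selector syntax carries over directly):

```python
# -*- encoding: utf-8 -*-
from pyquery import PyQuery as pq

# a made-up snippet just to exercise the selectors
doc = pq('<div><a class="title" href="http://book.douban.com/subject/123/">Some book</a>'
         '<a href="http://example.com/other">other</a></div>')

link = doc('a.title')            # jQuery/CSS-style selector
print(link.text())               # -> Some book
print(link.attr('href'))         # -> http://book.douban.com/subject/123/

# .items() yields every match as its own PyQuery object,
# the same pattern the demos below use on response.doc
for a in doc('a[href^="http"]').items():
    print(a.attr.href)
```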

The demos below all came from friends' requests.

# About pyspider

The author of pyspider writes on his blog:

pyspider came out of the crawler backend of a vertical search engine I worked on before. We needed to collect data from 200 sites (not all at the same time, since sites die off; around 100+ were running at any given moment) and get each site's updates into our database within 5 minutes.

Like most open source projects, pyspider began with the author's own real need; he then found the solution useful to others and open-sourced it. It is exactly that original, real need that makes the project so thoughtful about the details.

I highly recommend that anyone interested read through the handful of posts on the author's blog: Binuxの杂货铺.

They are the simplest and clearest articles on crawling that I have read.

# Installation

## Installing and running on macOS

```bash
virtualenv env           # create an isolated virtualenv
source env/bin/activate
pip install pyspider
brew install phantomjs   # JS runtime, needed for ajax-heavy pages
pyspider phantomjs       # start the phantomjs fetcher proxy on port 25555
pyspider                 # start all components; the web UI listens on port 5000
```

## Installing with Docker

```bash
docker pull binux/pyspider:master
sudo docker run -d -p 5000:5000 -v /opt/pyspider:/opt/pyspider binux/pyspider:master
```

After that you can write your crawler logic through port 5000; the web UI itself serves as the control panel.

See Running-pyspider-with-Docker.

# Demos

### demo1: Douban books rated 8 or higher

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-09-17
# Project: douban_book

from pyspider.libs.base_handler import *
import re

class Handler(BaseHandler):
    crawl_config = {
    }

    def on_start(self):
        self.crawl('http://book.douban.com/tag/', callback=self.index_page)

    def index_page(self, response):
        # follow every tag link on the tag overview page,
        # e.g. the romance tag: http://www.douban.com/tag/%E8%A8%80%E6%83%85/?focus=book
        for each in response.doc('a[href^="http"]').items():
            if re.match(r"http://www.douban.com/tag/\w+", each.attr.href, re.U):
                self.crawl(each.attr.href, callback=self.list_page)

    def list_page(self, response):
        # strip the query string and jump straight to the tag's book listing
        self.crawl(re.search(r'http://www.douban.com/tag/[^/]*', response.url).group() + '/book',
                   callback=self.list_page_detail)

    def list_page_detail(self, response):
        # every link that looks like a book page
        for each in response.doc('a[href^="http"]').items():
            if re.match(r'http://book.douban.com/subject/\d+', each.attr.href):
                self.crawl(each.attr.href, callback=self.page_detail)
        # follow the "next page" link of the listing
        for each in response.doc('.next > a').items():
            self.crawl(each.attr.href, callback=self.list_page_detail)

    def page_detail(self, response):
        vote_average = response.doc('strong[property="v:average"]').text()
        # books with too few ratings have no average; guard before float()
        if vote_average and float(vote_average) >= 8:
            return {
                "url": re.search(r'http://book.douban.com/subject/\d+', response.url).group(),
                "title": response.doc('h1 > span').text(),
                "vote_average": vote_average,
                "vote_num": response.doc('span[property="v:votes"]').text()
            }
```

### demo2: JD Crowdfunding (z.jd.com) data

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-11-30
# Project: jdzc

from pyspider.libs.base_handler import *
import re

class Handler(BaseHandler):
    crawl_config = {
    }

    def on_start(self):
        # The total page count sits behind ajax, which is a pain, so it is entered by hand.
        # It was 218 at the time of writing; with phantomjs it could be fetched
        # automatically (see the sketch after this block).
        max_page = 218
        for page_num in range(1, max_page + 1):
            # the '?<page_num>' suffix only makes each task URL unique so pyspider does
            # not de-duplicate them; the real page number travels in the POST data
            self.crawl('http://z.jd.com/bigger/search.html?' + str(page_num),
                       callback=self.enter_item, method='POST',
                       data={'sort': 'zhtj', 'page': str(page_num)})

    def enter_item(self, response):
        for each in response.doc(".lr-lists .link-pic").items():
            self.crawl(each.attr.href, fetch_type='js', callback=self.item_detail)

    def item_detail(self, response):
        item_name = response.doc('.project-tilte > h1').text()   # project name
        item_owner = response.doc('.font18 > a').text()          # project owner
        item_status = response.doc('.icon_pag').text()           # project status
        # only projects still raising funds show a launch date; finished ones do not, so it is skipped
        item_deadline = response.doc('div.font14 > .lh24 > span').eq(0).text()            # deadline
        item_target_fundraising = response.doc('div.font14 > .lh24 > span').eq(1).text()  # funding goal
        item_real_fundraising = response.doc('.font45').text()   # amount raised so far
        supporter_num = response.doc('.pr > p > strong').text()  # number of backers
        topic_num = response.doc('#topicBtn > span').text()      # topic count; only present after js runs, hence phantomjs
        return {
            "item_name": item_name,
            "item_owner": item_owner,
            "item_status": item_status,
            "item_deadline": item_deadline,
            "item_target_fundraising": item_target_fundraising,
            "item_real_fundraising": item_real_fundraising,
            "supporter_num": supporter_num,
            "topic_num": topic_num
        }
```
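
As the comment in `on_start` says, the hard-coded `max_page = 218` could in principle be discovered at crawl time by rendering the first search page with phantomjs and reading the pager. A rough sketch of that idea follows; note that the `.p-num a` pager selector is my assumption, not something verified against z.jd.com, so it would need to be checked in the pyspider web UI debugger:

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Sketch only: discover max_page at crawl time instead of hard-coding it.

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {}

    def on_start(self):
        # render the first search page with phantomjs so the ajax-built pager exists
        self.crawl('http://z.jd.com/bigger/search.html', fetch_type='js',
                   callback=self.find_max_page)

    def find_max_page(self, response):
        # ASSUMPTION: page-number links live under '.p-num a'; adjust to the real markup
        numbers = [int(a.text()) for a in response.doc('.p-num a').items()
                   if a.text().isdigit()]
        max_page = max(numbers) if numbers else 1
        for page_num in range(1, max_page + 1):
            self.crawl('http://z.jd.com/bigger/search.html?' + str(page_num),
                       callback=self.enter_item, method='POST',
                       data={'sort': 'zhtj', 'page': str(page_num)})

    # enter_item and item_detail stay exactly as in the demo above
```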

### demo3: a given Zhihu user's collections

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2015-03-24 21:56:49
# Project: zhihu_collection

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        start_url = "http://www.zhihu.com/people/liu-shi-jiu-84/collections"
        self.crawl(start_url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # one task per collection, carrying the collection name along in `save`
        for each in response.doc(".zm-profile-fav-item-title").items():
            collection_name = each.text()
            self.crawl(each.attr.href, callback=self.all_answer_pages,
                       save={"collection_name": collection_name})

    def all_answer_pages(self, response):
        # the second-to-last pager link holds the maximum page number
        pages = response.doc(".border-pager a")
        max_pages = int(pages[-2].text)
        for page in range(1, max_pages + 1):
            self.crawl(str(response.url) + "?page=" + str(page),
                       callback=self.collection_page,
                       save={"collection_name": response.save["collection_name"]})

    def collection_page(self, response):
        # each answer in the collection links to its question page
        for each in response.doc(".toggle-expand").items():
            self.crawl(each.attr.href, callback=self.detail_page,
                       save={"collection_name": response.save["collection_name"]})

    @config(priority=2)
    def detail_page(self, response):
        return {
            "collection_name": response.save["collection_name"],
            "url": response.url,
            "title": response.doc("title").text(),
            "body": response.doc('.autohide-false .zm-item-rich-text > div').text()
        }
```
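
Whatever `detail_page` returns is kept in pyspider's built-in result store and can be browsed and exported from the web UI. If you would rather dump the records somewhere else as they arrive (say, a JSON-lines file to feed the chatbot training mentioned earlier), `BaseHandler` provides an `on_result` hook that can be overridden. A minimal sketch; the file path is only an example:

```python
# -*- encoding: utf-8 -*-
import json

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    # ... the crawling callbacks from the demo above go here unchanged ...

    def on_result(self, result):
        # on_result fires for every callback, including ones that return nothing
        if not result:
            return
        # append each record as one JSON line (example path, adjust as needed)
        with open('/tmp/zhihu_collection.jsonl', 'a') as f:
            f.write(json.dumps(result) + '\n')
        # keep the default behaviour so results still show up in the web UI
        super(Handler, self).on_result(result)
```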

# Appendix

### Installing PhantomJS on Ubuntu 12.04 64-bit

```bash
# the symlink targets below assume the archive is unpacked under /usr/local/share
cd /usr/local/share
sudo wget https://phantomjs.googlecode.com/files/phantomjs-1.9.0-linux-x86_64.tar.bz2
sudo tar xjf phantomjs-1.9.0-linux-x86_64.tar.bz2
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/local/share/phantomjs
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/local/bin/phantomjs
sudo ln -s /usr/local/share/phantomjs-1.9.0-linux-x86_64/bin/phantomjs /usr/bin/phantomjs
phantomjs --version
```