用node写小爬虫

之前用php写过小爬虫,其实本质就是获取网页中的Dom结构,然后分析里面的元素,提取出自己想要的东西。最近想找东西练练node,就想到用node做小爬虫。去看了慕课网scott老师讲的课程,感觉很棒,学到了不少东西。自己跟着做了一遍,爬了下慕课网的课程。明天准备再写一个脚本爬自己的博客。下面是学到的东西的总结。感谢scott老师。


包模块的选择

这里除了http核心模块外,还用到了bluebird和cheerio这两个模块。bluebird提供了promise的实现,让异步调用写起来更方便。cheerio则可以让我们像用jquery一样方便地操作Dom。我们的目的是:分析dom结构,操作dom节点,获取我们想要的内容,并且拼接成下面这样的数据结构:

1
2
3
4
5
6
7
8
9
10
// Shape of the data assembled for one course page. The identifiers on the
// right-hand side are placeholders for the values extracted from the page.
var coursesData = {
    title:title,
    number:number,
    // One entry per chapter.
    videos:[
        {
            chapterTitle:chapterTitle,
            // One entry per video in the chapter.
            chapterVideos:[
                { title:title, id:id }
            ]
        }
    ]
}


直接上代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
var http = require('http')
var Promise = require('bluebird')
var cheerio = require('cheerio')
// Common prefix of the course page URLs; a course id is appended to it.
var baseUrl = 'http://www.imooc.com/learn/'
// Ids of the courses to crawl (each one is appended to baseUrl).
var videoIds = [348,259,197,134,75]

/**
 * Extract the course outline from the HTML of one course page.
 *
 * @param {string} data - Raw HTML of the course page.
 * @returns {{title: string, number: number, videos: Array}} The course title,
 *   the count read from the third `.static-item` (NaN when it cannot be
 *   parsed), and one `{chapterTitle, chapterVideos}` entry per `.chapter`
 *   element, where `chapterVideos` is a list of `{title, id}` objects.
 */
function filterChapter(data){
    var $ = cheerio.load(data);
    var title = $('#main .w .path span').text().trim();
    var number = parseInt($($('.statics .static-item')[2]).find('strong').text().trim(),10);

    var courseData = {
        title:title,
        number:number,
        videos:[]
    };

    $('.chapter').each(function(){
        var chapter = $(this);
        var chapterData = {
            chapterTitle:chapter.find('strong').text().trim(),
            chapterVideos:[]
        };

        chapter.find('.video').children('li').each(function(){
            var video = $(this).find('.studyvideo');
            // The id is the last non-empty path segment of the link
            // (presumably hrefs look like "/video/<id>" - confirm against the
            // live markup). Indexing split('/') at 1 would return the first
            // segment instead, because an href that starts with "/" yields an
            // empty string at index 0.
            // A missing href yields an undefined id rather than a TypeError.
            var segments = (video.attr('href') || '').split('/').filter(Boolean);

            chapterData.chapterVideos.push({
                title:video.text().trim(),
                id:segments[segments.length - 1]
            });
        });

        courseData.videos.push(chapterData);
    });

    return courseData;
}

/**
 * Print the crawl result to the console: first one summary line per course,
 * then every course's chapters and the videos inside each chapter.
 *
 * @param {Array} coursesData - Course objects with `number`, `title` and
 *   `videos` (a list of `{chapterTitle, chapterVideos}` entries).
 */
function printChapter(coursesData){
    function logSummary(course){
        console.log(course.number + '个人学过 ' + course.title+' \n');
    }

    function logVideo(video){
        console.log(' ['+video.id+']'+video.title + '\n');
    }

    function logChapter(chapter){
        console.log(chapter.chapterTitle+'\n');
        chapter.chapterVideos.forEach(logVideo);
    }

    function logOutline(course){
        console.log(course.title+"###"+"\n");
        course.videos.forEach(logChapter);
    }

    // All summaries are printed before any outline.
    coursesData.forEach(logSummary);
    coursesData.forEach(logOutline);
}

/**
 * Download a page with http.get and expose the result as a promise.
 *
 * @param {string} url - Address of the page to fetch.
 * @returns {Promise<string>} Resolves with the response body decoded as
 *   UTF-8; rejects with the error emitted by the request or the response.
 */
function getPageAsync(url){
    return new Promise(function(resolve,reject){
        function fail(e){
            console.log('获取失败');
            reject(e);
        }

        http.get(url,function(res){
            var chunks = [];

            // Decode at the stream level: converting each Buffer chunk to a
            // string on its own corrupts a multi-byte character that happens
            // to be split across two chunks.
            res.setEncoding('utf8');

            res.on('data',function(chunk){
                chunks.push(chunk);
            });
            res.on('end',function(){
                resolve(chunks.join(''));
            });
            // An error while reading the body must settle the promise too.
            res.on('error',fail);
        }).on('error',fail);
    });
}

// Start one download per course id; the requests run concurrently.
var fetchCourseArr = videoIds.map(function(id){
    console.log('开始爬取网站'+baseUrl+id);
    return getPageAsync(baseUrl + id);
});

// Wait for every page, parse each one, then print the courses ordered by
// their `number` field (ascending).
Promise.all(fetchCourseArr)
    .then(function(pages){
        var coursesData = pages.map(function(html){
            return filterChapter(html);
        });

        coursesData.sort(function(a,b){
            return a.number-b.number;
        });

        printChapter(coursesData);
    })
    // Promise.all rejects as soon as any download fails; report the failure
    // instead of leaving an unhandled rejection.
    .catch(function(e){
        console.error('爬取失败: ' + (e && e.message ? e.message : e));
    });

中间收获

用request发起请求

如何用node伪造一个请求。比如就在本地提交评论到慕课网?

注意这里的headers可以直接先去评论一个,去看看请求headers,然后复制过来就可以啦。

中间有个错误:模块名querystring全部是小写,虽然官方文档介绍它的时候写的是queryString,这个地方要注意。然后可以用http.request方法发起一个请求,最后把数据post过去就可以了。Cookie那个地方要用自己本地浏览器里的cookie。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
var http = require('http')
var querystring = require('querystring')

// Form body of the comment request, url-encoded.
var postData = querystring.stringify({
    'content':'老师你真帅,测试',
    'cid':348
});

// Request options. The headers mirror what the browser sends when posting a
// comment; 'Cookie' is a placeholder to be replaced with a real session cookie.
var options = {
    hostname:'www.imooc.com',
    port:80,
    path:'/course/docomment',
    method:'POST',
    headers:{
        'Accept':'application/json, text/javascript, */*; q=0.01',
        'Accept-Encoding':'gzip, deflate',
        'Accept-Language':'zh-CN,zh;q=0.8',
        'Connection':'keep-alive',
        // Must equal the byte length of the body actually written. A
        // hard-coded number goes stale as soon as the comment text changes.
        'Content-Length':Buffer.byteLength(postData),
        'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie':'......',
        'Host':'www.imooc.com',
        'Origin':'http://www.imooc.com',
        'Referer':'http://www.imooc.com/comment/348',
        'User-Agent':'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/40.0.2214.111 Chrome/40.0.2214.111 Safari/537.36',
        'X-Requested-With':'XMLHttpRequest'
    }
};

// Send the comment and log what the server answers.
var req = http.request(options,function(res){
    console.log('res statecode'+res.statusCode);
    console.log('res headers'+ JSON.stringify(res.headers));

    // The body itself is not used; only the chunk type is logged.
    res.on('data',function(chunk){
        console.log(Buffer.isBuffer(chunk));
        console.log(typeof chunk);
    });

    res.on('end',function(){
        console.log('评论完毕');
    });

    res.on('error',function(e){
        // `message` is a string property; calling it would throw a TypeError.
        console.log('评论出现错误'+e.message);
    });
});

// Connection-level failures are emitted on the request object, not on the
// response; without this handler they surface as an uncaught 'error' event.
req.on('error',function(e){
    console.log('评论出现错误'+e.message);
});

req.write(postData);
req.end();

这中间又出了个错,就是headers里的Content-Length和实际post过去的数据长度不一致,导致socket hang up。解决方法就是把Content-Length改成postData的实际字节长度。


关于promise

promise的话,新版本的node已经原生支持了(Promise是全局对象,不需要require),旧版本可以用bluebird这类库。然后再看看promise好在哪里,代码或许可以解释很多。下面是三个小球,我们让它们一个接着一个产生动画。

看下dom结构:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
<style>
/* Shared shape of the three balls: a 30px square rounded into a circle. */
.box{
width:30px;
height:30px;
-webkit-border-radius:50%;
-moz-border-radius:50%;
border-radius:50%;
}
/* One colour per ball. */
.box1{
background:red;
}
.box2{
background:yellow;
}
.box3{
background:blue;
}
</style>

<!-- margin-left is set inline because the animation scripts read the
     starting position from element.style.marginLeft. -->
<div class="box box1" style="margin-left:0px"></div>
<div class="box box2" style="margin-left:0px"></div>
<div class="box box3" style="margin-left:0px"></div>

没用promise之前,callback嵌套。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
<script src="node_modules/bluebird/js/browser/bluebird.js"></script>
<script>
// Look up the three ball elements by their class names.
var box1 = document.querySelector(".box1");
var box2 = document.querySelector(".box2");
var box3 = document.querySelector(".box3");

/**
 * Move a ball horizontally, one pixel per 13ms tick, until its inline
 * margin-left equals `distance`, then invoke `callback`.
 *
 * @param {Object} ball - Element whose `style.marginLeft` is read and updated.
 * @param {number} distance - Target margin-left in pixels.
 * @param {Function} [callback] - Called once the target is reached.
 */
function animate(ball,distance,callback){
    setTimeout(function(){
        // An element without an inline margin-left parses to NaN, which never
        // equals the target and would reschedule forever; start from 0 instead.
        var marginLeft = parseInt(ball.style.marginLeft,10) || 0;

        if(marginLeft === distance){
            callback && callback();
            return;
        }

        // Step one pixel towards the target.
        marginLeft += marginLeft < distance ? 1 : -1;
        console.log(marginLeft);
        ball.style.marginLeft = marginLeft + "px";
        animate(ball,distance,callback);
    },13)
}
// Callback nesting: each animation is started from inside the callback of the
// previous one, so six sequential steps need six levels of nesting.
animate(box1,100,function(){
animate(box2,200,function(){
animate(box3,300,function(){
animate(box3,150,function(){
animate(box2,150,function(){
animate(box1,150,function(){
// Runs after the last ball has reached its target.
console.log("animate done");
})
})
})
})
})
})
</script>

用了promise后,同样是异步调用,但写法却像同步代码一样是线性的。是不是看着很好?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
<!DOCTYPE html>
<script src="node_modules/bluebird/js/browser/bluebird.js"></script>
<script>
// Look up the three ball elements by their class names.
var box1 = document.querySelector(".box1");
var box2 = document.querySelector(".box2");
var box3 = document.querySelector(".box3");


// Local alias for the global Promise constructor (presumably the one provided
// by the bluebird browser script loaded above - confirm).
var Promise = window.Promise;

/**
 * Promise-based animation step: move a ball one pixel per 5ms tick until its
 * inline margin-left equals `distance`.
 *
 * @param {Object} ball - Element whose `style.marginLeft` is read and updated.
 * @param {number} distance - Target margin-left in pixels.
 * @returns {Promise} Resolves (with no value) once the target is reached.
 */
function promiseAnimate(ball,distance){
    return new Promise(function(resolve){
        function step(){
            setTimeout(function(){
                // An element without an inline margin-left parses to NaN,
                // which never equals the target and would leave the promise
                // pending forever; start from 0 instead.
                var marginLeft = parseInt(ball.style.marginLeft,10) || 0;

                if(marginLeft === distance){
                    resolve();
                    return;
                }

                // Step one pixel towards the target.
                marginLeft += marginLeft < distance ? 1 : -1;
                ball.style.marginLeft = marginLeft + "px";
                step();
            },5)
        }
        step();
    })
}

// The same six-step sequence written as a flat chain: every then() handler
// returns the promise of the next animation, so the following step starts only
// after that promise has resolved.
promiseAnimate(box1,100)
.then(function(){
return promiseAnimate(box2,200);
})
.then(function(){
return promiseAnimate(box3,300);
})
.then(function(){
return promiseAnimate(box3,150);
})
.then(function(){
return promiseAnimate(box2,150);
})
.then(function(){
return promiseAnimate(box1,150);
});

</script>

官网上关于promise有个例子,里面我看到了这个insertAdjacentHTML,觉得好棒。大部分浏览器都支持了。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
<script>
'use strict';
// Number of promises created so far; used to label the log lines.
var promiseCount = 0;

/**
 * Create one promise that is fulfilled with its own sequence number after a
 * random delay, appending a line to the #log element at each stage so the
 * order of synchronous and asynchronous steps becomes visible.
 */
function testPromise() {
    var thisPromiseCount = ++promiseCount;
    var log = document.getElementById('log');

    // Append one labelled line to the end of the log element.
    function report(label, text) {
        log.insertAdjacentHTML('beforeend', label + text);
    }

    // Executor: runs synchronously inside the Promise constructor and
    // schedules the fulfilment 1 to 3 seconds later.
    function executor(resolve, reject) {
        report(thisPromiseCount, ') Promise started (<small>Async code started</small>)<br/>');
        window.setTimeout(function() {
            resolve(thisPromiseCount);
        }, Math.random() * 2000 + 1000);
    }

    // Fulfilment handler: logs the value the promise was resolved with.
    function onFulfilled(val) {
        report(val, ') Promise fulfilled (<small>Async code terminated</small>)<br/>');
    }

    // Rejection handler: logs the reason to the console.
    function onRejected(reason) {
        console.log('Handle rejected promise ('+reason+') here.');
    }

    report(thisPromiseCount, ') Started (<small>Sync code started</small>)<br/>');

    var p1 = new Promise(executor);
    p1.then(onFulfilled).catch(onRejected);

    report(thisPromiseCount, ') Promise made (<small>Sync code terminated</small>)<br/>');
}
</script>

总结

觉得学到挺多东西的。其实感觉node不知道怎么入手,书也不知道看啥好,有点迷茫。有看一些node书,感觉大部分都是讲语法和API,还是多做点东西,实践中学习,心里稍微踏实点。再次谢谢scott老师。