以可备案域名后缀查询为例.
PhantomJS 优势为: 可以模拟页面渲染(执行js).
request 优势就是效率高咯.
思路很简单:
const driver = require('node-phantom-simple');
const phantom = require('phantomjs-prebuilt');
driver.create({
path: phantom.path
}, (err, browser) => {
browser.createPage((err2, page) => {
page.set('settings.userAgent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3175.4 Safari/537.36');
page.open('URL地址,和谐你我他', (err3, status) => {
console.log('opened site? ', status);
/* global nextPage,previousPage */
let n = 0;
const domains = [];
const fib = () => {
// eslint-disable-next-line
page.evaluate(function () {
/* eslint-disable */
var i = 1;
var ele;
var result = [];
while (ele = document.getElementById(i)) {
result.push(ele.children[3].innerText.trim());
i++;
}
return {
data: result,
cur: ~~previousPage.toString().split(' = ')[2].split(';')[0] + 1
};
}, (err4, result) => {
/* eslint-enable */
const { data, cur } = result;
console.log('page %d done', cur);
if (cur === n + 1) {
domains.push(...data);
n += 1;
}
if (data.length === 0) {
console.log('-------');
console.log('total domains:', domains.length);
const arr = [...new Set(domains)];
console.log('unique domains:', arr.length);
console.log('-------');
console.log(arr.sort().join('\n'));
browser.exit();
process.exit();
}
});
setTimeout(() => {
// eslint-disable-next-line
page.evaluate(function () {
nextPage();
}, () => {
setTimeout(fib, 3000);
});
}, 3000);
};
fib();
});
});
});
其中几个注意点:
这些都是为了绕过知道创宇的反爬虫机制.
在这个示例里, 不推荐使用 phantom, 因为这样的界面上并没有动态的内容, 而且页面间通过传统表单形式进行跳转, 这就有一些可以利用的空间了.
比如pagesize
默认选项只有三种: 5, 10, 20. 但经过测试, 设置1000也能正常获取数据. 所以我们这里就直接设置1000,一次性搞定.
const request = require('request');
const cheerio = require('cheerio');
const gbk = require('gbk');
request({
url: 'URL地址,和谐你我他',
method: 'POST',
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3175.4 Safari/537.36'
},
form: {
domainName: '',
domainBlur: 0,
domainType: 0,
'page.pageSize': 1000,
pageNo: 1,
jumpPageNo: ''
},
encoding: null
}, (err, httpResponse, body) => {
const $ = cheerio.load(gbk.toString('utf-8', body));
const domains = [];
$('tr[id]').each((i, ele) => {
domains.push($(ele).children('td').eq(3).text().trim());
});
console.log('-------');
console.log('total domains:', domains.length);
const arr = [...new Set(domains)];
console.log('unique domains:', arr.length);
console.log('-------');
console.log(arr.sort().join('\n'));
process.exit();
});
完整的项目源码位于: https://github.com/willin/beian-domain/tree/develop