-
Notifications
You must be signed in to change notification settings - Fork 0
Open
Description
var puppeteer = require('puppeteer')
var fs = require('fs')
// City code shared by the whole script: it is interpolated into the listing URL
// as the subdomain, stamped onto every scraped record, and used as the prefix of
// the result file name.
var city = 'bj'
/**
 * Pause for a random interval of up to 500 ms.
 *
 * Logs the chosen delay, then hands back a Promise that resolves (with no
 * value) once that many milliseconds have elapsed.
 *
 * @returns {Promise<void>}
 */
var sleep = () => {
  const delayMs = 500 * Math.random()
  console.log(`休息 ${delayMs} ms`)
  return new Promise(done => setTimeout(() => done(), delayMs))
}
/**
 * Keep scrolling the page downwards until its height stops growing.
 *
 * The callback is handed to `page.evaluate`, so it must stay self-contained:
 * it only touches `document`, `window`, timers and `console`, and captures
 * nothing from this module.
 *
 * Once per second it compares `document.body.scrollHeight` with the height seen
 * on the previous tick. While the height is still changing it scrolls by a large
 * fixed distance and logs a progress line; as soon as two consecutive ticks see
 * the same height it stops the timer and logs that the bottom was reached.
 *
 * @param {object} page - object exposing `evaluate(fn)` (a Puppeteer page in this script)
 * @returns {*} whatever `page.evaluate` returns
 */
var autoScroll = (page) => {
  return page.evaluate(async () => {
    await new Promise(finish => {
      const step = 100000
      let pageIndex = 0
      let lastHeight = 0
      const ticker = setInterval(() => {
        const currentHeight = document.body.scrollHeight
        if (lastHeight !== currentHeight) {
          // Height changed since the last tick: remember it and scroll again.
          lastHeight = currentHeight
          window.scrollBy(0, step)
          console.log(`第 ${pageIndex++} 页加载中...`)
          return
        }
        // Height is stable: nothing more was loaded, so stop polling.
        clearInterval(ticker)
        finish()
      }, 1000)
    })
    console.log('已经加载到底部了')
  })
}
/**
 * Open the listing page for `city`, scroll it to the bottom, and collect the
 * links to the detail pages.
 *
 * A dedicated browser is created for this step and is always closed again,
 * including when navigation, scrolling or link extraction throws.
 *
 * @returns {Promise<string[]>} the `href` of every element matching `.list a`
 */
var getDetailUrls = async () => {
  const { browser, page } = await createBrowserPage()
  try {
    const listUrl = `https://${city}.xx.com/pinpaigongyu/`
    console.log(listUrl)
    await page.goto(listUrl)
    await autoScroll(page)
    return await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.list a'), link => link.href)
    })
  } finally {
    // Closing the browser also closes its pages. Doing it in `finally` keeps a
    // failed crawl from leaving a Chromium process behind.
    await browser.close()
  }
}
/**
 * Visit every detail page, extract its record, and append it to the result file.
 *
 * For each URL the page is loaded and a record `{url, name, code}` is read from
 * the DOM, `city` is added to it, and the record is written with `appendFile`
 * and collected into the returned array. A random `sleep()` follows every
 * successful page.
 *
 * When handling a URL fails, the error is logged, that URL is skipped, and a
 * fresh browser/page pair is created for the remaining URLs. The previous
 * browser is closed first so that it is not left running.
 *
 * @param {string[]} detailUrls - detail page URLs to visit, in order
 * @returns {Promise<object[]>} the records that were scraped successfully
 */
var getDetails = async (detailUrls) => {
  // Best-effort close: a browser whose session already died may reject here,
  // and that must neither abort the crawl nor mask the original error.
  const closeQuietly = async (instance) => {
    try {
      await instance.close()
    } catch (closeErr) {
      console.error(closeErr)
    }
  }

  let { browser, page } = await createBrowserPage()
  const details = []
  try {
    const count = detailUrls.length
    for (let i = 0; i < count; i++) {
      const url = detailUrls[i]
      console.log(`索引 ${i}/${count} , 开始处理 ${url}`)
      try {
        await page.goto(url)
        // NOTE(review): a page without a `.license` element (or without either
        // name element) makes this callback throw, which is then handled like
        // any other failure below.
        const detail = await page.evaluate(() => {
          return {
            url: location.href.split('?')[0],
            name: (document.querySelector('.logo-name') || document.querySelector('.apartment-info .name')).innerText,
            code: document.querySelector('.license').innerText,
          }
        })
        detail['city'] = city
        appendFile(detail)
        details.push(detail)
        await sleep()
      } catch (ex) {
        console.error(ex)
        console.log('下次使用新的 browser page!!!')
        // Release the old browser before replacing it; otherwise every failure
        // leaves one more Chromium process running after the crawl ends.
        await closeQuietly(browser)
        const bp = await createBrowserPage()
        browser = bp.browser
        page = bp.page
      }
    }
  } finally {
    await closeQuietly(browser)
  }
  return details
}
/**
 * Launch a new browser and open one page in it.
 *
 * `console.log` calls made inside the page are forwarded to this process's
 * console with a `PAGE LOG:` prefix; other console message types are ignored.
 *
 * @returns {Promise<{browser: object, page: object}>} the launched browser and
 *   its page; the caller is responsible for closing the browser
 */
var createBrowserPage = async () => {
  const browser = await puppeteer.launch()
  let page
  try {
    page = await browser.newPage()
  } catch (err) {
    // Do not leave a browser running when no page could be opened in it.
    await browser.close()
    throw err
  }
  page.on('console', msg => {
    // Use the public accessor: the `_type` field is a private implementation
    // detail and is not available on every Puppeteer version.
    if (msg.type() === 'log') {
      console.log('PAGE LOG:', msg.text())
    }
  })
  return { browser, page }
}
// Append one record to the per-city result file as a single line of JSON.
var appendFile = (json) => {
  const outputPath = `./${city}-result.js`
  const line = `${JSON.stringify(json)}\n`
  fs.appendFileSync(outputPath, line)
}
// Entry point: collect the detail page URLs, scrape each of them, then report
// completion. Any error from either step is logged instead of being thrown.
void (async () => {
  const detailUrls = await getDetailUrls()
  await getDetails(detailUrls)
  console.log('爬取结束,谢谢!!')
})().catch(ex => {
  console.log(ex)
})
有两个问题:
1、当 page 实例运行时间过长时,有时候会出现 Session closed. Most likely the page has been closed 的错误。简单的处理方法是:当 catch 到错误时,直接构建一个新的 browser 和 page 来用。
2、当整个爬虫结束的时候,有时候控制台不会断开连接,需要手动按 Ctrl + C 结束,原因暂不清楚。
Metadata
Metadata
Assignees
Labels
No labels