Skip to content

尝试 puppeteer 当爬虫 #13

@bravf

Description

@bravf
var puppeteer = require('puppeteer')
var fs = require('fs')

var city = 'bj'

// Sleep for a random duration to throttle requests between page visits.
// Generalized: maxMs is the upper bound in milliseconds (default 500,
// preserving the original hard-coded behavior).
// Returns a Promise that resolves after the random delay.
var sleep = (maxMs = 500) => {
  var sleepTime = Math.random() * maxMs
  console.log(`休息 ${sleepTime} ms`)

  // new Promise is needed here to adapt the setTimeout callback API.
  return new Promise(resolve => {
    setTimeout(resolve, sleepTime)
  })
}

// Scroll the given puppeteer page until its height stops growing, so that
// lazily-loaded list items are all rendered. Resolves when a 1s tick sees
// the same document.body.scrollHeight as the previous tick.
var autoScroll = (page) => {
  return page.evaluate(async () => {
    var step = 100000
    var lastHeight = 0
    var pn = 0

    await new Promise(resolve => {
      var timer = setInterval(() => {
        if (lastHeight === document.body.scrollHeight) {
          // Height unchanged since last tick: assume we hit the bottom.
          clearInterval(timer)
          resolve()
        } else {
          lastHeight = document.body.scrollHeight
          window.scrollBy(0, step)
          console.log(`第 ${pn ++} 页加载中...`)
        }
      }, 1000)
    })

    console.log('已经加载到底部了')
  })
}

// Visit the listing page, scroll it to the bottom, and collect every
// detail-page URL found under '.list a'.
// Returns: array of href strings.
var getDetailUrls = async () => {
  var {browser, page} = await createBrowserPage()
  try {
    var listUrl = `https://${city}.xx.com/pinpaigongyu/`
    console.log(listUrl)
    await page.goto(listUrl)
    await autoScroll(page)

    var urls = await page.evaluate(() => {
      return [].slice.call(document.querySelectorAll('.list a')).map(i => i.href)
    })

    return urls
  }
  finally {
    // BUG FIX: always release the browser, even when goto/evaluate throws.
    // A leaked headless chromium keeps the node process alive, which is a
    // likely cause of the "needs manual ctrl+c at the end" symptom.
    await page.close()
    await browser.close()
  }
}

// Visit each detail URL and scrape {url, name, code, city}. Each record is
// appended to the result file as soon as it is scraped, and all records are
// returned at the end. On a page error (e.g. "Session closed" after the
// page has lived too long), the browser/page pair is replaced and the crawl
// continues with the next URL.
var getDetails = async (detailUrls) => {
  var {browser, page} = await createBrowserPage()
  var details = []
  var count = detailUrls.length

  for (var i = 0; i < count; i++) {
    var url = detailUrls[i]
    console.log(`索引 ${i}/${count} , 开始处理 ${url}`)
    try {
      await page.goto(url)

      var detail = await page.evaluate(() => {
        return {
          url: location.href.split('?')[0],
          name: (document.querySelector('.logo-name') || document.querySelector('.apartment-info .name')).innerText,
          code: document.querySelector('.license').innerText,
        }
      })
      detail['city'] = city
      // BUG FIX: records were never pushed, so the function always returned
      // an empty array despite its name and return value.
      details.push(detail)
      appendFile(detail)
      await sleep()
    }
    catch (ex) {
      console.error(ex)
      console.log('下次使用新的 browser page!!!')
      // BUG FIX: close the dead browser before replacing it. The orphaned
      // chromium instance otherwise keeps the node process alive after the
      // crawl finishes (the "console never disconnects" symptom).
      try { await browser.close() } catch (closeErr) { console.error(closeErr) }
      var bp = await createBrowserPage()
      browser = bp.browser
      page = bp.page
    }
  }

  await page.close()
  await browser.close()
  return details
}

// Launch a headless browser with one fresh page whose in-page console.log
// output is forwarded to the node console (prefixed "PAGE LOG:").
// Returns: {browser, page}.
var createBrowserPage = async () => {
  var browser = await puppeteer.launch()
  var page = await browser.newPage()

  page.on('console', msg => {
    // BUG FIX: use the public ConsoleMessage.type() accessor instead of the
    // private `_type` field, which is not part of puppeteer's API contract
    // and silently stops matching on newer puppeteer versions.
    if (msg.type() === 'log') {
      console.log('PAGE LOG:', msg.text())
    }
  })

  return {browser, page}
}

// Append one scraped record to the per-city result file, one JSON
// document per line (JSON Lines format).
var appendFile = (json) => {
  var line = JSON.stringify(json) + '\n'
  fs.appendFileSync(`./${city}-result.js`, line)
}

// Script entry point: crawl the list page for detail URLs, then scrape
// every detail page. Any error aborts the run and is logged.
var main = async () => {
  var detailUrls = await getDetailUrls()
  var details = await getDetails(detailUrls)
  console.log('爬取结束,谢谢!!')
}

void main().catch(ex => {
  console.log(ex)
})

有两个问题:
1、当 page 实例运行时间过长,有时候会出现 Session closed. Most likely the page has been closed 的错误,简单的处理方法是当 catch 到错误时,直接构建一个新的 browser 和 page 来用。

2、当整个爬虫结束时候,有时候控制台不会断开连接,需要手动 ctrl + c 结束,不知为何~

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions