
Scrape a website asynchronously using a list of tor circuits #44

@ghost

Description

I want to scrape a website asynchronously using a list of tor circuits with different exit nodes, making sure each exit node makes a request only once every 5 seconds (there's a sketch of that throttling after the full script below).

For testing purposes, I'm using the website https://books.toscrape.com/ and lowering the sleep time, the number of circuits, and the number of pages to scrape.

It works fine without tor, but I'm getting the following error when I use tor:

2022-09-06 11:08:49,380 [DEBUG] Loaded 10 authorities dir
2022-09-06 11:08:49,383 [DEBUG] Loaded 141 fallbacks dir
2022-09-06 11:08:49,383 [DEBUG] Using selector: EpollSelector
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
{}
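
I suspect tor.create_circuit() returns a context manager that has to be entered before create_stream() can be called on the actual circuit. torpy's README uses a nested-with pattern along these lines (a minimal synchronous sketch; the hostname, port, and byte count are just for illustration):

from torpy import TorClient

hostname = 'ifconfig.me'  # illustrative target
with TorClient() as tor:
    # Entering the context manager yields the actual 3-hop circuit
    with tor.create_circuit(3) as circuit:
        # Open a TCP stream through the circuit to hostname:80
        with circuit.create_stream((hostname, 80)) as stream:
            stream.send(b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % hostname.encode())
            print(stream.recv(1024))

I can't see how to carry that pattern over to my async scraper, though. Here's the full script: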


import asyncio
import aiohttp
import logging

from docopt import docopt
from torpy import TorClient
from typing import Dict, List, Optional, Tuple

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def main():
    """
    Usage:
        scraper.py <url>... [--tor]
        scraper.py -h | --help

    Options:
        -h --help   Show this screen.
        --tor       Use tor to scrape website
    """
    args = docopt(main.__doc__)
    urls = args['<url>']
    tor = args['--tor']
    scrape_website(urls, tor)


def scrape_test_website() -> None:
    TEST_URL = "https://books.toscrape.com/catalogue/"
    urls = [f"{TEST_URL}page-{i}.html" for i in range(1, 5)]
    print(scrape_website(urls, tor=True))


def scrape_website(urls: List[str], tor: bool = False) -> Dict:
    if tor:
        scraper = TorWebScraper(urls)
    else:
        scraper = WebScraper(urls)
    asyncio.run(scraper.run())
    return scraper.master_dict


class WebScraper:
    def __init__(self, urls: List[str]):
        self.urls = urls
        self.all_data = []
        self.master_dict = {}

    async def fetch(self, url: str) -> Optional[Tuple[str, str]]:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    text = await response.text()
                    return url, text
        except Exception as e:
            logger.error(e)

    async def run(self) -> None:
        tasks = []
        for url in self.urls:
            tasks.append(self.fetch(url))
        self.all_data = await asyncio.gather(*tasks)
        for data in self.all_data:
            if data is not None:
                url, raw_html = data
                self.master_dict[url] = {'raw_html': raw_html}


def get_circuits(n: int = 2) -> List:
    """
    Get a list of one-hop tor circuits with different nodes
    """
    circuits = []
    with TorClient() as tor:
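        # NOTE: tor.create_circuit() is called here without being entered;
        # given the '_GeneratorContextManager' error above, it seems to return
        # a context manager, and the TorClient itself closes as soon as this
        # function returns.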
        for _ in range(n):
            circuits.append(tor.create_circuit())
    return circuits


class TorWebScraper(WebScraper):
    def __init__(self, urls: List[str]):
        super().__init__(urls)
        self.circuits = get_circuits(2)

    async def fetch(self, url: str) -> Optional[Tuple[str, str]]:
        try:
            async with aiohttp.ClientSession() as session:
                for circuit in self.circuits:
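                    # The error above is raised here: circuit is the raw,
                    # un-entered object returned by tor.create_circuit()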
                    async with circuit.create_stream() as stream:
                        async with session.get(url, proxy=stream.proxy) as response:
                            await asyncio.sleep(20e-3)  # lowered from 5 s for testing
                            text = await response.text()
                            return url, text
        except Exception as e:
            logger.error(e)


if __name__ == '__main__':
    #main()
    scrape_test_website()
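
For reference, the per-exit-node throttling I described at the top would look roughly like this as a standalone asyncio sketch (CircuitThrottle, RATE_LIMIT_S, and the round-robin assignment are illustrative names, not part of torpy or the script above):

import asyncio
import time
from typing import List

RATE_LIMIT_S = 5.0  # one request per exit node every 5 seconds

class CircuitThrottle:
    """Serializes requests for one circuit, spacing them RATE_LIMIT_S apart."""

    def __init__(self) -> None:
        self._lock = asyncio.Lock()
        self._last_request = 0.0

    async def wait(self) -> None:
        # Holding the lock while sleeping serializes all waiters on this circuit
        async with self._lock:
            delay = self._last_request + RATE_LIMIT_S - time.monotonic()
            if delay > 0:
                await asyncio.sleep(delay)
            self._last_request = time.monotonic()

async def fetch_throttled(throttle: CircuitThrottle, url: str) -> str:
    await throttle.wait()  # enforce the 5-second spacing for this circuit
    return url             # placeholder for the real tor-backed request

async def demo(urls: List[str]) -> None:
    throttles = [CircuitThrottle() for _ in range(2)]  # one throttle per circuit
    tasks = [fetch_throttled(throttles[i % len(throttles)], url)
             for i, url in enumerate(urls)]
    print(await asyncio.gather(*tasks))

# asyncio.run(demo([f"page-{i}.html" for i in range(6)]))

The lock serializes requests per circuit, so each exit node is used at most once every RATE_LIMIT_S seconds while different circuits still run concurrently. The asyncio.sleep(20e-3) inside fetch above only delays the task that already holds the response; it doesn't space out requests per circuit, which is what this sketch is meant to do.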
