
Scrape a website asynchronously using a list of tor circuits #44

@ghost

Description

I want to scrape a website asynchronously using a list of tor circuits with different exit nodes, making sure each exit node makes a request only once every 5 seconds (there's a sketch of that throttling after the full script below).

For testing purposes, I'm using the website https://books.toscrape.com/ and lowering the sleep time, the number of circuits, and the number of pages to scrape.

It works fine without tor, but I'm getting the following error when I use tor:

2022-09-06 11:08:49,380 [DEBUG] Loaded 10 authorities dir
2022-09-06 11:08:49,383 [DEBUG] Loaded 141 fallbacks dir
2022-09-06 11:08:49,383 [DEBUG] Using selector: EpollSelector
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
2022-09-06 11:08:49,384 [ERROR] '_GeneratorContextManager' object has no attribute 'create_stream'
{}
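
I suspect tor.create_circuit() returns a context manager that has to be entered before create_stream() can be called on the actual circuit. torpy's README uses a nested-with pattern along these lines (a minimal synchronous sketch; the hostname, port, and byte count are just for illustration):

from torpy import TorClient

hostname = 'ifconfig.me'  # illustrative target
with TorClient() as tor:
    # Entering the context manager yields the actual 3-hop circuit
    with tor.create_circuit(3) as circuit:
        # Open a TCP stream through the circuit to hostname:80
        with circuit.create_stream((hostname, 80)) as stream:
            stream.send(b'GET / HTTP/1.0\r\nHost: %s\r\n\r\n' % hostname.encode())
            print(stream.recv(1024))

I can't see how to carry that pattern over to my async scraper, though. Here's the full script: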


import asyncio
import aiohttp
import logging

from docopt import docopt
from torpy import TorClient
from typing import Dict, List, Optional, Tuple

logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("debug.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)


def main():
    """
    Usage:
        scraper.py <url>... [--tor]
        scraper.py -h | --help

    Options:
        -h --help   Show this screen.
        --tor       Use tor to scrape website
    """
    args = docopt(main.__doc__)
    urls = args['<url>']
    tor = args['--tor']
    scrape_website(urls, tor)


def scrape_test_website() -> None:
    TEST_URL = "https://books.toscrape.com/catalogue/"
    urls = [f"{TEST_URL}page-{i}.html" for i in range(1, 5)]
    print(scrape_website(urls, tor=True))


def scrape_website(urls: List[str], tor: bool = False) -> Dict:
    if tor:
        scraper = TorWebScraper(urls)
    else:
        scraper = WebScraper(urls)
    asyncio.run(scraper.run())
    return scraper.master_dict


class WebScraper:
    def __init__(self, urls: List[str]):
        self.urls = urls
        self.all_data = []
        self.master_dict = {}

    async def fetch(self, url: str) -> Optional[Tuple[str, str]]:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as response:
                    text = await response.text()
                    return url, text
        except Exception as e:
            logger.error(e)

    async def run(self) -> None:
        tasks = []
        for url in self.urls:
            tasks.append(self.fetch(url))
        self.all_data = await asyncio.gather(*tasks)
        for data in self.all_data:
            if data is not None:
                url, raw_html = data
                self.master_dict[url] = {'raw_html': raw_html}


def get_circuits(n: int = 2) -> List:
    """
    Get a list of one-hop tor circuits with different nodes
    """
    circuits = []
    with TorClient() as tor:
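        # NOTE: tor.create_circuit() is called here without being entered;
        # given the '_GeneratorContextManager' error above, it seems to return
        # a context manager, and the TorClient itself closes as soon as this
        # function returns.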
        for _ in range(n):
            circuits.append(tor.create_circuit())
    return circuits


class TorWebScraper(WebScraper):
    def __init__(self, urls: List[str]):
        super().__init__(urls)
        self.circuits = get_circuits(2)

    async def fetch(self, url: str) -> Optional[Tuple[str, str]]:
        try:
            async with aiohttp.ClientSession() as session:
                for circuit in self.circuits:
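                    # The error above is raised here: circuit is the raw,
                    # un-entered object returned by tor.create_circuit()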
                    async with circuit.create_stream() as stream:
                        async with session.get(url, proxy=stream.proxy) as response:
                            await asyncio.sleep(20e-3)  # lowered from 5 s for testing
                            text = await response.text()
                            return url, text
        except Exception as e:
            logger.error(e)


if __name__ == '__main__':
    #main()
    scrape_test_website()
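
For reference, the per-exit-node throttling I described at the top would look roughly like this as a standalone asyncio sketch (CircuitThrottle, RATE_LIMIT_S, and the round-robin assignment are illustrative names, not part of torpy or the script above):

import asyncio
import time
from typing import List

RATE_LIMIT_S = 5.0  # one request per exit node every 5 seconds

class CircuitThrottle:
    """Serializes requests for one circuit, spacing them RATE_LIMIT_S apart."""

    def __init__(self) -> None:
        self._lock = asyncio.Lock()
        self._last_request = 0.0

    async def wait(self) -> None:
        # Holding the lock while sleeping serializes all waiters on this circuit
        async with self._lock:
            delay = self._last_request + RATE_LIMIT_S - time.monotonic()
            if delay > 0:
                await asyncio.sleep(delay)
            self._last_request = time.monotonic()

async def fetch_throttled(throttle: CircuitThrottle, url: str) -> str:
    await throttle.wait()  # enforce the 5-second spacing for this circuit
    return url             # placeholder for the real tor-backed request

async def demo(urls: List[str]) -> None:
    throttles = [CircuitThrottle() for _ in range(2)]  # one throttle per circuit
    tasks = [fetch_throttled(throttles[i % len(throttles)], url)
             for i, url in enumerate(urls)]
    print(await asyncio.gather(*tasks))

# asyncio.run(demo([f"page-{i}.html" for i in range(6)]))

The lock serializes requests per circuit, so each exit node is used at most once every RATE_LIMIT_S seconds while different circuits still run concurrently. The asyncio.sleep(20e-3) inside fetch above only delays the task that already holds the response; it doesn't space out requests per circuit, which is what this sketch is meant to do.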
