# -*- coding: utf-8 -*-

from timeit import default_timer as timer
from sys import exit as abort

import time
import sys
import logging
import asyncio
import aiohttp
import defusedxml.ElementTree

class Logger(object):

    FMT = '%(name)s: %(levelname)s: %(message)s'

    def __init__(self):
        self._logger = logging.getLogger(__name__)
        self._logger.setLevel(level=logging.INFO)

        stdout = logging.StreamHandler(stream=sys.stdout)
        stderr = logging.StreamHandler(stream=sys.stderr)

        stdout.setLevel(level=logging.INFO)
        stderr.setLevel(level=logging.WARNING)

        # Route only INFO records to stdout; WARNING and above go to stderr.
        stdout.addFilter(lambda record: record.levelno == logging.INFO)

        stdout.setFormatter(
            logging.Formatter(
                fmt=self.FMT,
                datefmt=None,
                style='%'))
        stderr.setFormatter(
            logging.Formatter(
                fmt=self.FMT,
                datefmt=None,
                style='%'))

        self._logger.addHandler(hdlr=stdout)
        self._logger.addHandler(hdlr=stderr)

    def __del__(self):
        if not self._logger.hasHandlers():
            return
        # Iterate over a copy: removeHandler() mutates the handlers list.
        for handler in list(self._logger.handlers):
            if isinstance(handler, logging.StreamHandler):
                handler.flush()
                handler.close()
            self._logger.removeHandler(handler)


class Config(object):

    """Base Config."""

    LIMIT = 100
    TIMEOUT = None
    USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    MAXSIZE = 0


class ProdConfig(Config):

    """Prod Config."""

    TIMEOUT = 8
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    MAXSIZE = 500


class Checker(object):

    """Sitemap Checker."""

    def __init__(self):
        self._logger = Logger()
        self._loop = asyncio.get_event_loop()
        self._queue = asyncio.Queue(
            maxsize=ProdConfig.MAXSIZE, loop=self._loop)

    def check(self, url):
        """Main() entry-point."""
        start = timer()
        self._loop.run_until_complete(self._fetch_links(url))
        elapsed = time.strftime(
            '%H:%M:%S', time.gmtime(timer() - start))
        self._logger._logger.info('time elapsed {}'.format(elapsed))

    async def _fetch_doc(self, client, url):
        """Fetch a sitemap.xml document."""
        self._logger._logger.info('fetching sitemap @ {}'.format(url))
        try:
            async with client.get(
                    url=url,
                    allow_redirects=True,
                    timeout=ProdConfig.TIMEOUT,
                    verify_ssl=url.startswith('https')) as response:
                response.raise_for_status()
                return await response.text()
        except aiohttp.ClientResponseError as error:
            self._logger._logger.error(
                'sitemap yielded <{}>'.format(
                    error.status))
        except aiohttp.ClientError as error:
            self._logger._logger.error(str(error))
        abort(1)

    async def _producer(self, doc):
        """Parse sitemap.xml and queue discovered links."""
        try:
            root = defusedxml.ElementTree.fromstring(doc)
        except defusedxml.ElementTree.ParseError:
            self._logger._logger.error('failed to parse *.xml document')
            abort(1)
        self._logger._logger.info(
            '*.xml document contains ({}) links'.format(
                len(root)))
        for link in root:
            if len(link):  # skip entries without a child element
                await self._queue.put(''.join(link[0].text.split()))

    async def _consumer(self, client):
        """Process queued links with HEAD requests."""
        while True:
            url = await self._queue.get()
            async with client.head(
                    url=url,
                    allow_redirects=True,
                    timeout=ProdConfig.TIMEOUT,
                    verify_ssl=url.startswith('https')) as http:
                self._logger._logger.info(
                    '<{}> {} - {}'.format(http.status, http.reason, url))
                self._queue.task_done()

    async def _fetch_links(self, url):
        """Fetch sitemap.xml links."""
        headers = {'User-Agent': ProdConfig.USER_AGENT}
        connector = aiohttp.TCPConnector(
            limit=ProdConfig.LIMIT, loop=self._loop)
        async with aiohttp.ClientSession(
                connector=connector, loop=self._loop, headers=headers) as client:
            doc = await self._fetch_doc(client, url)
            # A single consumer task drains the queue.
            consumer = asyncio.ensure_future(self._consumer(client))
            await self._producer(doc)
            await self._queue.join()
            consumer.cancel()

    def __del__(self):
        # Close the loop only when it is no longer running.
        if self._loop and not self._loop.is_running():
            self._loop.close()

if __name__ == '__main__':
    Checker().check(sys.argv[1])
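
For reference, the script takes the sitemap URL as its only command-line argument; assuming it is saved as checker.py (the filename is an assumption), a run looks like:

    python checker.py https://www.google.com/flights/sitemap.xml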
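A side note on parsing: _producer assumes the loc element is always the first child of every url entry (link[0]). A namespace-aware lookup is sturdier if a sitemap interleaves other children such as lastmod; here is a minimal sketch of that variation (not the original code; the namespace URI is the standard sitemap one):

    NS = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    async def _producer(self, doc):
        """Parse sitemap.xml and queue every <loc> URL it contains."""
        try:
            root = defusedxml.ElementTree.fromstring(doc)
        except defusedxml.ElementTree.ParseError:
            self._logger._logger.error('failed to parse *.xml document')
            abort(1)
        # findall() with a namespace map matches <loc> wherever it appears.
        locs = root.findall('.//sm:loc', NS)
        self._logger._logger.info(
            '*.xml document contains ({}) links'.format(len(locs)))
        for loc in locs:
            await self._queue.put(loc.text.strip())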
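Also worth noting: _consumer has no exception handling, so a single failed HEAD request (a timeout, a DNS error) kills the task silently, task_done() is never called for the remaining items, and queue.join() blocks forever. A sketch of a more defensive loop (a suggested variation, not the original):

    async def _consumer(self, client):
        """Process queued links with HEAD requests, surviving failures."""
        while True:
            url = await self._queue.get()
            try:
                async with client.head(
                        url=url,
                        allow_redirects=True,
                        timeout=ProdConfig.TIMEOUT,
                        verify_ssl=url.startswith('https')) as http:
                    self._logger._logger.info(
                        '<{}> {} - {}'.format(http.status, http.reason, url))
            except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                self._logger._logger.error('{} - {}'.format(error, url))
            finally:
                # Always mark the item done so queue.join() can finish.
                self._queue.task_done()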
An example use case is as follows:

When check() is given the URL https://www.google.com/flights/sitemap.xml, which contains ~310 links, it takes approximately 00:03:24 to complete. GitHub source code is available if needed.

Note: timeout=10, limit=100, maxsize=500
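
One likely reason for the long runtime is that _fetch_links starts a single consumer task, so the HEAD requests run strictly one after another: each client.head(...) completes before the next URL is taken from the queue. A minimal sketch of _fetch_links starting a pool of consumers so the requests overlap (a suggested variation, not the original; the worker count of 10 is an arbitrary assumption):

    async def _fetch_links(self, url):
        """Fetch sitemap.xml links with a pool of concurrent consumers."""
        headers = {'User-Agent': ProdConfig.USER_AGENT}
        connector = aiohttp.TCPConnector(
            limit=ProdConfig.LIMIT, loop=self._loop)
        async with aiohttp.ClientSession(
                connector=connector, loop=self._loop, headers=headers) as client:
            doc = await self._fetch_doc(client, url)
            # Ten consumers pull from the same queue, so up to ten HEAD
            # requests are in flight at once.
            consumers = [
                asyncio.ensure_future(self._consumer(client))
                for _ in range(10)]
            await self._producer(doc)
            await self._queue.join()
            for consumer in consumers:
                consumer.cancel()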