# -*- coding: utf-8 -*-

from timeit import default_timer as timer
from sys import exit as abort

import time
import sys
import logging
import asyncio
import aiohttp
import defusedxml.ElementTree

class Logger(object):

    FMT = '%(name)s: %(levelname)s: %(message)s'

    def __init__(self):
        self._logger = logging.getLogger(__name__)
        self._logger.setLevel(level=logging.INFO)

        stdout = logging.StreamHandler(stream=sys.stdout)
        stderr = logging.StreamHandler(stream=sys.stderr)

        stdout.setLevel(level=logging.INFO)
        stderr.setLevel(level=logging.WARNING)

        # Route only INFO records to stdout; WARNING and above go to stderr.
        stdout.addFilter(lambda record: record.levelno == logging.INFO)

        stdout.setFormatter(
            logging.Formatter(
                fmt=self.FMT,
                datefmt=None,
                style='%'))
        stderr.setFormatter(
            logging.Formatter(
                fmt=self.FMT,
                datefmt=None,
                style='%'))

        self._logger.addHandler(hdlr=stdout)
        self._logger.addHandler(hdlr=stderr)

    def __del__(self):
        if not self._logger.hasHandlers():
            return
        # Iterate over a copy: removeHandler() mutates the handlers list.
        for handler in list(self._logger.handlers):
            if isinstance(handler, logging.StreamHandler):
                handler.flush()
                handler.close()
            self._logger.removeHandler(handler)


class Config(object):

    """Base Config."""

    LIMIT = 100
    TIMEOUT = None
    USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
    MAXSIZE = 0


class ProdConfig(Config):

    """Prod Config."""

    TIMEOUT = 8
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'
    MAXSIZE = 500


class Checker(object):

    """Sitemap Checker."""

    def __init__(self):
        self._logger = Logger()
        self._loop = asyncio.get_event_loop()
        self._queue = asyncio.Queue(
            maxsize=ProdConfig.MAXSIZE, loop=self._loop)

    def check(self, url):
        """Main() entry-point."""
        start = timer()
        self._loop.run_until_complete(self._fetch_links(url))
        elapsed = time.strftime(
            '%H:%M:%S', time.gmtime(timer() - start))
        self._logger._logger.info('time elapsed {}'.format(elapsed))

    async def _fetch_doc(self, client, url):
        """Fetch a sitemap.xml document."""
        self._logger._logger.info('fetching sitemap @ {}'.format(url))
        try:
            async with client.get(
                    url=url,
                    allow_redirects=True,
                    timeout=ProdConfig.TIMEOUT,
                    verify_ssl=url.startswith('https')) as response:
                response.raise_for_status()
                return await response.text()
        except aiohttp.ClientResponseError as error:
            self._logger._logger.error(
                'sitemap yielded <{}>'.format(
                    error.status))
        except aiohttp.ClientError as error:
            self._logger._logger.error(str(error))
        abort(1)

    async def _producer(self, doc):
        """Parse sitemap.xml and queue discovered links."""
        try:
            root = defusedxml.ElementTree.fromstring(doc)
        except defusedxml.ElementTree.ParseError:
            self._logger._logger.error('failed to parse *.xml document')
            abort(1)
        self._logger._logger.info(
            '*.xml document contains ({}) links'.format(
                len(root)))
        for link in root:
            if len(link):  # skip entries without a child element
                await self._queue.put(''.join(link[0].text.split()))

    async def _consumer(self, client):
        """Process queued links with HEAD requests."""
        while True:
            url = await self._queue.get()
            async with client.head(
                    url=url,
                    allow_redirects=True,
                    timeout=ProdConfig.TIMEOUT,
                    verify_ssl=url.startswith('https')) as http:
                self._logger._logger.info(
                    '<{}> {} - {}'.format(http.status, http.reason, url))
                self._queue.task_done()

    async def _fetch_links(self, url):
        """Fetch sitemap.xml links."""
        headers = {'User-Agent': ProdConfig.USER_AGENT}
        connector = aiohttp.TCPConnector(
            limit=ProdConfig.LIMIT, loop=self._loop)
        async with aiohttp.ClientSession(
                connector=connector, loop=self._loop, headers=headers) as client:
            doc = await self._fetch_doc(client, url)
            # A single consumer task drains the queue.
            consumer = asyncio.ensure_future(self._consumer(client))
            await self._producer(doc)
            await self._queue.join()
            consumer.cancel()

    def __del__(self):
        # Close the loop only when it is no longer running.
        if self._loop and not self._loop.is_running():
            self._loop.close()

if __name__ == '__main__':
    Checker().check(sys.argv[1])
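
For reference, the script takes the sitemap URL as its only command-line argument; assuming it is saved as checker.py (the filename is an assumption), a run looks like:

    python checker.py https://www.google.com/flights/sitemap.xml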
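A side note on parsing: _producer assumes the loc element is always the first child of every url entry (link[0]). A namespace-aware lookup is sturdier if a sitemap interleaves other children such as lastmod; here is a minimal sketch of that variation (not the original code; the namespace URI is the standard sitemap one):

    NS = {'sm': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    async def _producer(self, doc):
        """Parse sitemap.xml and queue every <loc> URL it contains."""
        try:
            root = defusedxml.ElementTree.fromstring(doc)
        except defusedxml.ElementTree.ParseError:
            self._logger._logger.error('failed to parse *.xml document')
            abort(1)
        # findall() with a namespace map matches <loc> wherever it appears.
        locs = root.findall('.//sm:loc', NS)
        self._logger._logger.info(
            '*.xml document contains ({}) links'.format(len(locs)))
        for loc in locs:
            await self._queue.put(loc.text.strip())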
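Also worth noting: _consumer has no exception handling, so a single failed HEAD request (a timeout, a DNS error) kills the task silently, task_done() is never called for the remaining items, and queue.join() blocks forever. A sketch of a more defensive loop (a suggested variation, not the original):

    async def _consumer(self, client):
        """Process queued links with HEAD requests, surviving failures."""
        while True:
            url = await self._queue.get()
            try:
                async with client.head(
                        url=url,
                        allow_redirects=True,
                        timeout=ProdConfig.TIMEOUT,
                        verify_ssl=url.startswith('https')) as http:
                    self._logger._logger.info(
                        '<{}> {} - {}'.format(http.status, http.reason, url))
            except (aiohttp.ClientError, asyncio.TimeoutError) as error:
                self._logger._logger.error('{} - {}'.format(error, url))
            finally:
                # Always mark the item done so queue.join() can finish.
                self._queue.task_done()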
An example use case is as follows:

When check() is given the URL https://www.google.com/flights/sitemap.xml, which contains ~310 links, it takes approximately 00:03:24 to complete. GitHub source code is available if needed.

Note: timeout=10, limit=100, maxsize=500
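
One likely reason for the long runtime is that _fetch_links starts a single consumer task, so the HEAD requests run strictly one after another: each client.head(...) completes before the next URL is taken from the queue. A minimal sketch of _fetch_links starting a pool of consumers so the requests overlap (a suggested variation, not the original; the worker count of 10 is an arbitrary assumption):

    async def _fetch_links(self, url):
        """Fetch sitemap.xml links with a pool of concurrent consumers."""
        headers = {'User-Agent': ProdConfig.USER_AGENT}
        connector = aiohttp.TCPConnector(
            limit=ProdConfig.LIMIT, loop=self._loop)
        async with aiohttp.ClientSession(
                connector=connector, loop=self._loop, headers=headers) as client:
            doc = await self._fetch_doc(client, url)
            # Ten consumers pull from the same queue, so up to ten HEAD
            # requests are in flight at once.
            consumers = [
                asyncio.ensure_future(self._consumer(client))
                for _ in range(10)]
            await self._producer(doc)
            await self._queue.join()
            for consumer in consumers:
                consumer.cancel()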