From 11d3f4713e49feb39f77f9febc712282eef8c860 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 29 Jul 2022 10:48:33 +0200 Subject: [PATCH 001/126] changelog: cover 0.2.1 --- CHANGES.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 3dbc42c..5dd80c2 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,13 @@ Changes ======= +0.2.1 (to be released) +---------------------- + +* ``aiohttp.client_exceptions.ClientConnectorError`` is now treated as a + network error and retried accordingly. +* Removed the unused ``zyte_api.sync`` module. + 0.2.0 (2022-07-14) ------------------ From 8c8ffd8da39acc7504140ae056131e37f6695e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 29 Jul 2022 13:37:51 +0200 Subject: [PATCH 002/126] Set a release date for 0.2.1 --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 5dd80c2..c07b466 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changes ======= -0.2.1 (to be released) ----------------------- +0.2.1 (2022-07-29) +------------------ * ``aiohttp.client_exceptions.ClientConnectorError`` is now treated as a network error and retried accordingly. 
From f16b99cc7c79c21ef8cee4970cbff48cc93d8f91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 29 Jul 2022 13:38:20 +0200 Subject: [PATCH 003/126] =?UTF-8?q?Bump=20version:=200.2.0=20=E2=86=92=200?= =?UTF-8?q?.2.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e5ee2df..98c4b12 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.0 +current_version = 0.2.1 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 17a8f56..6d1d7a5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.2.0' +release = u'0.2.1' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 7fd229a..fc79d63 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.2.0' +__version__ = '0.2.1' From 4a0cd6230a3ae7001461863ab1af45f3d82267b0 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 29 Jul 2022 20:38:32 +0500 Subject: [PATCH 004/126] clean up AggStats * n_results is renamed to n_success; * n_extracted_queries is removed, because it's always the same as n_results (i.e. 
n_success); * n_input_queries is removed: it wasn't really a number of input queries, (it was a number of processed queries), and it can be computed from other stats: success + fatal errors; * added a short comment which explains each stat value --- zyte_api/aio/client.py | 5 +---- zyte_api/stats.py | 26 ++++++++++++++------------ 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index e82c832..f939b59 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -114,14 +114,11 @@ async def request(): try: # Try to make a request result = await request() - self.agg_stats.n_extracted_queries += 1 + self.agg_stats.n_success += 1 except Exception: self.agg_stats.n_fatal_errors += 1 raise - finally: - self.agg_stats.n_input_queries += 1 - self.agg_stats.n_results += 1 return result def request_parallel_as_completed(self, diff --git a/zyte_api/stats.py b/zyte_api/stats.py index d2d804f..1789ee5 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -24,16 +24,13 @@ class AggStats: def __init__(self): self.time_connect_stats = Statistics() self.time_total_stats = Statistics() - self.n_results = 0 - self.n_fatal_errors = 0 - self.n_attempts = 0 - self.n_429 = 0 - self.n_errors = 0 + self.n_success = 0 # number of successful results returned to the user + self.n_fatal_errors = 0 # number of errors returned to the user, after all retries - self.n_input_queries = 0 - self.n_extracted_queries = 0 # Queries answered without any type of error - self.n_query_responses = 0 + self.n_attempts = 0 # total amount of requests made to Zyte API, including retries + self.n_429 = 0 # number of 429 (throttling) responses + self.n_errors = 0 # number of errors, including errors which were retried self.status_codes = Counter() self.exception_types = Counter() @@ -47,8 +44,8 @@ def __str__(self): self.n_errors - self.n_fatal_errors, self.n_fatal_errors, self.error_ratio(), - self.n_extracted_queries, - 
self.n_input_queries, + self.n_success, + self.n_processed, self.success_ratio() ) @@ -66,7 +63,7 @@ def summary(self): self.n_fatal_errors, self.n_errors - self.n_fatal_errors) + "Successful URLs: {} of {}\n".format( - self.n_extracted_queries, self.n_input_queries) + + self.n_success, self.n_processed) + "Success ratio: {:0.1%}\n".format(self.success_ratio()) ) @@ -80,7 +77,12 @@ def error_ratio(self): @zero_on_division_error def success_ratio(self): - return self.n_extracted_queries / self.n_input_queries + return self.n_success / self.n_processed + + @property + def n_processed(self): + """ Total number of processed URLs """ + return self.n_success + self.n_fatal_errors @attr.s From 438306714020a17d89511f588e37b91fb5e39ab3 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 29 Jul 2022 22:19:30 +0500 Subject: [PATCH 005/126] Changelog for 0.3.0 (#28) --- CHANGES.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index c07b466..c49c129 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,19 @@ Changes ======= +0.3.0 (to be released) +---------------------- + +Internal AggStats class is cleaned up: + +* ``AggStats.n_extracted_queries`` attribute is removed, as it was a duplicate + of ``AggStats.n_results`` +* ``AggStats.n_results`` is renamed to ``AggStats.n_success`` +* ``AggStats.n_input_queries`` is removed as redundant and misleading; + AggStats got a new ``AggStats.n_processed`` property instead. + +This change is backwards incompatible if you used stats directly. 
+ 0.2.1 (2022-07-29) ------------------ From cb77806e979b73b79592bb2337cd1607da3a0c6e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 29 Jul 2022 22:22:57 +0500 Subject: [PATCH 006/126] Set a release date for 0.3.0 --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c49c129..21448ce 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changes ======= -0.3.0 (to be released) ----------------------- +0.3.0 (2022-07-29) +------------------ Internal AggStats class is cleaned up: From 095e47e6f645d31eac30f8a89e41297496b80644 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 29 Jul 2022 22:23:12 +0500 Subject: [PATCH 007/126] =?UTF-8?q?Bump=20version:=200.2.1=20=E2=86=92=200?= =?UTF-8?q?.3.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 98c4b12..abe6c79 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.1 +current_version = 0.3.0 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 6d1d7a5..f0b95be 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.2.1' +release = u'0.3.0' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index fc79d63..0404d81 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.2.1' +__version__ = '0.3.0' From 6350223d185604c131ea9054419bd47c1b7d5618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 16 Sep 2022 09:46:34 +0200 Subject: [PATCH 008/126] 
=?UTF-8?q?Zyte=20Data=20API=20=E2=86=92=20Zyte=20?= =?UTF-8?q?API?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.rst | 6 +++--- docs/command_line.rst | 20 ++++++++++---------- docs/index.rst | 4 ++-- docs/install.rst | 4 ++-- setup.py | 2 +- zyte_api/__init__.py | 2 +- zyte_api/__main__.py | 12 ++++++------ zyte_api/aio/__init__.py | 2 +- zyte_api/aio/client.py | 4 ++-- zyte_api/aio/retry.py | 3 +-- zyte_api/errors.py | 2 +- 11 files changed, 30 insertions(+), 31 deletions(-) diff --git a/README.rst b/README.rst index d20d06b..75dc378 100644 --- a/README.rst +++ b/README.rst @@ -18,7 +18,7 @@ python-zyte-api :target: https://codecov.io/gh/zytedata/zyte-api :alt: Coverage report -Python client libraries for `Zyte Data API`_. +Python client libraries for `Zyte API`_. Command-line utility and asyncio-based library are provided by this package. @@ -34,7 +34,7 @@ Installation API key ======= -Make sure you have an API key for the `Zyte Data API`_ service. +Make sure you have an API key for the `Zyte API`_ service. You can set ``ZYTE_API_KEY`` environment variable with the key to avoid passing it around explicitly. @@ -46,4 +46,4 @@ License is BSD 3-clause. * Source code: https://github.com/zytedata/python-zyte-api * Issue tracker: https://github.com/zytedata/python-zyte-api/issues -.. _Zyte Data API: https://docs.zyte.com/zyte-api/get-started.html +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html diff --git a/docs/command_line.rst b/docs/command_line.rst index 0bddd20..c8cd39b 100644 --- a/docs/command_line.rst +++ b/docs/command_line.rst @@ -20,9 +20,9 @@ Then run a script, to get the results: .. note:: You may use ``python -m zyte_api`` instead of ``zyte-api``. -Requests to get browser HTML from those input URLs will be sent to Zyte Data -API, using up to 20 parallel connections, and the API responses will be stored -in the ``res.jsonl`` `JSON Lines`_ file, 1 response per line. 
+Requests to get browser HTML from those input URLs will be sent to Zyte API, +using up to 20 parallel connections, and the API responses will be stored in +the ``res.jsonl`` `JSON Lines`_ file, 1 response per line. .. _JSON Lines: https://jsonlines.org/ @@ -34,7 +34,7 @@ the content belongs to. If you need more flexibility, you can customize the requests by creating a JSON Lines file with queries: a JSON object per line. You can pass any -`Zyte Data API`_ options there. For example, you could create the following +`Zyte API`_ options there. For example, you could create the following ``requests.jsonl`` file: .. code-block:: json @@ -46,7 +46,7 @@ a JSON Lines file with queries: a JSON object per line. You can pass any See `API docs`_ for a description of all supported parameters. .. _API docs: https://docs.zyte.com/zyte-api/openapi.html -.. _Zyte Data API: https://docs.zyte.com/zyte-api/get-started.html +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html To get results for this ``requests.jsonl`` file, run: @@ -76,11 +76,11 @@ throttling errors. They are handled by CLI automatically, but they make extraction less efficient; please tune the concurrency options to not hit the throttling errors (HTTP 429) often. -You may be also limited by the website speed. The Zyte Data API tries not to hit -any individual website too hard, but it could be better to limit this on -a client side as well. If you're extracting data from a single website, -it could make sense to decrease the amount of parallel requests; it can ensure -higher success ratio overall. +You may be also limited by the website speed. The Zyte API tries not to hit any +individual website too hard, but it could be better to limit this on a client +side as well. If you're extracting data from a single website, it could make +sense to decrease the amount of parallel requests; it can ensure higher success +ratio overall. 
If you're extracting data from multiple websites, it makes sense to spread the load across time: if you have websites A, B and C, don't send requests in diff --git a/docs/index.rst b/docs/index.rst index f066ab3..e34d233 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,7 +2,7 @@ python-zyte-api =============== -Python client libraries for `Zyte Data API`_. +Python client libraries for `Zyte API`_. Command-line utility and asyncio-based library are provided by this package. @@ -25,4 +25,4 @@ Command-line utility and asyncio-based library are provided by this package. changelog license -.. _Zyte Data API: https://docs.zyte.com/zyte-api/get-started.html \ No newline at end of file +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html \ No newline at end of file diff --git a/docs/install.rst b/docs/install.rst index 7816f09..587b4fd 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -13,8 +13,8 @@ Installation API key ======= -Make sure you have an API key for the `Zyte Data API`_ service. +Make sure you have an API key for the `Zyte API`_ service. You can set ``ZYTE_API_KEY`` environment variable with the key to avoid passing it around explicitly. -.. _Zyte Data API: https://docs.zyte.com/zyte-api/get-started.html +.. 
_Zyte API: https://docs.zyte.com/zyte-api/get-started.html diff --git a/setup.py b/setup.py index d97d3eb..353c5b1 100755 --- a/setup.py +++ b/setup.py @@ -14,7 +14,7 @@ def get_version(): setup( name='zyte-api', version=get_version(), - description='Python interface to Zyte Data API', + description='Python interface to Zyte API', long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), long_description_content_type='text/x-rst', author='Zyte Group Ltd', diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 74f9aaa..a903afd 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -1,3 +1,3 @@ """ -Python client libraries and command line utilities for Zyte Data API +Python client libraries and command line utilities for Zyte API """ \ No newline at end of file diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index b47b5f4..5ad2407 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -1,4 +1,4 @@ -""" Basic command-line interface for Zyte Data APIs. """ +""" Basic command-line interface for Zyte API. """ import argparse import json @@ -77,11 +77,11 @@ def read_input(input_fp, intype): def _main(program_name='zyte-api'): - """ Process urls from input file through Zyte Data API """ + """ Process urls from input file through Zyte API """ p = argparse.ArgumentParser( prog=program_name, description=""" - Process input URLs from a file using Zyte Data API. + Process input URLs from a file using Zyte API. """, ) p.add_argument("input", @@ -107,11 +107,11 @@ def _main(program_name='zyte-api'): help="number of connections to the API server " "(default: %(default)s)") p.add_argument("--api-key", - help="Zyte Data API key. " + help="Zyte API key. " "You can also set %s environment variable instead " "of using this option." 
% ENV_VARIABLE) p.add_argument("--api-url", - help="Zyte Data API endpoint (default: %(default)s)", + help="Zyte API endpoint (default: %(default)s)", default=API_URL) p.add_argument("--loglevel", "-L", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], @@ -130,7 +130,7 @@ def _main(program_name='zyte-api'): queries = queries[:args.limit] logger.info(f"Loaded {len(queries)} urls from {args.input.name}; shuffled: {args.shuffle}") - logger.info(f"Running Zyte Data API (connections: {args.n_conn})") + logger.info(f"Running Zyte API (connections: {args.n_conn})") loop = asyncio.get_event_loop() coro = run(queries, diff --git a/zyte_api/aio/__init__.py b/zyte_api/aio/__init__.py index db16ed4..b69b052 100644 --- a/zyte_api/aio/__init__.py +++ b/zyte_api/aio/__init__.py @@ -1,3 +1,3 @@ """ -Asyncio client for Zyte Data API +Asyncio client for Zyte API """ \ No newline at end of file diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index f939b59..82162eb 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -1,5 +1,5 @@ """ -Asyncio client for Zyte Data API +Asyncio client for Zyte API """ import asyncio @@ -127,7 +127,7 @@ def request_parallel_as_completed(self, endpoint: str = 'extract', session: Optional[aiohttp.ClientSession] = None, ) -> Iterator[asyncio.Future]: - """ Send multiple requests to Zyte Data API in parallel. + """ Send multiple requests to Zyte API in parallel. Return an `asyncio.as_completed` iterator. ``queries`` is a list of requests to process (dicts). diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index aaaa959..8777a52 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -1,8 +1,7 @@ # -*- coding: utf-8 -*- """ -Zyte Data Extraction retrying logic. +Zyte API retrying logic. -TODO: add sync support; only aio is supported at the moment. TODO: Implement retry logic for temparary errors (520) using the proposed retry-after header. 
""" import asyncio diff --git a/zyte_api/errors.py b/zyte_api/errors.py index eab2183..b608bf1 100644 --- a/zyte_api/errors.py +++ b/zyte_api/errors.py @@ -6,7 +6,7 @@ @attr.s(auto_attribs=True) class ParsedError: - """ Parsed error from Zyte Data API """ + """ Parsed error from Zyte API """ response_body: bytes data: Optional[dict] parse_error: Optional[str] From 5b943cb884fbdb90293357a0536d3e190f65dc14 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Mon, 19 Sep 2022 20:49:10 +0800 Subject: [PATCH 009/126] add brotli as a dependency --- CHANGES.rst | 5 +++++ setup.py | 3 ++- zyte_api/aio/client.py | 7 +++++++ 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 21448ce..dffa5b0 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,11 @@ Changes ======= +TBR +--- + +* Require to install ``Brotli`` as a dependency. + 0.3.0 (2022-07-29) ------------------ diff --git a/setup.py b/setup.py index 353c5b1..184fcec 100755 --- a/setup.py +++ b/setup.py @@ -27,10 +27,11 @@ def get_version(): install_requires=[ 'requests', 'tenacity', - 'aiohttp >= 3.6.0', + 'aiohttp >= 3.7.3', 'tqdm', 'attrs', 'runstats', + 'brotli', ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 82162eb..50d2c66 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -10,6 +10,7 @@ import aiohttp from aiohttp import TCPConnector from tenacity import AsyncRetrying +from aiohttp.http_parser import HAS_BROTLI from .errors import RequestError from .retry import zyte_api_retrying @@ -65,6 +66,12 @@ async def request_raw(self, query: dict, *, auth = aiohttp.BasicAuth(self.api_key) headers = {'User-Agent': user_agent(aiohttp)} + # NOTE: Remove this check if the following commit for aiohttp which + # adds direct client support for brotli has been released after 3.8.1: + # https://github.com/aio-libs/aiohttp/commit/28ea32d2282728a94af73c87efd6ab314c14320e + if HAS_BROTLI: + 
headers['Accept-Encoding'] = 'br' + response_stats = [] start_global = time.perf_counter() From eaf9208c30ae3b385d679926ff205e407b745fad Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 20 Sep 2022 14:56:06 +0800 Subject: [PATCH 010/126] simplify brotli declaration in headers --- zyte_api/aio/client.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 50d2c66..164402d 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -10,7 +10,6 @@ import aiohttp from aiohttp import TCPConnector from tenacity import AsyncRetrying -from aiohttp.http_parser import HAS_BROTLI from .errors import RequestError from .retry import zyte_api_retrying @@ -64,13 +63,7 @@ async def request_raw(self, query: dict, *, retrying = retrying or zyte_api_retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = {'User-Agent': user_agent(aiohttp)} - - # NOTE: Remove this check if the following commit for aiohttp which - # adds direct client support for brotli has been released after 3.8.1: - # https://github.com/aio-libs/aiohttp/commit/28ea32d2282728a94af73c87efd6ab314c14320e - if HAS_BROTLI: - headers['Accept-Encoding'] = 'br' + headers = {'User-Agent': user_agent(aiohttp), 'Accept-Encoding': 'br'} response_stats = [] start_global = time.perf_counter() From 15a3e40ca9039912577071658f39d98c28796ce3 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 20 Sep 2022 16:38:02 +0800 Subject: [PATCH 011/126] update CHANGES.rst with 0.4.0 changes --- CHANGES.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index dffa5b0..0646125 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,10 +1,11 @@ Changes ======= -TBR ---- +0.4.0 (2022-09-20) +------------------ -* Require to install ``Brotli`` as a dependency. +* Require to install ``Brotli`` as a dependency. 
This changes the requests to + have ``Accept-Encoding: br`` and automatically decompress brotli responses. 0.3.0 (2022-07-29) ------------------ From 37a126fdd9b6e5dd4e7ed70957902994edad0778 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 20 Sep 2022 16:38:52 +0800 Subject: [PATCH 012/126] =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?= =?UTF-8?q?.4.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index abe6c79..5661783 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.0 +current_version = 0.4.0 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index f0b95be..8ea7b56 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.3.0' +release = u'0.4.0' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 0404d81..abeeedb 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.3.0' +__version__ = '0.4.0' From d72f7af42f6d8453f28158f024ba2ec3c5038321 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 13 Oct 2022 07:20:57 +0200 Subject: [PATCH 013/126] =?UTF-8?q?Network=20error=20retry=20time:=205=20m?= =?UTF-8?q?inutes=20=E2=86=92=2015=20minutes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/aio/retry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index 8777a52..17eb45b 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -87,7 +87,7 @@ class 
RetryFactory: ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never - network_error_stop = stop_after_delay(5 * 60) + network_error_stop = stop_after_delay(15 * 60) temporary_download_error_stop = stop_after_attempt(4) def wait(self, retry_state: RetryCallState) -> float: From f404b8956ebf0c2144830a3631590328cacfcd42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Sun, 16 Oct 2022 12:21:25 +0200 Subject: [PATCH 014/126] 0.4.1 release notes (#34) --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 0646125..407087b 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +0.4.1 (to be released) +---------------------- + +* Network errors, like server timeouts or disconnections, are now retried for + up to 15 minutes, instead of 5 minutes. + 0.4.0 (2022-09-20) ------------------ From 1ac47c688d5deed61f4255ebcba38eed698e83ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Sun, 16 Oct 2022 12:23:26 +0200 Subject: [PATCH 015/126] =?UTF-8?q?Bump=20version:=200.4.0=20=E2=86=92=200?= =?UTF-8?q?.4.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5661783..1b325d6 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.0 +current_version = 0.4.1 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 8ea7b56..122fe55 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.0' +release = u'0.4.1' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py 
b/zyte_api/__version__.py index abeeedb..f0ede3d 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.0' +__version__ = '0.4.1' From 92c594377c06a1e696e47c5ad753c23be6b6f5f2 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 28 Oct 2022 14:37:59 +0500 Subject: [PATCH 016/126] Update the minimum aiohttp version (#36) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 184fcec..9be4f01 100755 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ def get_version(): install_requires=[ 'requests', 'tenacity', - 'aiohttp >= 3.7.3', + 'aiohttp >= 3.8.0', 'tqdm', 'attrs', 'runstats', From 119d7102107d27bc1b1c98c1f7068b57fa98b3d5 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 28 Oct 2022 14:41:35 +0500 Subject: [PATCH 017/126] declare Python 3.11 support; bump mypy version (just in case) --- .github/workflows/test.yml | 2 +- setup.py | 1 + tox.ini | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1ac85a6..d3df3bd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] + python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 diff --git a/setup.py b/setup.py index 9be4f01..a6de7a5 100755 --- a/setup.py +++ b/setup.py @@ -44,5 +44,6 @@ def get_version(): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', + 'Programming Language :: Python :: 3.11', ], ) diff --git a/tox.ini b/tox.ini index c4cebf1..0013ca2 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,mypy,docs +envlist = py37,py38,py39,py310,py311,mypy,docs [testenv] deps = @@ -13,7 +13,7 @@ commands = [testenv:mypy] deps = - mypy==0.910 + mypy==0.982 commands = mypy 
--ignore-missing-imports --no-warn-no-return \ zyte_api \ From e893a6d35cf08d3c367286a73f2725ceb5831f82 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 28 Oct 2022 16:09:55 +0500 Subject: [PATCH 018/126] changelog for 0.4.2 --- CHANGES.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 407087b..c9e5f0a 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,14 @@ Changes ======= -0.4.1 (to be released) ----------------------- +0.4.2 (2022-10-28) +------------------ +* Bump minimum ``aiohttp`` version to 3.8.0, as earlier versions don't support + brotli decompression of responses +* Declared Python 3.11 support + +0.4.1 (2022-10-16) +------------------ * Network errors, like server timeouts or disconnections, are now retried for up to 15 minutes, instead of 5 minutes. From b2edeac6f36f61a5714d5ed6ed7ae090bfc3c797 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 28 Oct 2022 16:10:07 +0500 Subject: [PATCH 019/126] =?UTF-8?q?Bump=20version:=200.4.1=20=E2=86=92=200?= =?UTF-8?q?.4.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 1b325d6..e091e60 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.1 +current_version = 0.4.2 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 122fe55..ced4b3c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.1' +release = u'0.4.2' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index f0ede3d..a987347 100644 --- 
a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.1' +__version__ = '0.4.2' From e4f95a8d9108d57e5d10b72997b8084aa55eff2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Nov 2022 11:27:40 +0100 Subject: [PATCH 020/126] RFC-3986-encode URLs --- setup.py | 9 ++-- tests/test_utils.py | 45 ++++++++++++++++++- zyte_api/aio/client.py | 4 +- zyte_api/utils.py | 98 ++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 149 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index a6de7a5..1ea581d 100755 --- a/setup.py +++ b/setup.py @@ -25,13 +25,14 @@ def get_version(): 'console_scripts': ['zyte-api=zyte_api.__main__:_main'], }, install_requires=[ - 'requests', - 'tenacity', 'aiohttp >= 3.8.0', - 'tqdm', 'attrs', - 'runstats', 'brotli', + 'requests', + 'runstats', + 'tenacity', + 'tqdm', + 'w3lib', ], classifiers=[ 'Development Status :: 3 - Alpha', diff --git a/tests/test_utils.py b/tests/test_utils.py index af3c739..d72f2c8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,6 +1,7 @@ import pytest +from pytest import raises -from zyte_api.utils import _guess_intype +from zyte_api.utils import _guess_intype, _process_query @pytest.mark.parametrize( @@ -55,3 +56,45 @@ ) def test_guess_intype(file_name, first_line, expected): assert _guess_intype(file_name, [first_line]) == expected + + +@pytest.mark.parametrize( + "unaffected", + ( + {}, + {"a": "b"}, + {"a": {"b": "c"}}, + ), +) +@pytest.mark.parametrize( + "input,output", + ( + ( + {"url": "https://example.com"}, + {"url": "https://example.com"}, + ), + ( + {"url": "https://example.com/a b"}, + {"url": "https://example.com/a%20b"}, + ), + ( + {"url": "https://example.com/a|b"}, + {"url": "https://example.com/a%7Cb"}, + ), + ( + {"url": "https://example.com?a=b c"}, + {"url": "https://example.com?a=b%20c"}, + ), + ( + {"url": "https://example.com?a=b|c"}, + {"url": "https://example.com?a=b%7Cc"}, + ), + ), +) +def 
test_process_query(unaffected, input, output): + assert _process_query({**unaffected, **input}) == {**unaffected, **output} + + +def test_process_query_bytes(): + with raises(ValueError): + _process_query({"url": b"https://example.com"}) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 164402d..a05bdd4 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -16,7 +16,7 @@ from ..apikey import get_apikey from ..constants import API_URL, API_TIMEOUT from ..stats import AggStats, ResponseStats -from ..utils import user_agent +from ..utils import _process_query, user_agent # 120 seconds is probably too long, but we are concerned about the case with @@ -74,7 +74,7 @@ async def request(): post_kwargs = dict( url=self.api_url + endpoint, - json=query, + json=_process_query(query), auth=auth, headers=headers, ) diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 2be98de..ee69c2d 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -1,8 +1,21 @@ import re from os.path import splitext +from typing import Union +from urllib.parse import quote, unquote, urlsplit, urlunsplit + +from w3lib.url import RFC3986_RESERVED, RFC3986_UNRESERVED, RFC3986_USERINFO_SAFE_CHARS +from w3lib.util import to_unicode from .__version__ import __version__ +# https://github.com/scrapy/w3lib/blob/8e19741b6b004d6248fb70b525255a96a1eb1ee6/w3lib/url.py#L61-L63 +_ascii_tab_newline_re = re.compile( + r"[\t\n\r]" +) + +_SAFE_CHARS = RFC3986_RESERVED + RFC3986_UNRESERVED + b"%" +_PATH_SAFE_CHARS = _SAFE_CHARS.replace(b"#", b"") + def _guess_intype(file_name, lines): _, dot_extension = splitext(file_name) @@ -18,6 +31,91 @@ def _guess_intype(file_name, lines): return "txt" +def _process_query(query): + """Given a query to be sent to Zyte API, return a functionally-equivalent + query that fixes any known issue. + + Specifically, unsafe characters in the query URL are escaped. 
+ + *query* is never modified in place, but the returned object is not + guaranteed to be a copy of *query*: it could be *query* itself if no + changes where needed, or a shallow copy of *query* with some common nested + objects (e.g. shared ``actions`` list). + """ + url = query.get("url", None) + if url is None: + return query + if not isinstance(url, str): + raise ValueError(f"Expected a str URL parameter, got {type(url)}") + safe_url = _safe_url_string(url) + if url == safe_url: + return query + return {**query, "url": safe_url} + + +def _safe_url_string( + url: Union[bytes, str], + encoding: str = "utf8", + path_encoding: str = "utf8", + quote_path: bool = True, +) -> str: + """Fork of ``w3lib.url.safe_url_string`` that enforces `RFC-3986`_. + + ``w3lib.url.safe_url_string`` has an implementation closer to the + `URL living standard`_ (e.g. does not encode “|”), while Zyte API expects + RFC-3986-compliant URLs. + + Forked w3lib commit: 8e19741b6b004d6248fb70b525255a96a1eb1ee6 + + .. _RFC-3986: https://datatracker.ietf.org/doc/html/rfc3986 + .. 
_URL living standard: https://url.spec.whatwg.org/ + """ + decoded = to_unicode(url, encoding=encoding, errors="percentencode") + parts = urlsplit(_ascii_tab_newline_re.sub("", decoded)) + + username, password, hostname, port = ( + parts.username, + parts.password, + parts.hostname, + parts.port, + ) + netloc_bytes = b"" + if username is not None or password is not None: + if username is not None: + safe_username = quote(unquote(username), RFC3986_USERINFO_SAFE_CHARS) + netloc_bytes += safe_username.encode(encoding) + if password is not None: + netloc_bytes += b":" + safe_password = quote(unquote(password), RFC3986_USERINFO_SAFE_CHARS) + netloc_bytes += safe_password.encode(encoding) + netloc_bytes += b"@" + if hostname is not None: + try: + netloc_bytes += hostname.encode("idna") + except UnicodeError: + netloc_bytes += hostname.encode(encoding) + if port is not None: + netloc_bytes += b":" + netloc_bytes += str(port).encode(encoding) + + netloc = netloc_bytes.decode() + + if quote_path: + path = quote(parts.path.encode(path_encoding), _PATH_SAFE_CHARS) + else: + path = parts.path + + return urlunsplit( + ( + parts.scheme, + netloc, + path, + quote(parts.query.encode(encoding), _SAFE_CHARS), + quote(parts.fragment.encode(encoding), _SAFE_CHARS), + ) + ) + + def user_agent(library): return 'python-zyte-api/{} {}/{}'.format( __version__, From 17b7dad566b910cb4e657d24a08467b74d00101d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 1 Nov 2022 20:31:28 +0100 Subject: [PATCH 021/126] =?UTF-8?q?Union[bytes,=20str]=20=E2=86=92=20AnySt?= =?UTF-8?q?r?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_api/utils.py b/zyte_api/utils.py index ee69c2d..2e888a8 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -1,6 +1,6 @@ import re from os.path import splitext -from typing import Union +from typing import 
AnyStr from urllib.parse import quote, unquote, urlsplit, urlunsplit from w3lib.url import RFC3986_RESERVED, RFC3986_UNRESERVED, RFC3986_USERINFO_SAFE_CHARS @@ -54,7 +54,7 @@ def _process_query(query): def _safe_url_string( - url: Union[bytes, str], + url: AnyStr, encoding: str = "utf8", path_encoding: str = "utf8", quote_path: bool = True, From a3703b4121763b36ef9d331bc4ae2a837a6a3e79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 3 Nov 2022 04:41:22 +0100 Subject: [PATCH 022/126] =?UTF-8?q?URL=20character=20escaping:=20RFC-3986?= =?UTF-8?q?=20=E2=86=92=20RFC-2396?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_utils.py | 104 ++++++++++++++++++++++++++++++++++++-------- zyte_api/utils.py | 32 ++++++++++---- 2 files changed, 109 insertions(+), 27 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index d72f2c8..b035fd8 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,7 +1,16 @@ +from itertools import chain + import pytest from pytest import raises +from w3lib.url import _path_safe_chars, _safe_chars -from zyte_api.utils import _guess_intype, _process_query +from zyte_api.utils import ( + _guess_intype, + _process_query, + RFC2396_FRAGMENT_SAFE_CHARS, + RFC2396_PATH_SAFE_CHARS, + RFC2396_QUERY_SAFE_CHARS, +) @pytest.mark.parametrize( @@ -58,14 +67,6 @@ def test_guess_intype(file_name, first_line, expected): assert _guess_intype(file_name, [first_line]) == expected -@pytest.mark.parametrize( - "unaffected", - ( - {}, - {"a": "b"}, - {"a": {"b": "c"}}, - ), -) @pytest.mark.parametrize( "input,output", ( @@ -74,25 +75,90 @@ def test_guess_intype(file_name, first_line, expected): {"url": "https://example.com"}, ), ( - {"url": "https://example.com/a b"}, - {"url": "https://example.com/a%20b"}, + { + "a": {"b", "c"}, + "d": "https://example.com/ a", + "url": "https://example.com/ a", + }, + { + "a": {"b", "c"}, + "d": "https://example.com/ a", + "url": 
"https://example.com/%20a", + }, + ), + *( + ( + {"url": f"https://example.com/{bytes([char]).decode()}"}, + {"url": f"https://example.com/%{char:X}"}, + ) + for char in chain( + ( + ord(' '), + ), + # Characters that w3lib would not escape: []|% + ( + char + for char in _path_safe_chars + if ( + char not in RFC2396_PATH_SAFE_CHARS + and char not in b"?#" + ) + ) + ) ), ( - {"url": "https://example.com/a|b"}, - {"url": "https://example.com/a%7Cb"}, + {"url": "https://example.com/ñ"}, + {"url": "https://example.com/%C3%B1"}, + ), + *( + ( + {"url": f"https://example.com?{bytes([char]).decode()}"}, + {"url": f"https://example.com?%{char:X}"}, + ) + for char in chain( + ( + ord(' '), + ), + # Characters that w3lib would not escape: []|% + ( + char + for char in _safe_chars + if ( + char not in RFC2396_QUERY_SAFE_CHARS + and char not in b"#" + ) + ) + ) ), ( - {"url": "https://example.com?a=b c"}, - {"url": "https://example.com?a=b%20c"}, + {"url": "https://example.com?ñ"}, + {"url": "https://example.com?%C3%B1"}, + ), + *( + ( + {"url": f"https://example.com#{bytes([char]).decode()}"}, + {"url": f"https://example.com#%{char:X}"}, + ) + for char in chain( + ( + ord(' '), + ), + # Characters that w3lib would not escape: #[]|% + ( + char + for char in _safe_chars + if char not in RFC2396_FRAGMENT_SAFE_CHARS + ) + ) ), ( - {"url": "https://example.com?a=b|c"}, - {"url": "https://example.com?a=b%7Cc"}, + {"url": "https://example.com#ñ"}, + {"url": "https://example.com#%C3%B1"}, ), ), ) -def test_process_query(unaffected, input, output): - assert _process_query({**unaffected, **input}) == {**unaffected, **output} +def test_process_query(input, output): + assert _process_query(input) == output def test_process_query_bytes(): diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 2e888a8..6bbbba0 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -1,9 +1,9 @@ import re +from string import ascii_letters, digits from os.path import splitext from typing import AnyStr 
from urllib.parse import quote, unquote, urlsplit, urlunsplit -from w3lib.url import RFC3986_RESERVED, RFC3986_UNRESERVED, RFC3986_USERINFO_SAFE_CHARS from w3lib.util import to_unicode from .__version__ import __version__ @@ -13,8 +13,15 @@ r"[\t\n\r]" ) -_SAFE_CHARS = RFC3986_RESERVED + RFC3986_UNRESERVED + b"%" -_PATH_SAFE_CHARS = _SAFE_CHARS.replace(b"#", b"") +RFC2396_RESERVED_CHARS = b";/?:@&=+$," +RFC2396_UNRESERVED_CHARS = (ascii_letters + digits + "-_.!~*'()").encode() +RFC2396_USERINFO_SAFE_CHARS = RFC2396_UNRESERVED_CHARS + b";:&=+$," +RFC2396_PATH_SEPARATORS = b"/;" +RFC2396_PATH_SAFE_CHARS = ( + RFC2396_UNRESERVED_CHARS + RFC2396_PATH_SEPARATORS + b":@&=+$," +) +RFC2396_QUERY_SAFE_CHARS = RFC2396_RESERVED_CHARS + RFC2396_UNRESERVED_CHARS +RFC2396_FRAGMENT_SAFE_CHARS = RFC2396_QUERY_SAFE_CHARS def _guess_intype(file_name, lines): @@ -82,11 +89,17 @@ def _safe_url_string( netloc_bytes = b"" if username is not None or password is not None: if username is not None: - safe_username = quote(unquote(username), RFC3986_USERINFO_SAFE_CHARS) + safe_username = quote( + unquote(username), + RFC2396_USERINFO_SAFE_CHARS, + ) netloc_bytes += safe_username.encode(encoding) if password is not None: netloc_bytes += b":" - safe_password = quote(unquote(password), RFC3986_USERINFO_SAFE_CHARS) + safe_password = quote( + unquote(password), + RFC2396_USERINFO_SAFE_CHARS, + ) netloc_bytes += safe_password.encode(encoding) netloc_bytes += b"@" if hostname is not None: @@ -101,7 +114,7 @@ def _safe_url_string( netloc = netloc_bytes.decode() if quote_path: - path = quote(parts.path.encode(path_encoding), _PATH_SAFE_CHARS) + path = quote(parts.path.encode(path_encoding), RFC2396_PATH_SAFE_CHARS) else: path = parts.path @@ -110,8 +123,11 @@ def _safe_url_string( parts.scheme, netloc, path, - quote(parts.query.encode(encoding), _SAFE_CHARS), - quote(parts.fragment.encode(encoding), _SAFE_CHARS), + quote(parts.query.encode(encoding), RFC2396_QUERY_SAFE_CHARS), + quote( + 
parts.fragment.encode(encoding), + RFC2396_FRAGMENT_SAFE_CHARS, + ), ) ) From 58086f52c0eb2b58ad7878211081df153f5a57b4 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 10 Nov 2022 00:01:27 +0500 Subject: [PATCH 023/126] Don't reuse the connections. It seems aiohttp has troubles with edge cases of Keep-Alive, and disabling it helps with ServerDisconnectedErrors. Using aiohttp sessions is still important, because it allows to reduce the number of ClientConnectorErrors. --- CHANGES.rst | 6 ++++++ zyte_api/aio/client.py | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index c9e5f0a..5ce14cd 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +(TBD) +----- + +* Connections are no longer reused between requests. + This reduces the amount of ``ServerDisconnectedError`` exceptions. + 0.4.2 (2022-10-28) ------------------ * Bump minimum ``aiohttp`` version to 3.8.0, as earlier versions don't support diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 164402d..52a47ad 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -29,7 +29,8 @@ def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: """ Create a session with parameters suited for Zyte API """ kwargs.setdefault('timeout', AIO_API_TIMEOUT) if "connector" not in kwargs: - kwargs["connector"] = TCPConnector(limit=connection_pool_size) + kwargs["connector"] = TCPConnector(limit=connection_pool_size, + force_close=True) return aiohttp.ClientSession(**kwargs) @@ -132,9 +133,9 @@ def request_parallel_as_completed(self, ``queries`` is a list of requests to process (dicts). - ``session`` is an optional aiohttp.ClientSession object; - use it to enable HTTP Keep-Alive. Set the session TCPConnector - limit to a value greater than the number of connections. + ``session`` is an optional aiohttp.ClientSession object. 
+ Set the session TCPConnector limit to a value greater than + the number of connections. """ sem = asyncio.Semaphore(self.n_conn) From 84f093707c3fa16ba2de5b84d96bbc9bdda843ee Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 10 Nov 2022 22:51:22 +0500 Subject: [PATCH 024/126] set release date --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 5ce14cd..e4c1980 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changes ======= -(TBD) ------ +0.4.3 (2022-11-10) +------------------ * Connections are no longer reused between requests. This reduces the amount of ``ServerDisconnectedError`` exceptions. From d15915c2101a227c7d3b2d21896b3a81a3dac05f Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 10 Nov 2022 22:51:36 +0500 Subject: [PATCH 025/126] =?UTF-8?q?Bump=20version:=200.4.2=20=E2=86=92=200?= =?UTF-8?q?.4.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e091e60..b7e8846 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.2 +current_version = 0.4.3 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index ced4b3c..66d5383 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.2' +release = u'0.4.3' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index a987347..908c0bb 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.2' +__version__ = '0.4.3' From 954181d454f92d9a5e4628df4ba446fd4ef05202 Mon Sep 17 00:00:00 
2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 28 Nov 2022 16:55:14 +0100 Subject: [PATCH 026/126] Use w3lib.url.safe_url_string to make URLs safe --- tests/test_utils.py | 77 +++-------------------------------- zyte_api/utils.py | 98 +++------------------------------------------ 2 files changed, 10 insertions(+), 165 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index b035fd8..d6c4dee 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,9 +7,6 @@ from zyte_api.utils import ( _guess_intype, _process_query, - RFC2396_FRAGMENT_SAFE_CHARS, - RFC2396_PATH_SAFE_CHARS, - RFC2396_QUERY_SAFE_CHARS, ) @@ -70,10 +67,13 @@ def test_guess_intype(file_name, first_line, expected): @pytest.mark.parametrize( "input,output", ( + # Safe URLs are returned unmodified. ( {"url": "https://example.com"}, {"url": "https://example.com"}, ), + # Unsafe URLs in the url field are modified, while left untouched on + # other fields. ( { "a": {"b", "c"}, @@ -86,75 +86,8 @@ def test_guess_intype(file_name, first_line, expected): "url": "https://example.com/%20a", }, ), - *( - ( - {"url": f"https://example.com/{bytes([char]).decode()}"}, - {"url": f"https://example.com/%{char:X}"}, - ) - for char in chain( - ( - ord(' '), - ), - # Characters that w3lib would not escape: []|% - ( - char - for char in _path_safe_chars - if ( - char not in RFC2396_PATH_SAFE_CHARS - and char not in b"?#" - ) - ) - ) - ), - ( - {"url": "https://example.com/ñ"}, - {"url": "https://example.com/%C3%B1"}, - ), - *( - ( - {"url": f"https://example.com?{bytes([char]).decode()}"}, - {"url": f"https://example.com?%{char:X}"}, - ) - for char in chain( - ( - ord(' '), - ), - # Characters that w3lib would not escape: []|% - ( - char - for char in _safe_chars - if ( - char not in RFC2396_QUERY_SAFE_CHARS - and char not in b"#" - ) - ) - ) - ), - ( - {"url": "https://example.com?ñ"}, - {"url": "https://example.com?%C3%B1"}, - ), - *( - ( - {"url": 
f"https://example.com#{bytes([char]).decode()}"}, - {"url": f"https://example.com#%{char:X}"}, - ) - for char in chain( - ( - ord(' '), - ), - # Characters that w3lib would not escape: #[]|% - ( - char - for char in _safe_chars - if char not in RFC2396_FRAGMENT_SAFE_CHARS - ) - ) - ), - ( - {"url": "https://example.com#ñ"}, - {"url": "https://example.com#%C3%B1"}, - ), + # NOTE: We use w3lib.url.safe_url_string for escaping. Tests covering + # the URL escaping logic exist upstream. ), ) def test_process_query(input, output): diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 6bbbba0..c6f4480 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -1,28 +1,10 @@ import re -from string import ascii_letters, digits from os.path import splitext -from typing import AnyStr -from urllib.parse import quote, unquote, urlsplit, urlunsplit -from w3lib.util import to_unicode +from w3lib.url import safe_url_string from .__version__ import __version__ -# https://github.com/scrapy/w3lib/blob/8e19741b6b004d6248fb70b525255a96a1eb1ee6/w3lib/url.py#L61-L63 -_ascii_tab_newline_re = re.compile( - r"[\t\n\r]" -) - -RFC2396_RESERVED_CHARS = b";/?:@&=+$," -RFC2396_UNRESERVED_CHARS = (ascii_letters + digits + "-_.!~*'()").encode() -RFC2396_USERINFO_SAFE_CHARS = RFC2396_UNRESERVED_CHARS + b";:&=+$," -RFC2396_PATH_SEPARATORS = b"/;" -RFC2396_PATH_SAFE_CHARS = ( - RFC2396_UNRESERVED_CHARS + RFC2396_PATH_SEPARATORS + b":@&=+$," -) -RFC2396_QUERY_SAFE_CHARS = RFC2396_RESERVED_CHARS + RFC2396_UNRESERVED_CHARS -RFC2396_FRAGMENT_SAFE_CHARS = RFC2396_QUERY_SAFE_CHARS - def _guess_intype(file_name, lines): _, dot_extension = splitext(file_name) @@ -42,7 +24,9 @@ def _process_query(query): """Given a query to be sent to Zyte API, return a functionally-equivalent query that fixes any known issue. - Specifically, unsafe characters in the query URL are escaped. 
+ Specifically, unsafe characters in the query URL are escaped to make sure + they are safe not only for the end server, but also for Zyte API, which + requires URLs compatible with RFC 2396. *query* is never modified in place, but the returned object is not guaranteed to be a copy of *query*: it could be *query* itself if no @@ -54,84 +38,12 @@ def _process_query(query): return query if not isinstance(url, str): raise ValueError(f"Expected a str URL parameter, got {type(url)}") - safe_url = _safe_url_string(url) + safe_url = safe_url_string(url) if url == safe_url: return query return {**query, "url": safe_url} -def _safe_url_string( - url: AnyStr, - encoding: str = "utf8", - path_encoding: str = "utf8", - quote_path: bool = True, -) -> str: - """Fork of ``w3lib.url.safe_url_string`` that enforces `RFC-3986`_. - - ``w3lib.url.safe_url_string`` has an implementation closer to the - `URL living standard`_ (e.g. does not encode “|”), while Zyte API expects - RFC-3986-compliant URLs. - - Forked w3lib commit: 8e19741b6b004d6248fb70b525255a96a1eb1ee6 - - .. _RFC-3986: https://datatracker.ietf.org/doc/html/rfc3986 - .. 
_URL living standard: https://url.spec.whatwg.org/ - """ - decoded = to_unicode(url, encoding=encoding, errors="percentencode") - parts = urlsplit(_ascii_tab_newline_re.sub("", decoded)) - - username, password, hostname, port = ( - parts.username, - parts.password, - parts.hostname, - parts.port, - ) - netloc_bytes = b"" - if username is not None or password is not None: - if username is not None: - safe_username = quote( - unquote(username), - RFC2396_USERINFO_SAFE_CHARS, - ) - netloc_bytes += safe_username.encode(encoding) - if password is not None: - netloc_bytes += b":" - safe_password = quote( - unquote(password), - RFC2396_USERINFO_SAFE_CHARS, - ) - netloc_bytes += safe_password.encode(encoding) - netloc_bytes += b"@" - if hostname is not None: - try: - netloc_bytes += hostname.encode("idna") - except UnicodeError: - netloc_bytes += hostname.encode(encoding) - if port is not None: - netloc_bytes += b":" - netloc_bytes += str(port).encode(encoding) - - netloc = netloc_bytes.decode() - - if quote_path: - path = quote(parts.path.encode(path_encoding), RFC2396_PATH_SAFE_CHARS) - else: - path = parts.path - - return urlunsplit( - ( - parts.scheme, - netloc, - path, - quote(parts.query.encode(encoding), RFC2396_QUERY_SAFE_CHARS), - quote( - parts.fragment.encode(encoding), - RFC2396_FRAGMENT_SAFE_CHARS, - ), - ) - ) - - def user_agent(library): return 'python-zyte-api/{} {}/{}'.format( __version__, From f058d26a5ede5f8960fedc71e0b9d4cd9ca3eb48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 28 Nov 2022 16:58:48 +0100 Subject: [PATCH 027/126] Remove unused imports --- tests/test_utils.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index d6c4dee..1bf6c92 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,13 +1,7 @@ -from itertools import chain - import pytest from pytest import raises -from w3lib.url import _path_safe_chars, _safe_chars -from zyte_api.utils 
import ( - _guess_intype, - _process_query, -) +from zyte_api.utils import _guess_intype, _process_query @pytest.mark.parametrize( From a895bc78b7f353a22b88578b88791ced0e370c5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 29 Nov 2022 10:29:21 +0100 Subject: [PATCH 028/126] Make sure that URL processing does not remove fragments --- tests/test_utils.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/test_utils.py b/tests/test_utils.py index 1bf6c92..03efd60 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -61,11 +61,6 @@ def test_guess_intype(file_name, first_line, expected): @pytest.mark.parametrize( "input,output", ( - # Safe URLs are returned unmodified. - ( - {"url": "https://example.com"}, - {"url": "https://example.com"}, - ), # Unsafe URLs in the url field are modified, while left untouched on # other fields. ( @@ -80,6 +75,16 @@ def test_guess_intype(file_name, first_line, expected): "url": "https://example.com/%20a", }, ), + # Safe URLs are returned unmodified. + ( + {"url": "https://example.com"}, + {"url": "https://example.com"}, + ), + # URL fragments are kept. + ( + {"url": "https://example.com#a"}, + {"url": "https://example.com#a"}, + ), # NOTE: We use w3lib.url.safe_url_string for escaping. Tests covering # the URL escaping logic exist upstream. 
), From 0caeedd8a4f990422b3db9a0473de01fe9bbe0b9 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 30 Nov 2022 17:31:40 +0500 Subject: [PATCH 029/126] allow to set custom retrying for the AsyncClient --- zyte_api/aio/client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 2fb6638..e4b7d65 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -49,11 +49,13 @@ def __init__(self, *, api_key=None, api_url=API_URL, n_conn=15, + retrying: Optional[AsyncRetrying] = None, ): self.api_key = get_apikey(api_key) self.api_url = api_url self.n_conn = n_conn self.agg_stats = AggStats() + self.retrying = retrying or zyte_api_retrying async def request_raw(self, query: dict, *, endpoint: str = 'extract', @@ -61,7 +63,7 @@ async def request_raw(self, query: dict, *, handle_retries=True, retrying: Optional[AsyncRetrying] = None, ): - retrying = retrying or zyte_api_retrying + retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) headers = {'User-Agent': user_agent(aiohttp), 'Accept-Encoding': 'br'} From 42067852a706caead54538018c8321e9fc53ecd7 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 30 Nov 2022 18:07:29 +0500 Subject: [PATCH 030/126] CLI: allow to disable retrying of network and request errors --- zyte_api/__main__.py | 23 +++++++++++++++++------ zyte_api/aio/retry.py | 4 ++-- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 5ad2407..e8b5ce1 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -8,13 +8,19 @@ import random import tqdm +from tenacity import retry_if_exception from zyte_api.aio.client import ( create_session, - AsyncClient + AsyncClient, ) from zyte_api.constants import ENV_VARIABLE, API_URL from zyte_api.utils import _guess_intype +from zyte_api.aio.retry import RetryFactory, _is_throttling_error + + +class 
DontRetryErrorsFactory(RetryFactory): + retry_condition = retry_if_exception(_is_throttling_error) logger = logging.getLogger('zyte_api') @@ -22,10 +28,11 @@ _UNSET = object() -async def run(queries, out, n_conn, stop_on_errors, api_url, - api_key=None): - - client = AsyncClient(n_conn=n_conn, api_key=api_key, api_url=api_url) +async def run(queries, out, *, n_conn, stop_on_errors, api_url, + api_key=None, retry_errors=True): + retrying = None if retry_errors else DontRetryErrorsFactory().build() + client = AsyncClient(n_conn=n_conn, api_key=api_key, api_url=api_url, + retrying=retrying) async with create_session(connection_pool_size=n_conn) as session: result_iter = client.request_parallel_as_completed( queries=queries, @@ -117,6 +124,9 @@ def _main(program_name='zyte-api'): choices=["DEBUG", "INFO", "WARNING", "ERROR"], help="log level (default: %(default)s)") p.add_argument("--shuffle", help="Shuffle input URLs", action="store_true") + p.add_argument("--dont-retry-errors", + help="Don't retry request and network errors", + action="store_true") args = p.parse_args() logging.basicConfig( stream=sys.stderr, @@ -138,7 +148,8 @@ def _main(program_name='zyte-api'): n_conn=args.n_conn, stop_on_errors=False, api_url=args.api_url, - api_key=args.api_key) + api_key=args.api_key, + retry_errors=args.dont_retry_errors) loop.run_until_complete(coro) loop.close() diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index 17eb45b..1f8b4fb 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -18,7 +18,7 @@ retry_if_exception, RetryCallState, before_sleep_log, - after_log, AsyncRetrying, before_log, + after_log, AsyncRetrying, before_log, retry_base, ) from tenacity.stop import stop_never @@ -62,7 +62,7 @@ class RetryFactory: """ Build custom retry configuration """ - retry_condition = ( + retry_condition: retry_base = ( retry_if_exception(_is_throttling_error) | retry_if_exception(_is_network_error) | retry_if_exception(_is_temporary_download_error) From 
6b93625fc61340d1e623198d7ab0f322f823fdf9 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Wed, 30 Nov 2022 19:02:05 +0500 Subject: [PATCH 031/126] fix the logic --- zyte_api/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index e8b5ce1..ae9a69a 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -149,7 +149,7 @@ def _main(program_name='zyte-api'): stop_on_errors=False, api_url=args.api_url, api_key=args.api_key, - retry_errors=args.dont_retry_errors) + retry_errors=not args.dont_retry_errors) loop.run_until_complete(coro) loop.close() From 405948b896c37e5e36d8106303ffbbc7a35d52da Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 1 Dec 2022 11:38:54 +0500 Subject: [PATCH 032/126] changelog for 0.4.4 --- CHANGES.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index e4c1980..86b7c9d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +0.4.4 (TBD) +----------- + +* Fixed an issue with submitting URLs which contain unescaped symbols +* New "retrying" argument for AsyncClient.__init__, which allows to set + custom retrying policy for the client +* ``--dont-retry-errors`` argument in the CLI tool + 0.4.3 (2022-11-10) ------------------ From c18582d88318c016a14f92e75161e77564ff4831 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Thu, 1 Dec 2022 13:59:29 +0500 Subject: [PATCH 033/126] set release date --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 86b7c9d..146b962 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changes ======= -0.4.4 (TBD) ------------ +0.4.4 (2022-12-01) +------------------ * Fixed an issue with submitting URLs which contain unescaped symbols * New "retrying" argument for AsyncClient.__init__, which allows to set From 6f135663b8aecb9b0f9e2aba3b73977103c4aa01 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: 
Thu, 1 Dec 2022 13:59:37 +0500 Subject: [PATCH 034/126] =?UTF-8?q?Bump=20version:=200.4.3=20=E2=86=92=200?= =?UTF-8?q?.4.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index b7e8846..fb998f0 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.3 +current_version = 0.4.4 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 66d5383..9e2415b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.3' +release = u'0.4.4' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 908c0bb..9a8e054 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.3' +__version__ = '0.4.4' From 08fd0c2e9842de000a704a33e616cf71af3cfa44 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 23 Dec 2022 15:32:08 +0500 Subject: [PATCH 035/126] fix tox4 support --- tox.ini | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tox.ini b/tox.ini index 0013ca2..97f16dd 100644 --- a/tox.ini +++ b/tox.ini @@ -23,10 +23,6 @@ commands = mypy --ignore-missing-imports --no-warn-no-return \ changedir = docs deps = -rdocs/requirements.txt - -[testenv:docs] basepython = python3 -changedir = {[docs]changedir} -deps = {[docs]deps} commands = sphinx-build -W -b html . 
{envtmpdir}/html From 16e9df500c1790d6ab0dfaba002743500b9f6c62 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 23 Dec 2022 15:32:34 +0500 Subject: [PATCH 036/126] remove unused "requests" from install_requires --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 1ea581d..12accac 100755 --- a/setup.py +++ b/setup.py @@ -28,7 +28,6 @@ def get_version(): 'aiohttp >= 3.8.0', 'attrs', 'brotli', - 'requests', 'runstats', 'tenacity', 'tqdm', From ccef2eaee4eda30e90320a309870953a2895de47 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 23 Dec 2022 15:34:26 +0500 Subject: [PATCH 037/126] require w3lib 2.1.1, which is needed to escape URLs properly --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 12accac..f42faf7 100755 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ def get_version(): 'runstats', 'tenacity', 'tqdm', - 'w3lib', + 'w3lib >= 2.1.1', ], classifiers=[ 'Development Status :: 3 - Alpha', From b26470d1f1b24a761ec00bc209617d2054606d8e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Fri, 23 Dec 2022 15:34:43 +0500 Subject: [PATCH 038/126] changelog --- CHANGES.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 146b962..e8f4960 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,14 @@ Changes ======= +0.4.5 (TBD) +----------- + +* w3lib >= 2.1.1 is required in install_requires, to ensure that URLs + are escaped properly. 
+* unnecessary ``requests`` library is removed from install_requires +* fixed tox 4 support + 0.4.4 (2022-12-01) ------------------ From 641649e059b3676a4491a9938ec2a6d6d0d8eab0 Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 3 Jan 2023 13:49:10 +0500 Subject: [PATCH 039/126] fixed tox.ini --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 97f16dd..03e50bc 100644 --- a/tox.ini +++ b/tox.ini @@ -19,7 +19,7 @@ commands = mypy --ignore-missing-imports --no-warn-no-return \ zyte_api \ tests -[docs] +[testenv:docs] changedir = docs deps = -rdocs/requirements.txt From 0eacc235ad21a83c97fe151e5b65ce30784ecb1e Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 3 Jan 2023 16:41:49 +0500 Subject: [PATCH 040/126] set release date --- CHANGES.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e8f4960..fc13639 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,8 @@ Changes ======= -0.4.5 (TBD) ------------ +0.4.5 (2023-01-03) +------------------ * w3lib >= 2.1.1 is required in install_requires, to ensure that URLs are escaped properly. 
From 5d561c4ee5880b4ecfe25139eb84237544aee4cc Mon Sep 17 00:00:00 2001 From: Mikhail Korobov Date: Tue, 3 Jan 2023 16:41:58 +0500 Subject: [PATCH 041/126] =?UTF-8?q?Bump=20version:=200.4.4=20=E2=86=92=200?= =?UTF-8?q?.4.5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fb998f0..e7f49e5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.4 +current_version = 0.4.5 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 9e2415b..83a3eaf 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.4' +release = u'0.4.5' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 9a8e054..68eb9b6 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.4' +__version__ = '0.4.5' From 158b2eb4a24aaa57893e2483fb87d355caff49e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 25 Aug 2023 14:42:51 +0200 Subject: [PATCH 042/126] Cover the api_key parameter in the asyncio API page --- docs/asyncio_api.rst | 14 ++++++++------ docs/install.rst | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index 07ba88c..f06f4ae 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -12,16 +12,18 @@ You can use the method ``request_raw`` to perform individual requests: import asyncio from zyte_api.aio.client import AsyncClient - client = AsyncClient() + client = AsyncClient(api_key="YOUR_API_KEY") + async def single_request(url): - return await 
client.request_raw({ - 'url': url, - 'browserHtml': True - }) + return await client.request_raw({"url": url, "browserHtml": True}) + response = asyncio.run(single_request("https://books.toscrape.com")) - # Do something with the response .. + # Do something with the response… + +.. tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment + variable instead `. There is also ``request_parallel_as_completed`` method, which allows to process many URLs in parallel, using multiple connections: diff --git a/docs/install.rst b/docs/install.rst index 587b4fd..f87ca0d 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -10,6 +10,8 @@ Installation ``zyte-api`` requires Python 3.7+. +.. _api-key: + API key ======= From 14076825603cb98d570757b2de332f14ae68d93f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 21 Sep 2023 10:30:26 +0200 Subject: [PATCH 043/126] =?UTF-8?q?API=5FTIMEOUT:=2060s=20=E2=86=92240s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_api/constants.py b/zyte_api/constants.py index db24f4b..9cac837 100644 --- a/zyte_api/constants.py +++ b/zyte_api/constants.py @@ -7,4 +7,4 @@ API_URL = 'https://api.zyte.com/v1/' # Default timeout that server uses. Client timeouts should be larger than that. -API_TIMEOUT = 60 +API_TIMEOUT = 240 From a902a3973e475d376a5ebe6afd362c58504bd393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 21 Sep 2023 10:45:13 +0200 Subject: [PATCH 044/126] Cover 0.4.6 in the changelog --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index fc13639..ab6c8d6 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +0.4.6 (2023-09-21) +------------------ + +* Increased the client timeout to match the server’s. 
+* Mentioned the ``api_key`` parameter of ``AsyncClient`` in the docs example. + 0.4.5 (2023-01-03) ------------------ From d7b9ad80a9733d1d40e239d40b0b6eaa65081ea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 21 Sep 2023 15:32:27 +0200 Subject: [PATCH 045/126] Update API_TIMEOUT --- zyte_api/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_api/constants.py b/zyte_api/constants.py index 9cac837..926577d 100644 --- a/zyte_api/constants.py +++ b/zyte_api/constants.py @@ -7,4 +7,4 @@ API_URL = 'https://api.zyte.com/v1/' # Default timeout that server uses. Client timeouts should be larger than that. -API_TIMEOUT = 240 +API_TIMEOUT = 200 From c905219f11e87d3ada39fd317cb0d8bf7533a916 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 26 Sep 2023 10:58:24 +0200 Subject: [PATCH 046/126] Update the release date of 0.4.6 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index ab6c8d6..6e0d0a3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.4.6 (2023-09-21) +0.4.6 (2023-09-26) ------------------ * Increased the client timeout to match the server’s. 
From 67f736552b051bb236d6881ca969eb7a8e071289 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 26 Sep 2023 10:58:52 +0200 Subject: [PATCH 047/126] =?UTF-8?q?Bump=20version:=200.4.5=20=E2=86=92=200?= =?UTF-8?q?.4.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e7f49e5..4c5891f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.5 +current_version = 0.4.6 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 83a3eaf..5cc939c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.5' +release = u'0.4.6' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 68eb9b6..ab45471 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.5' +__version__ = '0.4.6' From ad7e3a80bbb0341434b52f2535891222ad177ac3 Mon Sep 17 00:00:00 2001 From: Shevchenko Taras Date: Tue, 26 Sep 2023 12:16:37 +0300 Subject: [PATCH 048/126] Allow overriding the user agent (#50) --- tests/test_client.py | 22 ++++++++++++++++++++++ zyte_api/aio/client.py | 9 +++++++-- zyte_api/utils.py | 9 ++------- 3 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 tests/test_client.py diff --git a/tests/test_client.py b/tests/test_client.py new file mode 100644 index 0000000..cd4c214 --- /dev/null +++ b/tests/test_client.py @@ -0,0 +1,22 @@ +import pytest + +from zyte_api.aio.client import AsyncClient +from zyte_api.utils import USER_AGENT + + +@pytest.mark.parametrize( + "user_agent,expected", + ( + ( + None, + 
USER_AGENT, + ), + ( + f'scrapy-zyte-api/0.11.1 {USER_AGENT}', + f'scrapy-zyte-api/0.11.1 {USER_AGENT}', + ), + ), +) +def test_user_agent(user_agent, expected): + client = AsyncClient(api_key='123', api_url='http:\\test', user_agent=user_agent) + assert client.user_agent == expected diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index e4b7d65..4f5a249 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -16,7 +16,7 @@ from ..apikey import get_apikey from ..constants import API_URL, API_TIMEOUT from ..stats import AggStats, ResponseStats -from ..utils import _process_query, user_agent +from ..utils import USER_AGENT, _process_query # 120 seconds is probably too long, but we are concerned about the case with @@ -50,12 +50,14 @@ def __init__(self, *, api_url=API_URL, n_conn=15, retrying: Optional[AsyncRetrying] = None, + user_agent: Optional[str] = None, ): self.api_key = get_apikey(api_key) self.api_url = api_url self.n_conn = n_conn self.agg_stats = AggStats() self.retrying = retrying or zyte_api_retrying + self.user_agent = user_agent or USER_AGENT async def request_raw(self, query: dict, *, endpoint: str = 'extract', @@ -66,7 +68,10 @@ async def request_raw(self, query: dict, *, retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = {'User-Agent': user_agent(aiohttp), 'Accept-Encoding': 'br'} + headers = { + 'User-Agent': self.user_agent, + 'Accept-Encoding': 'br' + } response_stats = [] start_global = time.perf_counter() diff --git a/zyte_api/utils.py b/zyte_api/utils.py index c6f4480..7767a9b 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -5,6 +5,8 @@ from .__version__ import __version__ +USER_AGENT = f'python-zyte-api/{__version__}' + def _guess_intype(file_name, lines): _, dot_extension = splitext(file_name) @@ -42,10 +44,3 @@ def _process_query(query): if url == safe_url: return query return {**query, "url": safe_url} - - -def user_agent(library): - return 
'python-zyte-api/{} {}/{}'.format( - __version__, - library.__name__, - library.__version__) From 51a61ea9876e27c17c64b651f05d48f09c62f9ea Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 26 Sep 2023 16:08:55 +0300 Subject: [PATCH 049/126] changelog for 0.4.7 --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 6e0d0a3..e39ebf7 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +0.4.7 (to be released) +------------------ + +* Added the ability to send a custom user agent from a dependent library + during creating a new client with ``AsyncClient`` + 0.4.6 (2023-09-26) ------------------ From 3af5db67f3380abb6c34b0c82dc94258f7a5d3c9 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 26 Sep 2023 16:27:08 +0300 Subject: [PATCH 050/126] make the description shorter --- CHANGES.rst | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index e39ebf7..ab39c83 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -4,8 +4,7 @@ Changes 0.4.7 (to be released) ------------------ -* Added the ability to send a custom user agent from a dependent library - during creating a new client with ``AsyncClient`` +* ``AsyncClient`` now lets you set a custom user agent to send to Zyte API. 0.4.6 (2023-09-26) ------------------ From 40b9bc1d109e94118460de259b01e6ba16685328 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 26 Sep 2023 17:07:12 +0300 Subject: [PATCH 051/126] fix formatting --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index ab39c83..cbc193d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -2,7 +2,7 @@ Changes ======= 0.4.7 (to be released) ------------------- +---------------------- * ``AsyncClient`` now lets you set a custom user agent to send to Zyte API. 
From 8547d6460fd2d86acfbe88f161777c1116531ffb Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 26 Sep 2023 19:57:07 +0300 Subject: [PATCH 052/126] Update the release date of 0.4.7 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index cbc193d..dd600b8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.4.7 (to be released) +0.4.7 (2023-09-26) ---------------------- * ``AsyncClient`` now lets you set a custom user agent to send to Zyte API. From 91b141e80fdf9f5777d25b7627c4f544fdcffc12 Mon Sep 17 00:00:00 2001 From: PyExplorer Date: Tue, 26 Sep 2023 20:00:20 +0300 Subject: [PATCH 053/126] =?UTF-8?q?Bump=20version:=200.4.6=20=E2=86=92=200?= =?UTF-8?q?.4.7?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 4c5891f..a960335 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.6 +current_version = 0.4.7 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 5cc939c..4635aec 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.6' +release = u'0.4.7' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index ab45471..1e4826d 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.6' +__version__ = '0.4.7' From 0e684a320163d43f7d082a93a1b60ba4e0a3cb1b Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 24 Oct 2023 14:49:56 +0800 Subject: [PATCH 054/126] add the ZAPI request id on RequestError message --- 
requirements-test.txt | 1 + tests/test_client.py | 28 ++++++++++++++++++++++++++++ zyte_api/aio/client.py | 5 ++++- 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/requirements-test.txt b/requirements-test.txt index 847062e..3c609fe 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,4 @@ pytest pytest-cov +pytest-asyncio responses diff --git a/tests/test_client.py b/tests/test_client.py index cd4c214..d24f05b 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,6 +1,8 @@ import pytest +from unittest import mock from zyte_api.aio.client import AsyncClient +from zyte_api.aio.errors import RequestError from zyte_api.utils import USER_AGENT @@ -20,3 +22,29 @@ def test_user_agent(user_agent, expected): client = AsyncClient(api_key='123', api_url='http:\\test', user_agent=user_agent) assert client.user_agent == expected + + +@pytest.mark.asyncio +@mock.patch("zyte_api.aio.client._post_func") +async def test_request_raw_error(mock_post): + request_id = "abcd1234" + content = b"some content" + headers = {"request-id": request_id} + status = 521 + reason = "Some failure somewhere" + + response = mock.AsyncMock() + response.status = status + response.read.return_value = content + response.headers = headers + response.reason = "reason" + mock_post()().__aenter__.return_value = response + client = AsyncClient(api_key='a') + + with pytest.raises(RequestError) as excinfo: + await client.request_raw(query={}) + + assert f"(request_id={request_id}) reason" in excinfo.value.message + assert {"request-id": request_id} == excinfo.value.headers + assert status == excinfo.value.status + assert content == excinfo.value.response_content \ No newline at end of file diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 4f5a249..d3e0f9b 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -96,11 +96,14 @@ async def request(): stats.record_read() stats.record_request_error(content, self.agg_stats) + request_id = 
resp.headers.get("request-id") + message = f"(request_id={request_id}) {resp.reason}" + raise RequestError( request_info=resp.request_info, history=resp.history, status=resp.status, - message=resp.reason, + message=message, headers=resp.headers, response_content=content ) From c71fc958367fb4b31aaf87ebbd0ed3ddf9efb56d Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 24 Oct 2023 18:35:24 +0800 Subject: [PATCH 055/126] RequestError: add request_id as an attribute and str representation --- tests/test_client.py | 3 ++- zyte_api/aio/client.py | 3 ++- zyte_api/aio/errors.py | 6 +++++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/test_client.py b/tests/test_client.py index d24f05b..b13f1b0 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -47,4 +47,5 @@ async def test_request_raw_error(mock_post): assert f"(request_id={request_id}) reason" in excinfo.value.message assert {"request-id": request_id} == excinfo.value.headers assert status == excinfo.value.status - assert content == excinfo.value.response_content \ No newline at end of file + assert content == excinfo.value.response_content + assert request_id == excinfo.value.request_id diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index d3e0f9b..f4230bb 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -105,7 +105,8 @@ async def request(): status=resp.status, message=message, headers=resp.headers, - response_content=content + response_content=content, + request_id=request_id, ) response = await resp.json() diff --git a/zyte_api/aio/errors.py b/zyte_api/aio/errors.py index a7c03cf..8b3005c 100644 --- a/zyte_api/aio/errors.py +++ b/zyte_api/aio/errors.py @@ -15,6 +15,9 @@ class RequestError(ClientResponseError): """ def __init__(self, *args, **kwargs): self.response_content = kwargs.pop("response_content") + self.request_id = kwargs.pop("request_id", None) + if self.request_id is None: + self.request_id = kwargs.get("headers", 
{}).get("request-id") super().__init__(*args, **kwargs) @property @@ -23,4 +26,5 @@ def parsed(self): def __str__(self): return f"RequestError: {self.status}, message={self.message}, " \ - f"headers={self.headers}, body={self.response_content}" + f"headers={self.headers}, body={self.response_content}, " \ + f"request_id={self.request_id}" From e288c7543cab97c43145ca227d2c6d20d396c6ac Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 24 Oct 2023 20:18:36 +0800 Subject: [PATCH 056/126] revert previous code changes for a simpler approach --- requirements-test.txt | 1 - tests/test_client.py | 29 ----------------------------- zyte_api/aio/client.py | 5 +---- 3 files changed, 1 insertion(+), 34 deletions(-) diff --git a/requirements-test.txt b/requirements-test.txt index 3c609fe..847062e 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,4 +1,3 @@ pytest pytest-cov -pytest-asyncio responses diff --git a/tests/test_client.py b/tests/test_client.py index b13f1b0..cd4c214 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,8 +1,6 @@ import pytest -from unittest import mock from zyte_api.aio.client import AsyncClient -from zyte_api.aio.errors import RequestError from zyte_api.utils import USER_AGENT @@ -22,30 +20,3 @@ def test_user_agent(user_agent, expected): client = AsyncClient(api_key='123', api_url='http:\\test', user_agent=user_agent) assert client.user_agent == expected - - -@pytest.mark.asyncio -@mock.patch("zyte_api.aio.client._post_func") -async def test_request_raw_error(mock_post): - request_id = "abcd1234" - content = b"some content" - headers = {"request-id": request_id} - status = 521 - reason = "Some failure somewhere" - - response = mock.AsyncMock() - response.status = status - response.read.return_value = content - response.headers = headers - response.reason = "reason" - mock_post()().__aenter__.return_value = response - client = AsyncClient(api_key='a') - - with pytest.raises(RequestError) as excinfo: - await 
client.request_raw(query={}) - - assert f"(request_id={request_id}) reason" in excinfo.value.message - assert {"request-id": request_id} == excinfo.value.headers - assert status == excinfo.value.status - assert content == excinfo.value.response_content - assert request_id == excinfo.value.request_id diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index f4230bb..0525842 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -96,14 +96,11 @@ async def request(): stats.record_read() stats.record_request_error(content, self.agg_stats) - request_id = resp.headers.get("request-id") - message = f"(request_id={request_id}) {resp.reason}" - raise RequestError( request_info=resp.request_info, history=resp.history, status=resp.status, - message=message, + message=resp.reason, headers=resp.headers, response_content=content, request_id=request_id, From 3d8252815330f845df7bb0485262ce2325cddc81 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Wed, 25 Oct 2023 20:46:17 +0800 Subject: [PATCH 057/126] remove undefined request_id variable --- zyte_api/aio/client.py | 1 - 1 file changed, 1 deletion(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 0525842..a3fb331 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -103,7 +103,6 @@ async def request(): message=resp.reason, headers=resp.headers, response_content=content, - request_id=request_id, ) response = await resp.json() From 09d1366723dd2b9d87c575a8dbdf2953d343c4a1 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 2 Nov 2023 20:02:00 +0800 Subject: [PATCH 058/126] =?UTF-8?q?Bump=20version:=200.4.7=20=E2=86=92=200?= =?UTF-8?q?.4.8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- CHANGES.rst | 8 +++++++- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index a960335..5a0b6bd 100644 
--- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.7 +current_version = 0.4.8 commit = True tag = True tag_name = {new_version} diff --git a/CHANGES.rst b/CHANGES.rst index dd600b8..392d6a8 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,8 +1,14 @@ Changes ======= +0.4.8 (YYYY-MM-DD) +------------------ + +* Include the Zyte API request ID value in a new ``.request_id`` attribute + in :class:`zyte_api.aio.errors.RequestError`. + 0.4.7 (2023-09-26) ----------------------- +------------------ * ``AsyncClient`` now lets you set a custom user agent to send to Zyte API. diff --git a/docs/conf.py b/docs/conf.py index 4635aec..4fa4cfd 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -24,7 +24,7 @@ # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'0.4.7' +release = u'0.4.8' # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 1e4826d..5bf52d5 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.7' +__version__ = '0.4.8' From 094632f80efb04c980a765e68fcc8c489a7eb53e Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 2 Nov 2023 21:33:44 +0800 Subject: [PATCH 059/126] update release date for 0.4.8 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 392d6a8..b4912e3 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.4.8 (YYYY-MM-DD) +0.4.8 (2023-11-02) ------------------ * Include the Zyte API request ID value in a new ``.request_id`` attribute From 2b6c779afd92e6b5e2a0fefc0ff4e57db04c44b4 Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Thu, 2 Nov 2023 21:39:32 +0800 Subject: [PATCH 060/126] avoid Sphinx format to prevent PyPI RST error --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst 
b/CHANGES.rst index b4912e3..a39a513 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,7 +5,7 @@ Changes ------------------ * Include the Zyte API request ID value in a new ``.request_id`` attribute - in :class:`zyte_api.aio.errors.RequestError`. + in ``zyte_api.aio.errors.RequestError``. 0.4.7 (2023-09-26) ------------------ From d13737fd249e92b61ece707c85a48c5a3e9da98d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 Dec 2023 17:26:26 +0100 Subject: [PATCH 061/126] Add .readthedocs.yml (#56) --- .github/workflows/test.yml | 2 +- .readthedocs.yml | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 .readthedocs.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3df3bd..5189fb7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,7 +40,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.10'] + python-version: ['3.11'] tox-job: ["mypy", "docs"] steps: diff --git a/.readthedocs.yml b/.readthedocs.yml new file mode 100644 index 0000000..ead29a8 --- /dev/null +++ b/.readthedocs.yml @@ -0,0 +1,15 @@ +version: 2 +formats: all +sphinx: + configuration: docs/conf.py + fail_on_warning: true +build: + os: ubuntu-22.04 + tools: + # For available versions, see: + # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python + python: "3.11" # Keep in sync with .github/workflows/test.yml +python: + install: + - requirements: docs/requirements.txt + - path: . 
From c7b05eac76136265689b4c06f99b9f3bdb5a1bec Mon Sep 17 00:00:00 2001 From: Kevin Lloyd Bernal Date: Tue, 23 Jan 2024 10:02:55 +0800 Subject: [PATCH 062/126] remove Python 3.7 support --- .github/workflows/test.yml | 2 +- README.rst | 2 +- docs/install.rst | 2 +- setup.py | 1 - tox.ini | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index d3df3bd..6014c38 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.7', '3.8', '3.9', '3.10', '3.11'] + python-version: ['3.8', '3.9', '3.10', '3.11'] steps: - uses: actions/checkout@v2 diff --git a/README.rst b/README.rst index 75dc378..2128fe7 100644 --- a/README.rst +++ b/README.rst @@ -29,7 +29,7 @@ Installation pip install zyte-api -``zyte-api`` requires Python 3.7+. +``zyte-api`` requires Python 3.8+. API key ======= diff --git a/docs/install.rst b/docs/install.rst index f87ca0d..d034b04 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -8,7 +8,7 @@ Installation pip install zyte-api -``zyte-api`` requires Python 3.7+. +``zyte-api`` requires Python 3.8+. .. 
_api-key: diff --git a/setup.py b/setup.py index f42faf7..658699a 100755 --- a/setup.py +++ b/setup.py @@ -40,7 +40,6 @@ def get_version(): 'Natural Language :: English', 'Operating System :: OS Independent', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', diff --git a/tox.ini b/tox.ini index 03e50bc..280b2ba 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py37,py38,py39,py310,py311,mypy,docs +envlist = py38,py39,py310,py311,mypy,docs [testenv] deps = From 931a8103b5dfc63f40be4e663306600c440b7fa7 Mon Sep 17 00:00:00 2001 From: Adnan Awan Date: Thu, 1 Feb 2024 17:50:53 +0500 Subject: [PATCH 063/126] Provide an option to store error responses in CLI (#47) --- requirements-test.txt | 1 + tests/test_main.py | 132 ++++++++++++++++++++++++++++++ zyte_api/__main__.py | 184 ++++++++++++++++++++++++++---------------- 3 files changed, 247 insertions(+), 70 deletions(-) create mode 100644 tests/test_main.py diff --git a/requirements-test.txt b/requirements-test.txt index 847062e..e693788 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -1,3 +1,4 @@ pytest pytest-cov responses +pytest-asyncio diff --git a/tests/test_main.py b/tests/test_main.py new file mode 100644 index 0000000..0087774 --- /dev/null +++ b/tests/test_main.py @@ -0,0 +1,132 @@ +import json +import os +from json import JSONDecodeError +from unittest.mock import Mock, patch, AsyncMock + +import pytest + +from zyte_api.__main__ import run + + +class RequestError(Exception): + @property + def parsed(self): + mock = Mock( + response_body=Mock(decode=Mock(return_value=forbidden_domain_response())) + ) + return mock + + +def get_json_content(file_object): + if not file_object: + return + + file_path = file_object.name + try: + with open(file_path, "r") as file: + return json.load(file) + except JSONDecodeError: + pass + + 
+def delete_file(file_path): + try: + os.remove(file_path) + print(f"File '{file_path}' has been deleted successfully.") + except FileNotFoundError: + print(f"File '{file_path}' not found. Unable to delete.") + + +def forbidden_domain_response(): + response_str = { + "type": "/download/temporary-error", + "title": "Temporary Downloading Error", + "status": 520, + "detail": "There is a downloading problem which might be temporary. Retry in N seconds from 'Retry-After' header or open a support ticket from https://support.zyte.com/support/tickets/new if it fails consistently.", + } + return response_str + + +async def fake_exception(value=True): + # Simulating an error condition + if value: + raise RequestError() + + create_session_mock = AsyncMock() + return await create_session_mock.coroutine() + + +@pytest.mark.parametrize( + "queries,expected_response,store_errors,exception", + ( + ( + # test if it stores the error(s) also by adding flag + ( + [ + { + "url": "https://forbidden.example", + "browserHtml": True, + "echoData": "https://forbidden.example", + } + ], + forbidden_domain_response(), + True, + fake_exception, + ), + # test with store_errors=False + ( + [ + { + "url": "https://forbidden.example", + "browserHtml": True, + "echoData": "https://forbidden.example", + } + ], + None, # expected response should be None + False, + fake_exception, + ), + ) + ), +) +@pytest.mark.asyncio +async def test_run(queries, expected_response, store_errors, exception): + temporary_file = open("temporary_file.jsonl", "w") + n_conn = 5 + stop_on_errors = False + api_url = "https://example.com" + api_key = "fake_key" + retry_errors = True + + # Create a mock for AsyncClient + async_client_mock = Mock() + + # Create a mock for the request_parallel_as_completed method + request_parallel_mock = Mock() + async_client_mock.return_value.request_parallel_as_completed = request_parallel_mock + + # Patch the AsyncClient class in __main__ with the mock + with 
patch("zyte_api.__main__.AsyncClient", async_client_mock), patch( + "zyte_api.__main__.create_session" + ) as create_session_mock: + # Mock create_session to return an AsyncMock + create_session_mock.return_value = AsyncMock() + + # Set up the AsyncClient instance to return the mocked iterator + async_client_mock.return_value.request_parallel_as_completed.return_value = [ + exception(), + ] + + # Call the run function with the mocked AsyncClient + await run( + queries=queries, + out=temporary_file, + n_conn=n_conn, + stop_on_errors=stop_on_errors, + api_url=api_url, + api_key=api_key, + retry_errors=retry_errors, + store_errors=store_errors, + ) + + assert get_json_content(temporary_file) == expected_response diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index ae9a69a..3e03e70 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -1,11 +1,11 @@ """ Basic command-line interface for Zyte API. """ import argparse -import json -import sys import asyncio +import json import logging import random +import sys import tqdm from tenacity import retry_if_exception @@ -14,45 +14,64 @@ create_session, AsyncClient, ) +from zyte_api.aio.retry import RetryFactory, _is_throttling_error from zyte_api.constants import ENV_VARIABLE, API_URL from zyte_api.utils import _guess_intype -from zyte_api.aio.retry import RetryFactory, _is_throttling_error class DontRetryErrorsFactory(RetryFactory): retry_condition = retry_if_exception(_is_throttling_error) -logger = logging.getLogger('zyte_api') +logger = logging.getLogger("zyte_api") _UNSET = object() -async def run(queries, out, *, n_conn, stop_on_errors, api_url, - api_key=None, retry_errors=True): +async def run( + queries, + out, + *, + n_conn, + stop_on_errors, + api_url, + api_key=None, + retry_errors=True, + store_errors=None, +): + def write_output(content): + json.dump(content, out, ensure_ascii=False) + out.write("\n") + out.flush() + pbar.update() + retrying = None if retry_errors else 
DontRetryErrorsFactory().build() - client = AsyncClient(n_conn=n_conn, api_key=api_key, api_url=api_url, - retrying=retrying) + client = AsyncClient( + n_conn=n_conn, api_key=api_key, api_url=api_url, retrying=retrying + ) async with create_session(connection_pool_size=n_conn) as session: result_iter = client.request_parallel_as_completed( queries=queries, session=session, ) - pbar = tqdm.tqdm(smoothing=0, leave=True, total=len(queries), miniters=1, - unit="url") + pbar = tqdm.tqdm( + smoothing=0, leave=True, total=len(queries), miniters=1, unit="url" + ) pbar.set_postfix_str(str(client.agg_stats)) try: for fut in result_iter: try: result = await fut - json.dump(result, out, ensure_ascii=False) - out.write("\n") - out.flush() - pbar.update() except Exception as e: + if store_errors: + write_output(e.parsed.response_body.decode()) + if stop_on_errors: raise + logger.error(str(e)) + else: + write_output(result) finally: pbar.set_postfix_str(str(client.agg_stats)) finally: @@ -72,10 +91,7 @@ def read_input(input_fp, intype): urls = [u.strip() for u in lines if u.strip()] records = [{"url": url, "browserHtml": True} for url in urls] else: - records = [ - json.loads(line.strip()) - for line in lines if line.strip() - ] + records = [json.loads(line.strip()) for line in lines if line.strip()] # Automatically replicating the url in echoData to being able to # to match URLs with content in the responses for record in records: @@ -83,76 +99,104 @@ def read_input(input_fp, intype): return records -def _main(program_name='zyte-api'): - """ Process urls from input file through Zyte API """ +def _main(program_name="zyte-api"): + """Process urls from input file through Zyte API""" p = argparse.ArgumentParser( prog=program_name, description=""" Process input URLs from a file using Zyte API. """, ) - p.add_argument("input", - type=argparse.FileType("r", encoding='utf8'), - help="Input file with urls, url per line by default. 
The " - "Format can be changed using `--intype` argument.") - p.add_argument("--intype", default=_UNSET, choices=["txt", "jl"], - help="Type of the input file. " - "Allowed values are 'txt' (1 URL per line) and 'jl' " - "(JSON Lines file, each object describing the " - "parameters of a request). " - "If not specified, the input type is guessed based on " - "the input file name extension (.jl, .jsonl, .txt) or " - "content, and assumed to be txt if guessing fails.") - p.add_argument("--limit", type=int, - help="Max number of URLs to take from the input") - p.add_argument("--output", "-o", - default=sys.stdout, - type=argparse.FileType("w", encoding='utf8'), - help=".jsonlines file to store extracted data. " - "By default, results are printed to stdout.") - p.add_argument("--n-conn", type=int, default=20, - help="number of connections to the API server " - "(default: %(default)s)") - p.add_argument("--api-key", - help="Zyte API key. " - "You can also set %s environment variable instead " - "of using this option." % ENV_VARIABLE) - p.add_argument("--api-url", - help="Zyte API endpoint (default: %(default)s)", - default=API_URL) - p.add_argument("--loglevel", "-L", default="INFO", - choices=["DEBUG", "INFO", "WARNING", "ERROR"], - help="log level (default: %(default)s)") + p.add_argument( + "input", + type=argparse.FileType("r", encoding="utf8"), + help="Input file with urls, url per line by default. The " + "Format can be changed using `--intype` argument.", + ) + p.add_argument( + "--intype", + default=_UNSET, + choices=["txt", "jl"], + help="Type of the input file. " + "Allowed values are 'txt' (1 URL per line) and 'jl' " + "(JSON Lines file, each object describing the " + "parameters of a request). 
" + "If not specified, the input type is guessed based on " + "the input file name extension (.jl, .jsonl, .txt) or " + "content, and assumed to be txt if guessing fails.", + ) + p.add_argument( + "--limit", type=int, help="Max number of URLs to take from the input" + ) + p.add_argument( + "--output", + "-o", + default=sys.stdout, + type=argparse.FileType("w", encoding="utf8"), + help=".jsonlines file to store extracted data. " + "By default, results are printed to stdout.", + ) + p.add_argument( + "--n-conn", + type=int, + default=20, + help="number of connections to the API server " "(default: %(default)s)", + ) + p.add_argument( + "--api-key", + help="Zyte API key. " + "You can also set %s environment variable instead " + "of using this option." % ENV_VARIABLE, + ) + p.add_argument( + "--api-url", help="Zyte API endpoint (default: %(default)s)", default=API_URL + ) + p.add_argument( + "--loglevel", + "-L", + default="INFO", + choices=["DEBUG", "INFO", "WARNING", "ERROR"], + help="log level (default: %(default)s)", + ) p.add_argument("--shuffle", help="Shuffle input URLs", action="store_true") - p.add_argument("--dont-retry-errors", - help="Don't retry request and network errors", - action="store_true") - args = p.parse_args() - logging.basicConfig( - stream=sys.stderr, - level=getattr(logging, args.loglevel) + p.add_argument( + "--dont-retry-errors", + help="Don't retry request and network errors", + action="store_true", + ) + p.add_argument( + "--store-errors", + help="when set to true, it includes all types of responses, and when set to false," + " it includes only error-free responses in the output.", ) + args = p.parse_args() + logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) queries = read_input(args.input, args.intype) if args.shuffle: random.shuffle(queries) if args.limit: - queries = queries[:args.limit] + queries = queries[: args.limit] - logger.info(f"Loaded {len(queries)} urls from {args.input.name}; shuffled: 
{args.shuffle}") + logger.info( + f"Loaded {len(queries)} urls from {args.input.name}; shuffled: {args.shuffle}" + ) logger.info(f"Running Zyte API (connections: {args.n_conn})") loop = asyncio.get_event_loop() - coro = run(queries, - out=args.output, - n_conn=args.n_conn, - stop_on_errors=False, - api_url=args.api_url, - api_key=args.api_key, - retry_errors=not args.dont_retry_errors) + coro = run( + queries, + out=args.output, + n_conn=args.n_conn, + stop_on_errors=False, + api_url=args.api_url, + api_key=args.api_key, + retry_errors=not args.dont_retry_errors, + store_errors=args.store_errors, + ) loop.run_until_complete(coro) loop.close() -if __name__ == '__main__': - _main(program_name='python -m zyte_api') +if __name__ == "__main__": + _main(program_name="python -m zyte_api") From 5b94c9082275ddb013444547a47bf9153a176d90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 13 Mar 2024 15:57:24 +0100 Subject: [PATCH 064/126] Replace AsyncClient with AsyncZyteAPI --- docs/asyncio_api.rst | 33 +++------ pytest.ini | 4 ++ requirements-test.txt | 4 -- tests/__init__.py | 0 tests/conftest.py | 9 +++ tests/mockserver.py | 130 +++++++++++++++++++++++++++++++++++ tests/test_async.py | 44 ++++++++++++ tox.ini | 7 +- zyte_api/__init__.py | 5 +- zyte_api/_async.py | 152 +++++++++++++++++++++++++++++++++++++++++ zyte_api/_utils.py | 19 ++++++ zyte_api/aio/client.py | 11 +++ 12 files changed, 390 insertions(+), 28 deletions(-) create mode 100644 pytest.ini delete mode 100644 requirements-test.txt create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py create mode 100644 tests/mockserver.py create mode 100644 tests/test_async.py create mode 100644 zyte_api/_async.py create mode 100644 zyte_api/_utils.py diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index f06f4ae..4792660 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -4,29 +4,26 @@ asyncio API =========== -Create an instance of the ``AsyncClient`` to use 
the asyncio client API. +Create an instance of the ``AsyncZyteAPI`` to use the asyncio client API. You can use the method ``request_raw`` to perform individual requests: .. code-block:: python import asyncio - from zyte_api.aio.client import AsyncClient - - client = AsyncClient(api_key="YOUR_API_KEY") + from zyte_api import AsyncZyteAPI + client = AsyncZyteAPI(api_key="YOUR_API_KEY") async def single_request(url): - return await client.request_raw({"url": url, "browserHtml": True}) - + return await client.get({"url": url, "browserHtml": True}) response = asyncio.run(single_request("https://books.toscrape.com")) - # Do something with the response… .. tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment variable instead `. -There is also ``request_parallel_as_completed`` method, which allows -to process many URLs in parallel, using multiple connections: +There is also an ``iter`` method, which allows to process many URLs in +parallel, using multiple connections: .. 
code-block:: python @@ -34,17 +31,17 @@ to process many URLs in parallel, using multiple connections: import json import sys - from zyte_api.aio.client import AsyncClient, create_session + from zyte_api import AsyncZyteAPI, create_session from zyte_api.aio.errors import RequestError async def extract_from(urls, n_conn): - client = AsyncClient(n_conn=n_conn) + client = AsyncZyteAPI(n_conn=n_conn) requests = [ {"url": url, "browserHtml": True} for url in urls ] async with create_session(n_conn) as session: - res_iter = client.request_parallel_as_completed(requests, session=session) + res_iter = client.iter(requests, session=session) for fut in res_iter: try: res = await fut @@ -57,13 +54,5 @@ to process many URLs in parallel, using multiple connections: urls = ["https://toscrape.com", "https://books.toscrape.com"] asyncio.run(extract_from(urls, n_conn=15)) -``request_parallel_as_completed`` is modelled after ``asyncio.as_completed`` -(see https://docs.python.org/3/library/asyncio-task.html#asyncio.as_completed), -and actually uses it under the hood. - -``request_parallel_as_completed`` and ``request_raw`` methods handle -throttling (http 429 errors) and network errors, retrying a request in -these cases. - -CLI interface implementation (``zyte_api/__main__.py``) can serve -as an usage example. +``iter`` and ``get`` methods handle throttling (http 429 errors) and network +errors, retrying a request in these cases. 
diff --git a/pytest.ini b/pytest.ini new file mode 100644 index 0000000..5260d78 --- /dev/null +++ b/pytest.ini @@ -0,0 +1,4 @@ +[pytest] +filterwarnings = + ignore:The zyte_api\.aio\.client module is deprecated:DeprecationWarning + diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index e693788..0000000 --- a/requirements-test.txt +++ /dev/null @@ -1,4 +0,0 @@ -pytest -pytest-cov -responses -pytest-asyncio diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..db2b302 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,9 @@ +import pytest + + +@pytest.fixture(scope="session") +def mockserver(): + from .mockserver import MockServer + + with MockServer() as server: + yield server diff --git a/tests/mockserver.py b/tests/mockserver.py new file mode 100644 index 0000000..ea8fffb --- /dev/null +++ b/tests/mockserver.py @@ -0,0 +1,130 @@ +import argparse +import json +import socket +import sys +import time +from base64 import b64encode +from contextlib import asynccontextmanager +from importlib import import_module +from subprocess import PIPE, Popen +from typing import Any, Dict, Optional +from urllib.parse import urlparse + +from pytest_twisted import ensureDeferred +from twisted.internet import reactor +from twisted.internet.defer import Deferred +from twisted.internet.task import deferLater +from twisted.web.resource import Resource +from twisted.web.server import NOT_DONE_YET, Site + + +def get_ephemeral_port(): + s = socket.socket() + s.bind(("", 0)) + return s.getsockname()[1] + + +class DefaultResource(Resource): + request_count = 0 + + def getChild(self, path, request): + return self + + def render_POST(self, request): + request_data = json.loads(request.content.read()) + + request.responseHeaders.setRawHeaders( + b"Content-Type", + [b"application/json"], + ) + request.responseHeaders.setRawHeaders( + 
b"request-id", + [b"abcd1234"], + ) + + url = request_data["url"] + domain = urlparse(url).netloc + if domain == "exception.example": + request.setResponseCode(401) + response_data = { + "status": 401, + "type": "/auth/key-not-found", + "title": "Authentication Key Not Found", + "detail": "The authentication key is not valid or can't be matched.", + } + return json.dumps(response_data).encode() + + response_data: Dict[str, Any] = { + "url": url, + } + + assert "httpResponseBody" in request_data + html = "Hello

World!

" + body = b64encode(html.encode()).decode() + response_data["httpResponseBody"] = body + + return json.dumps(response_data).encode() + + +class MockServer: + def __init__(self, resource=None, port=None): + resource = resource or DefaultResource + self.resource = "{}.{}".format(resource.__module__, resource.__name__) + self.proc = None + self.host = socket.gethostbyname(socket.gethostname()) + self.port = port or get_ephemeral_port() + self.root_url = "http://%s:%d" % (self.host, self.port) + + def __enter__(self): + self.proc = Popen( + [ + sys.executable, + "-u", + "-m", + "tests.mockserver", + self.resource, + "--port", + str(self.port), + ], + stdout=PIPE, + ) + assert self.proc.stdout is not None + self.proc.stdout.readline() + return self + + def __exit__(self, exc_type, exc_value, traceback): + assert self.proc is not None + self.proc.kill() + self.proc.wait() + time.sleep(0.2) + + def urljoin(self, path): + return self.root_url + path + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("resource") + parser.add_argument("--port", type=int) + args = parser.parse_args() + module_name, name = args.resource.rsplit(".", 1) + sys.path.append(".") + resource = getattr(import_module(module_name), name)() + # Typing issue: https://github.com/twisted/twisted/issues/9909 + http_port = reactor.listenTCP(args.port, Site(resource)) # type: ignore[attr-defined] + + def print_listening(): + host = http_port.getHost() + print( + "Mock server {} running at http://{}:{}".format( + resource, host.host, host.port + ) + ) + + # Typing issue: https://github.com/twisted/twisted/issues/9909 + reactor.callWhenRunning(print_listening) # type: ignore[attr-defined] + reactor.run() # type: ignore[attr-defined] + + +if __name__ == "__main__": + main() diff --git a/tests/test_async.py b/tests/test_async.py new file mode 100644 index 0000000..c405aa9 --- /dev/null +++ b/tests/test_async.py @@ -0,0 +1,44 @@ +from types import GeneratorType + +from zyte_api import 
AsyncZyteAPI +from zyte_api.apikey import NoApiKey + +import pytest + + +def test_api_key(): + AsyncZyteAPI(api_key="a") + with pytest.raises(NoApiKey): + AsyncZyteAPI() + + +@pytest.mark.asyncio +async def test_get(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + expected_result = {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="} + actual_result = await client.get({"url": "https://a.example", "httpResponseBody": True}) + assert actual_result == expected_result + + +@pytest.mark.asyncio +async def test_iter(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + expected_results = [ + {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + Exception, + {"url": "https://b.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + ] + actual_results = [] + async for actual_result in client.iter(queries): + actual_results.append(actual_result) + assert len(actual_results) == len(expected_results) + for actual_result in actual_results: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results diff --git a/tox.ini b/tox.ini index 280b2ba..fca46c5 100644 --- a/tox.ini +++ b/tox.ini @@ -3,7 +3,12 @@ envlist = py38,py39,py310,py311,mypy,docs [testenv] deps = - -rrequirements-test.txt + pytest + pytest-asyncio + pytest-cov + pytest-twisted + responses + twisted commands = py.test \ diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index a903afd..ed9f58f 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -1,3 +1,6 @@ """ Python client libraries and 
command line utilities for Zyte API -""" \ No newline at end of file +""" + +from ._async import AsyncZyteAPI +from ._utils import create_session diff --git a/zyte_api/_async.py b/zyte_api/_async.py new file mode 100644 index 0000000..9fc2dfb --- /dev/null +++ b/zyte_api/_async.py @@ -0,0 +1,152 @@ +import asyncio +import time +from functools import partial +from typing import Optional, Iterator, List + +import aiohttp +from tenacity import AsyncRetrying + +from .aio.errors import RequestError +from .aio.retry import zyte_api_retrying +from .apikey import get_apikey +from .constants import API_URL, API_TIMEOUT +from .stats import AggStats, ResponseStats +from .utils import USER_AGENT, _process_query +from ._utils import _AIO_API_TIMEOUT, create_session + + +def _post_func(session): + """ Return a function to send a POST request """ + if session is None: + return partial(aiohttp.request, + method='POST', + timeout=_AIO_API_TIMEOUT) + else: + return session.post + + +class AsyncZyteAPI: + def __init__( + self, + *, + api_key=None, + api_url=API_URL, + n_conn=15, + retrying: Optional[AsyncRetrying] = None, + user_agent: Optional[str] = None, + ): + self.api_key = get_apikey(api_key) + self.api_url = api_url + self.n_conn = n_conn + self.agg_stats = AggStats() + self.retrying = retrying or zyte_api_retrying + self.user_agent = user_agent or USER_AGENT + + async def get( + self, + query: dict, + *, + endpoint: str = 'extract', + session=None, + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ): + retrying = retrying or self.retrying + post = _post_func(session) + auth = aiohttp.BasicAuth(self.api_key) + headers = { + 'User-Agent': self.user_agent, + 'Accept-Encoding': 'br' + } + + response_stats = [] + start_global = time.perf_counter() + + async def request(): + stats = ResponseStats.create(start_global) + self.agg_stats.n_attempts += 1 + + post_kwargs = dict( + url=self.api_url + endpoint, + json=_process_query(query), + auth=auth, + 
headers=headers, + ) + + try: + async with post(**post_kwargs) as resp: + stats.record_connected(resp.status, self.agg_stats) + if resp.status >= 400: + content = await resp.read() + resp.release() + stats.record_read() + stats.record_request_error(content, self.agg_stats) + + raise RequestError( + request_info=resp.request_info, + history=resp.history, + status=resp.status, + message=resp.reason, + headers=resp.headers, + response_content=content, + ) + + response = await resp.json() + stats.record_read(self.agg_stats) + return response + except Exception as e: + if not isinstance(e, RequestError): + self.agg_stats.n_errors += 1 + stats.record_exception(e, agg_stats=self.agg_stats) + raise + finally: + response_stats.append(stats) + + if handle_retries: + request = retrying.wraps(request) + + try: + # Try to make a request + result = await request() + self.agg_stats.n_success += 1 + except Exception: + self.agg_stats.n_fatal_errors += 1 + raise + + return result + + async def iter( + self, + queries: List[dict], + *, + endpoint: str = 'extract', + session: Optional[aiohttp.ClientSession] = None, + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ) -> Iterator[asyncio.Future]: + """ Send multiple requests to Zyte API in parallel. + Return an `asyncio.as_completed` iterator. + + ``queries`` is a list of requests to process (dicts). + + ``session`` is an optional aiohttp.ClientSession object. + Set the session TCPConnector limit to a value greater than + the number of connections. 
+ """ + sem = asyncio.Semaphore(self.n_conn) + + async def _request(query): + async with sem: + return await self.get( + query, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, + ) + + for result in asyncio.as_completed([_request(query) for query in queries]): + try: + yield await result + except Exception as exception: + yield exception diff --git a/zyte_api/_utils.py b/zyte_api/_utils.py new file mode 100644 index 0000000..e547fb5 --- /dev/null +++ b/zyte_api/_utils.py @@ -0,0 +1,19 @@ +import aiohttp +from aiohttp import TCPConnector + +from .constants import API_TIMEOUT + + +# 120 seconds is probably too long, but we are concerned about the case with +# many concurrent requests and some processing logic running in the same reactor, +# thus, saturating the CPU. This will make timeouts more likely. +_AIO_API_TIMEOUT = aiohttp.ClientTimeout(total=API_TIMEOUT + 120) + + +def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: + """ Create a session with parameters suited for Zyte API """ + kwargs.setdefault('timeout', _AIO_API_TIMEOUT) + if "connector" not in kwargs: + kwargs["connector"] = TCPConnector(limit=connection_pool_size, + force_close=True) + return aiohttp.ClientSession(**kwargs) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index a3fb331..1cf1d98 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -6,6 +6,7 @@ import time from functools import partial from typing import Optional, Iterator, List +from warnings import warn import aiohttp from aiohttp import TCPConnector @@ -19,6 +20,16 @@ from ..utils import USER_AGENT, _process_query +warn( + ( + "The zyte_api.aio.client module is deprecated. Replace AsyncClient " + "with zyte_api.AsyncZyteAPI (note that method names are different) " + "and create_session with zyte_api.create_session." 
+ ), + DeprecationWarning, +) + + # 120 seconds is probably too long, but we are concerned about the case with # many concurrent requests and some processing logic running in the same reactor, # thus, saturating the CPU. This will make timeouts more likely. From 794083cfa3c5682bf9943bdb412946576e399fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 13 Mar 2024 16:04:57 +0100 Subject: [PATCH 065/126] Fix typing --- zyte_api/_async.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 9fc2dfb..651fa8c 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,7 +1,7 @@ import asyncio import time from functools import partial -from typing import Optional, Iterator, List +from typing import AsyncGenerator, Iterator, List, Optional, Union import aiohttp from tenacity import AsyncRetrying @@ -123,7 +123,7 @@ async def iter( session: Optional[aiohttp.ClientSession] = None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, - ) -> Iterator[asyncio.Future]: + ) -> AsyncGenerator[Union[dict, Exception], None]: """ Send multiple requests to Zyte API in parallel. Return an `asyncio.as_completed` iterator. From eafd59febfb3db20d750e6d02eb3676d73a0b72a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 14 Mar 2024 21:48:39 +0100 Subject: [PATCH 066/126] Clarify that iter does not yield in the original order --- docs/asyncio_api.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index 4792660..2c6e568 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -54,5 +54,7 @@ parallel, using multiple connections: urls = ["https://toscrape.com", "https://books.toscrape.com"] asyncio.run(extract_from(urls, n_conn=15)) +``iter`` yields results as they come, not necessarily in their original order. 
+ ``iter`` and ``get`` methods handle throttling (http 429 errors) and network errors, retrying a request in these cases. From 95b85e71f39a79efad4b5445ad3b371cc60de7c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 14 Mar 2024 21:53:19 +0100 Subject: [PATCH 067/126] Reuse code --- zyte_api/aio/client.py | 27 ++------------------------- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 1cf1d98..68f2530 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -14,6 +14,8 @@ from .errors import RequestError from .retry import zyte_api_retrying +from .._async import _post_func +from .._utils import _AIO_API_TIMEOUT as AIO_API_TIMEOUT, create_session from ..apikey import get_apikey from ..constants import API_URL, API_TIMEOUT from ..stats import AggStats, ResponseStats @@ -30,31 +32,6 @@ ) -# 120 seconds is probably too long, but we are concerned about the case with -# many concurrent requests and some processing logic running in the same reactor, -# thus, saturating the CPU. This will make timeouts more likely. 
-AIO_API_TIMEOUT = aiohttp.ClientTimeout(total=API_TIMEOUT + 120) - - -def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: - """ Create a session with parameters suited for Zyte API """ - kwargs.setdefault('timeout', AIO_API_TIMEOUT) - if "connector" not in kwargs: - kwargs["connector"] = TCPConnector(limit=connection_pool_size, - force_close=True) - return aiohttp.ClientSession(**kwargs) - - -def _post_func(session): - """ Return a function to send a POST request """ - if session is None: - return partial(aiohttp.request, - method='POST', - timeout=AIO_API_TIMEOUT) - else: - return session.post - - class AsyncClient: def __init__(self, *, api_key=None, From 296834311a84bc0fdea3d844d40ec0cd4a9d0cef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 14 Mar 2024 22:04:25 +0100 Subject: [PATCH 068/126] Revert iter to an iterator --- tests/test_async.py | 6 +++++- zyte_api/_async.py | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/test_async.py b/tests/test_async.py index c405aa9..0d15df0 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -34,7 +34,11 @@ async def test_iter(mockserver): {"url": "https://b.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, ] actual_results = [] - async for actual_result in client.iter(queries): + for future in client.iter(queries): + try: + actual_result = await future + except Exception as exception: + actual_result = exception actual_results.append(actual_result) assert len(actual_results) == len(expected_results) for actual_result in actual_results: diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 651fa8c..57ecd94 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,7 +1,7 @@ import asyncio import time from functools import partial -from typing import AsyncGenerator, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional, 
Union import aiohttp from tenacity import AsyncRetrying @@ -115,7 +115,7 @@ async def request(): return result - async def iter( + def iter( self, queries: List[dict], *, @@ -123,9 +123,13 @@ async def iter( session: Optional[aiohttp.ClientSession] = None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, - ) -> AsyncGenerator[Union[dict, Exception], None]: - """ Send multiple requests to Zyte API in parallel. - Return an `asyncio.as_completed` iterator. + ) -> Iterator[asyncio.Future[Dict[str, Any]]]: + """Send multiple requests to Zyte API in parallel, and return an + iterator of futures for responses. + + `Responses are iterated in arrival order + `__, + i.e. response order may not match the order in the original query. ``queries`` is a list of requests to process (dicts). @@ -145,8 +149,4 @@ async def _request(query): retrying=retrying, ) - for result in asyncio.as_completed([_request(query) for query in queries]): - try: - yield await result - except Exception as exception: - yield exception + return asyncio.as_completed([_request(query) for query in queries]) From 4304637340d5a55312a1cdfca73d0ef4a865b3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 14 Mar 2024 22:10:18 +0100 Subject: [PATCH 069/126] Restore Python 3.8 support --- zyte_api/_async.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 57ecd94..7ffb02f 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,7 +1,7 @@ import asyncio import time from functools import partial -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional, TYPE_CHECKING, Union import aiohttp from tenacity import AsyncRetrying @@ -15,6 +15,12 @@ from ._utils import _AIO_API_TIMEOUT, create_session +if TYPE_CHECKING: + _ResponseFuture = asyncio.Future[Dict[str, Any]] +else: + _ResponseFuture = asyncio.Future + + def _post_func(session): """ 
Return a function to send a POST request """ if session is None: @@ -123,7 +129,7 @@ def iter( session: Optional[aiohttp.ClientSession] = None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, - ) -> Iterator[asyncio.Future[Dict[str, Any]]]: + ) -> Iterator[_ResponseFuture]: """Send multiple requests to Zyte API in parallel, and return an iterator of futures for responses. From 5163ee6d29129ca880722b997beb385cf4943714 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 14 Mar 2024 22:11:11 +0100 Subject: [PATCH 070/126] Clarify the reason for the use of TYPE_CHECKING --- zyte_api/_async.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 7ffb02f..8156bde 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -18,7 +18,7 @@ if TYPE_CHECKING: _ResponseFuture = asyncio.Future[Dict[str, Any]] else: - _ResponseFuture = asyncio.Future + _ResponseFuture = asyncio.Future # Python 3.8 support def _post_func(session): From d31e2d954ba3bf972d28777c1c0c8a3366fb9efd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 Mar 2024 09:08:06 +0100 Subject: [PATCH 071/126] Use pre-commit (#64) --- .pre-commit-config.yaml | 19 +++++++++ docs/asyncio_api.rst | 9 ++-- docs/conf.py | 88 ++++++++++++++++++++++++---------------- pyproject.toml | 6 +++ setup.cfg | 12 ++++++ setup.py | 61 ++++++++++++++-------------- tests/mockserver.py | 8 +--- tests/test_async.py | 23 +++++++---- tests/test_client.py | 6 +-- tests/test_main.py | 2 +- tox.ini | 4 ++ zyte_api/__main__.py | 9 ++-- zyte_api/__version__.py | 2 +- zyte_api/_async.py | 22 ++++------ zyte_api/_utils.py | 8 ++-- zyte_api/aio/__init__.py | 2 +- zyte_api/aio/client.py | 68 +++++++++++++++---------------- zyte_api/aio/errors.py | 11 +++-- zyte_api/aio/retry.py | 26 ++++++------ zyte_api/apikey.py | 8 ++-- zyte_api/constants.py | 4 +- zyte_api/errors.py | 13 +++--- zyte_api/stats.py | 45 ++++++++++++-------- 
zyte_api/utils.py | 4 +- 24 files changed, 263 insertions(+), 197 deletions(-) create mode 100644 .pre-commit-config.yaml create mode 100644 pyproject.toml create mode 100644 setup.cfg diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..c5420cd --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,19 @@ +repos: +- repo: https://github.com/PyCQA/isort + rev: 5.13.1 + hooks: + - id: isort +- repo: https://github.com/psf/black + rev: 24.3.0 + hooks: + - id: black +- repo: https://github.com/pycqa/flake8 + rev: 7.0.0 + hooks: + - id: flake8 +- repo: https://github.com/adamchainz/blacken-docs + rev: 1.16.0 + hooks: + - id: blacken-docs + additional_dependencies: + - black==24.3.0 diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index 2c6e568..c9a1e4a 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -14,9 +14,11 @@ You can use the method ``request_raw`` to perform individual requests: client = AsyncZyteAPI(api_key="YOUR_API_KEY") + async def single_request(url): return await client.get({"url": url, "browserHtml": True}) + response = asyncio.run(single_request("https://books.toscrape.com")) .. 
tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment @@ -34,12 +36,10 @@ parallel, using multiple connections: from zyte_api import AsyncZyteAPI, create_session from zyte_api.aio.errors import RequestError + async def extract_from(urls, n_conn): client = AsyncZyteAPI(n_conn=n_conn) - requests = [ - {"url": url, "browserHtml": True} - for url in urls - ] + requests = [{"url": url, "browserHtml": True} for url in urls] async with create_session(n_conn) as session: res_iter = client.iter(requests, session=session) for fut in res_iter: @@ -51,6 +51,7 @@ parallel, using multiple connections: print(e, file=sys.stderr) raise + urls = ["https://toscrape.com", "https://books.toscrape.com"] asyncio.run(extract_from(urls, n_conn=15)) diff --git a/docs/conf.py b/docs/conf.py index 4fa4cfd..5002ec5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,19 +12,22 @@ # import os import sys -sys.path.insert(0, os.path.abspath('../')) + +import sphinx_rtd_theme + +sys.path.insert(0, os.path.abspath("../")) # -- Project information ----------------------------------------------------- -project = u'python-zyte-api' -copyright = u'2021, Zyte Group Ltd' -author = u'Zyte Group Ltd' +project = "python-zyte-api" +copyright = "2021, Zyte Group Ltd" +author = "Zyte Group Ltd" # The short X.Y version -version = u'' +version = "" # The full version, including alpha/beta/rc tags -release = u'0.4.8' +release = "0.4.8" # -- General configuration --------------------------------------------------- @@ -37,37 +40,37 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.intersphinx', - 'sphinx.ext.ifconfig', - 'sphinx.ext.viewcode', - 'sphinx.ext.githubpages', - 'sphinx.ext.autosummary', + "sphinx.ext.autodoc", + "sphinx.ext.intersphinx", + "sphinx.ext.ifconfig", + "sphinx.ext.viewcode", + "sphinx.ext.githubpages", + "sphinx.ext.autosummary", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = '.rst' +source_suffix = ".rst" # The master toctree document. -master_doc = 'index' +master_doc = "index" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = 'en' +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] # The name of the Pygments (syntax highlighting) style to use. pygments_style = None @@ -78,12 +81,11 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom themes here, relative to this directory. 
# Add path to the RTD explicitly to robustify builds (otherwise might # fail in a clean Debian build env) -import sphinx_rtd_theme html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] # Theme options are theme-specific and customize the look and feel of a theme @@ -111,7 +113,7 @@ # -- Options for HTMLHelp output --------------------------------------------- # Output file base name for HTML help builder. -htmlhelp_basename = 'python-zyte-apidoc' +htmlhelp_basename = "python-zyte-apidoc" # -- Options for LaTeX output ------------------------------------------------ @@ -120,15 +122,12 @@ # The paper size ('letterpaper' or 'a4paper'). # # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). # # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. # # 'preamble': '', - # Latex figure (float) alignment # # 'figure_align': 'htbp', @@ -138,8 +137,13 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'python-zyte-api.tex', u'python-zyte-api Documentation', - u'Zyte Group Ltd', 'manual'), + ( + master_doc, + "python-zyte-api.tex", + "python-zyte-api Documentation", + "Zyte Group Ltd", + "manual", + ), ] @@ -148,8 +152,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). 
man_pages = [ - (master_doc, 'python-zyte-api', u'python-zyte-api Documentation', - [author], 1) + (master_doc, "python-zyte-api", "python-zyte-api Documentation", [author], 1) ] @@ -159,9 +162,15 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'python-zyte-api', u'python-zyte-api Documentation', - author, 'python-zyte-api', 'One line description of project.', - 'Miscellaneous'), + ( + master_doc, + "python-zyte-api", + "python-zyte-api Documentation", + author, + "python-zyte-api", + "One line description of project.", + "Miscellaneous", + ), ] @@ -180,22 +189,31 @@ # epub_uid = '' # A list of files that should not be packed into the epub file. -epub_exclude_files = ['search.html'] +epub_exclude_files = ["search.html"] # -- Extension configuration ------------------------------------------------- # -- Options for intersphinx extension --------------------------------------- intersphinx_mapping = { - 'python': ('https://docs.python.org/3', None, ), - 'aiohttp': ('https://docs.aiohttp.org/en/stable/', None, ), - 'tenacity': ('https://tenacity.readthedocs.io/en/latest/', None, ), + "python": ( + "https://docs.python.org/3", + None, + ), + "aiohttp": ( + "https://docs.aiohttp.org/en/stable/", + None, + ), + "tenacity": ( + "https://tenacity.readthedocs.io/en/latest/", + None, + ), } autodoc_default_options = { # 'special-members': '__init__,__call__', # 'undoc-members': True, - 'exclude-members': '__weakref__' + "exclude-members": "__weakref__" } add_module_names = False diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..830e253 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,6 @@ +[tool.isort] +profile = "black" +multi_line_output = 3 + +[tool.black] +target-version = ["py38", "py39", "py310", "py311", "py312"] diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..5efe5f2 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,12 @@ +[flake8] +ignore = + # 
Style issues handled by black. + E501, + E203, + W503, + +per-file-ignores = + # F401: Ignore "imported but unused" errors in __init__ files, as those + # imports are there to expose submodule functions so they can be imported + # directly from that module + zyte_api/__init__.py:F401 \ No newline at end of file diff --git a/setup.py b/setup.py index 658699a..7383672 100755 --- a/setup.py +++ b/setup.py @@ -1,48 +1,49 @@ #!/usr/bin/env python import os -from setuptools import setup, find_packages + +from setuptools import find_packages, setup def get_version(): about = {} here = os.path.abspath(os.path.dirname(__file__)) - with open(os.path.join(here, 'zyte_api/__version__.py')) as f: + with open(os.path.join(here, "zyte_api/__version__.py")) as f: exec(f.read(), about) - return about['__version__'] + return about["__version__"] setup( - name='zyte-api', + name="zyte-api", version=get_version(), - description='Python interface to Zyte API', - long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), - long_description_content_type='text/x-rst', - author='Zyte Group Ltd', - author_email='opensource@zyte.com', - url='https://github.com/zytedata/python-zyte-api', - packages=find_packages(exclude=['tests', 'examples']), - entry_points = { - 'console_scripts': ['zyte-api=zyte_api.__main__:_main'], + description="Python interface to Zyte API", + long_description=open("README.rst").read() + "\n\n" + open("CHANGES.rst").read(), + long_description_content_type="text/x-rst", + author="Zyte Group Ltd", + author_email="opensource@zyte.com", + url="https://github.com/zytedata/python-zyte-api", + packages=find_packages(exclude=["tests", "examples"]), + entry_points={ + "console_scripts": ["zyte-api=zyte_api.__main__:_main"], }, install_requires=[ - 'aiohttp >= 3.8.0', - 'attrs', - 'brotli', - 'runstats', - 'tenacity', - 'tqdm', - 'w3lib >= 2.1.1', + "aiohttp >= 3.8.0", + "attrs", + "brotli", + "runstats", + "tenacity", + "tqdm", + "w3lib >= 2.1.1", ], 
classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', - 'Natural Language :: English', - 'Operating System :: OS Independent', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Natural Language :: English", + "Operating System :: OS Independent", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", ], ) diff --git a/tests/mockserver.py b/tests/mockserver.py index ea8fffb..8f4330d 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -4,18 +4,14 @@ import sys import time from base64 import b64encode -from contextlib import asynccontextmanager from importlib import import_module from subprocess import PIPE, Popen -from typing import Any, Dict, Optional +from typing import Any, Dict from urllib.parse import urlparse -from pytest_twisted import ensureDeferred from twisted.internet import reactor -from twisted.internet.defer import Deferred -from twisted.internet.task import deferLater from twisted.web.resource import Resource -from twisted.web.server import NOT_DONE_YET, Site +from twisted.web.server import Site def get_ephemeral_port(): diff --git a/tests/test_async.py b/tests/test_async.py index 0d15df0..eb93054 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -1,10 +1,8 @@ -from types import GeneratorType +import pytest from zyte_api import AsyncZyteAPI from zyte_api.apikey import NoApiKey -import pytest - def test_api_key(): AsyncZyteAPI(api_key="a") @@ -15,8 +13,13 @@ def test_api_key(): @pytest.mark.asyncio async def 
test_get(mockserver): client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) - expected_result = {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="} - actual_result = await client.get({"url": "https://a.example", "httpResponseBody": True}) + expected_result = { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + } + actual_result = await client.get( + {"url": "https://a.example", "httpResponseBody": True} + ) assert actual_result == expected_result @@ -29,9 +32,15 @@ async def test_iter(mockserver): {"url": "https://b.example", "httpResponseBody": True}, ] expected_results = [ - {"url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, Exception, - {"url": "https://b.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg=="}, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, ] actual_results = [] for future in client.iter(queries): diff --git a/tests/test_client.py b/tests/test_client.py index cd4c214..ae34726 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -12,11 +12,11 @@ USER_AGENT, ), ( - f'scrapy-zyte-api/0.11.1 {USER_AGENT}', - f'scrapy-zyte-api/0.11.1 {USER_AGENT}', + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", ), ), ) def test_user_agent(user_agent, expected): - client = AsyncClient(api_key='123', api_url='http:\\test', user_agent=user_agent) + client = AsyncClient(api_key="123", api_url="http:\\test", user_agent=user_agent) assert client.user_agent == expected diff --git a/tests/test_main.py b/tests/test_main.py index 0087774..85573af 100644 --- 
a/tests/test_main.py +++ b/tests/test_main.py @@ -1,7 +1,7 @@ import json import os from json import JSONDecodeError -from unittest.mock import Mock, patch, AsyncMock +from unittest.mock import AsyncMock, Mock, patch import pytest diff --git a/tox.ini b/tox.ini index fca46c5..fec688d 100644 --- a/tox.ini +++ b/tox.ini @@ -31,3 +31,7 @@ deps = basepython = python3 commands = sphinx-build -W -b html . {envtmpdir}/html + +[testenv:pre-commit] +deps = pre-commit +commands = pre-commit run --all-files --show-diff-on-failure diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 3e03e70..b81dc63 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -10,12 +10,9 @@ import tqdm from tenacity import retry_if_exception -from zyte_api.aio.client import ( - create_session, - AsyncClient, -) +from zyte_api.aio.client import AsyncClient, create_session from zyte_api.aio.retry import RetryFactory, _is_throttling_error -from zyte_api.constants import ENV_VARIABLE, API_URL +from zyte_api.constants import API_URL, ENV_VARIABLE from zyte_api.utils import _guess_intype @@ -167,7 +164,7 @@ def _main(program_name="zyte-api"): p.add_argument( "--store-errors", help="when set to true, it includes all types of responses, and when set to false," - " it includes only error-free responses in the output.", + " it includes only error-free responses in the output.", ) args = p.parse_args() logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 5bf52d5..a3a9bd5 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = '0.4.8' +__version__ = "0.4.8" diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 8156bde..6553c5c 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,19 +1,18 @@ import asyncio import time from functools import partial -from typing import Any, Dict, Iterator, List, Optional, TYPE_CHECKING, Union +from typing import 
TYPE_CHECKING, Any, Dict, Iterator, List, Optional import aiohttp from tenacity import AsyncRetrying +from ._utils import _AIO_API_TIMEOUT from .aio.errors import RequestError from .aio.retry import zyte_api_retrying from .apikey import get_apikey -from .constants import API_URL, API_TIMEOUT +from .constants import API_URL from .stats import AggStats, ResponseStats from .utils import USER_AGENT, _process_query -from ._utils import _AIO_API_TIMEOUT, create_session - if TYPE_CHECKING: _ResponseFuture = asyncio.Future[Dict[str, Any]] @@ -22,11 +21,9 @@ def _post_func(session): - """ Return a function to send a POST request """ + """Return a function to send a POST request""" if session is None: - return partial(aiohttp.request, - method='POST', - timeout=_AIO_API_TIMEOUT) + return partial(aiohttp.request, method="POST", timeout=_AIO_API_TIMEOUT) else: return session.post @@ -52,7 +49,7 @@ async def get( self, query: dict, *, - endpoint: str = 'extract', + endpoint: str = "extract", session=None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, @@ -60,10 +57,7 @@ async def get( retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = { - 'User-Agent': self.user_agent, - 'Accept-Encoding': 'br' - } + headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} response_stats = [] start_global = time.perf_counter() @@ -125,7 +119,7 @@ def iter( self, queries: List[dict], *, - endpoint: str = 'extract', + endpoint: str = "extract", session: Optional[aiohttp.ClientSession] = None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, diff --git a/zyte_api/_utils.py b/zyte_api/_utils.py index e547fb5..116206f 100644 --- a/zyte_api/_utils.py +++ b/zyte_api/_utils.py @@ -3,7 +3,6 @@ from .constants import API_TIMEOUT - # 120 seconds is probably too long, but we are concerned about the case with # many concurrent requests and some processing logic running in the same reactor, # thus, 
saturating the CPU. This will make timeouts more likely. @@ -11,9 +10,8 @@ def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: - """ Create a session with parameters suited for Zyte API """ - kwargs.setdefault('timeout', _AIO_API_TIMEOUT) + """Create a session with parameters suited for Zyte API""" + kwargs.setdefault("timeout", _AIO_API_TIMEOUT) if "connector" not in kwargs: - kwargs["connector"] = TCPConnector(limit=connection_pool_size, - force_close=True) + kwargs["connector"] = TCPConnector(limit=connection_pool_size, force_close=True) return aiohttp.ClientSession(**kwargs) diff --git a/zyte_api/aio/__init__.py b/zyte_api/aio/__init__.py index b69b052..9833d38 100644 --- a/zyte_api/aio/__init__.py +++ b/zyte_api/aio/__init__.py @@ -1,3 +1,3 @@ """ Asyncio client for Zyte API -""" \ No newline at end of file +""" diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 68f2530..c2336db 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -4,23 +4,20 @@ import asyncio import time -from functools import partial -from typing import Optional, Iterator, List +from typing import Iterator, List, Optional from warnings import warn import aiohttp -from aiohttp import TCPConnector from tenacity import AsyncRetrying -from .errors import RequestError -from .retry import zyte_api_retrying from .._async import _post_func -from .._utils import _AIO_API_TIMEOUT as AIO_API_TIMEOUT, create_session +from .._utils import create_session # noqa: F401 from ..apikey import get_apikey -from ..constants import API_URL, API_TIMEOUT +from ..constants import API_URL from ..stats import AggStats, ResponseStats from ..utils import USER_AGENT, _process_query - +from .errors import RequestError +from .retry import zyte_api_retrying warn( ( @@ -33,13 +30,15 @@ class AsyncClient: - def __init__(self, *, - api_key=None, - api_url=API_URL, - n_conn=15, - retrying: Optional[AsyncRetrying] = None, - user_agent: Optional[str] = None, - ): + def 
__init__( + self, + *, + api_key=None, + api_url=API_URL, + n_conn=15, + retrying: Optional[AsyncRetrying] = None, + user_agent: Optional[str] = None, + ): self.api_key = get_apikey(api_key) self.api_url = api_url self.n_conn = n_conn @@ -47,19 +46,19 @@ def __init__(self, *, self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT - async def request_raw(self, query: dict, *, - endpoint: str = 'extract', - session=None, - handle_retries=True, - retrying: Optional[AsyncRetrying] = None, - ): + async def request_raw( + self, + query: dict, + *, + endpoint: str = "extract", + session=None, + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ): retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) - headers = { - 'User-Agent': self.user_agent, - 'Accept-Encoding': 'br' - } + headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} response_stats = [] start_global = time.perf_counter() @@ -117,13 +116,14 @@ async def request(): return result - def request_parallel_as_completed(self, - queries: List[dict], - *, - endpoint: str = 'extract', - session: Optional[aiohttp.ClientSession] = None, - ) -> Iterator[asyncio.Future]: - """ Send multiple requests to Zyte API in parallel. + def request_parallel_as_completed( + self, + queries: List[dict], + *, + endpoint: str = "extract", + session: Optional[aiohttp.ClientSession] = None, + ) -> Iterator[asyncio.Future]: + """Send multiple requests to Zyte API in parallel. Return an `asyncio.as_completed` iterator. ``queries`` is a list of requests to process (dicts). 
@@ -136,8 +136,6 @@ def request_parallel_as_completed(self, async def _request(query): async with sem: - return await self.request_raw(query, - endpoint=endpoint, - session=session) + return await self.request_raw(query, endpoint=endpoint, session=session) return asyncio.as_completed([_request(query) for query in queries]) diff --git a/zyte_api/aio/errors.py b/zyte_api/aio/errors.py index 8b3005c..cf12e80 100644 --- a/zyte_api/aio/errors.py +++ b/zyte_api/aio/errors.py @@ -9,10 +9,11 @@ class RequestError(ClientResponseError): - """ Exception which is raised when Request-level error is returned. + """Exception which is raised when Request-level error is returned. In contrast with ClientResponseError, it allows to inspect response content. """ + def __init__(self, *args, **kwargs): self.response_content = kwargs.pop("response_content") self.request_id = kwargs.pop("request_id", None) @@ -25,6 +26,8 @@ def parsed(self): return ParsedError.from_body(self.response_content) def __str__(self): - return f"RequestError: {self.status}, message={self.message}, " \ - f"headers={self.headers}, body={self.response_content}, " \ - f"request_id={self.request_id}" + return ( + f"RequestError: {self.status}, message={self.message}, " + f"headers={self.headers}, body={self.response_content}, " + f"request_id={self.request_id}" + ) diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index 1f8b4fb..39299b5 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -9,22 +9,24 @@ from aiohttp import client_exceptions from tenacity import ( + AsyncRetrying, + RetryCallState, + after_log, + before_log, + before_sleep_log, + retry_base, + retry_if_exception, + stop_after_attempt, + stop_after_delay, wait_chain, wait_fixed, - wait_random_exponential, wait_random, - stop_after_attempt, - stop_after_delay, - retry_if_exception, - RetryCallState, - before_sleep_log, - after_log, AsyncRetrying, before_log, retry_base, + wait_random_exponential, ) from tenacity.stop import 
stop_never from .errors import RequestError - logger = logging.getLogger(__name__) @@ -62,6 +64,7 @@ class RetryFactory: """ Build custom retry configuration """ + retry_condition: retry_base = ( retry_if_exception(_is_throttling_error) | retry_if_exception(_is_network_error) @@ -71,19 +74,18 @@ class RetryFactory: throttling_wait = wait_chain( # always wait 20-40s first wait_fixed(20) + wait_random(0, 20), - # wait 20-40s again wait_fixed(20) + wait_random(0, 20), - # wait from 30 to 630s, with full jitter and exponentially # increasing max wait time - wait_fixed(30) + wait_random_exponential(multiplier=1, max=600) + wait_fixed(30) + wait_random_exponential(multiplier=1, max=600), ) # connection errors, other client and server failures network_error_wait = ( # wait from 3s to ~1m - wait_random(3, 7) + wait_random_exponential(multiplier=1, max=55) + wait_random(3, 7) + + wait_random_exponential(multiplier=1, max=55) ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never diff --git a/zyte_api/apikey.py b/zyte_api/apikey.py index f9b0f80..c1cc70b 100644 --- a/zyte_api/apikey.py +++ b/zyte_api/apikey.py @@ -10,11 +10,13 @@ class NoApiKey(Exception): def get_apikey(key: Optional[str] = None) -> str: - """ Return API key, probably loading it from an environment variable """ + """Return API key, probably loading it from an environment variable""" if key is not None: return key try: return os.environ[ENV_VARIABLE] except KeyError: - raise NoApiKey("API key not found. Please set {} " - "environment variable.".format(ENV_VARIABLE)) + raise NoApiKey( + "API key not found. 
Please set {} " + "environment variable.".format(ENV_VARIABLE) + ) diff --git a/zyte_api/constants.py b/zyte_api/constants.py index 926577d..a433302 100644 --- a/zyte_api/constants.py +++ b/zyte_api/constants.py @@ -1,10 +1,10 @@ # -*- coding: utf-8 -*- # Name of the environment variable with the API key -ENV_VARIABLE = 'ZYTE_API_KEY' +ENV_VARIABLE = "ZYTE_API_KEY" # API URL -API_URL = 'https://api.zyte.com/v1/' +API_URL = "https://api.zyte.com/v1/" # Default timeout that server uses. Client timeouts should be larger than that. API_TIMEOUT = 200 diff --git a/zyte_api/errors.py b/zyte_api/errors.py index b608bf1..8088b54 100644 --- a/zyte_api/errors.py +++ b/zyte_api/errors.py @@ -6,13 +6,14 @@ @attr.s(auto_attribs=True) class ParsedError: - """ Parsed error from Zyte API """ + """Parsed error from Zyte API""" + response_body: bytes data: Optional[dict] parse_error: Optional[str] @classmethod - def from_body(cls, response_body: bytes) -> 'ParsedError': + def from_body(cls, response_body: bytes) -> "ParsedError": data = None parse_error = None @@ -25,12 +26,8 @@ def from_body(cls, response_body: bytes) -> 'ParsedError': except (json.JSONDecodeError, UnicodeDecodeError) as _: # noqa: F841 parse_error = "bad_json" - return cls( - response_body=response_body, - data=data, - parse_error=parse_error - ) + return cls(response_body=response_body, data=data, parse_error=parse_error) @property def type(self) -> Optional[str]: - return (self.data or {}).get('type', None) + return (self.data or {}).get("type", None) diff --git a/zyte_api/stats.py b/zyte_api/stats.py index 1789ee5..42c7b6a 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -1,8 +1,8 @@ # -*- coding: utf-8 -*- -from typing import Optional -from collections import Counter import functools import time +from collections import Counter +from typing import Optional import attr from runstats import Statistics @@ -17,6 +17,7 @@ def wrapper(*args, **kwargs): return meth(*args, **kwargs) except ZeroDivisionError: 
return 0 + return wrapper @@ -26,9 +27,13 @@ def __init__(self): self.time_total_stats = Statistics() self.n_success = 0 # number of successful results returned to the user - self.n_fatal_errors = 0 # number of errors returned to the user, after all retries + self.n_fatal_errors = ( + 0 # number of errors returned to the user, after all retries + ) - self.n_attempts = 0 # total amount of requests made to Zyte API, including retries + self.n_attempts = ( + 0 # total amount of requests made to Zyte API, including retries + ) self.n_429 = 0 # number of 429 (throttling) responses self.n_errors = 0 # number of errors, including errors which were retried @@ -46,25 +51,29 @@ def __str__(self): self.error_ratio(), self.n_success, self.n_processed, - self.success_ratio() + self.success_ratio(), ) def summary(self): return ( - "\n" + - "Summary\n" + - "-------\n" + - "Mean connection time: {:0.2f}\n".format(self.time_connect_stats.mean()) + - "Mean response time: {:0.2f}\n".format(self.time_total_stats.mean()) + - "Throttle ratio: {:0.1%}\n".format(self.throttle_ratio()) + - "Attempts: {}\n".format(self.n_attempts) + - "Errors: {:0.1%}, fatal: {}, non fatal: {}\n".format( + "\n" + + "Summary\n" + + "-------\n" + + "Mean connection time: {:0.2f}\n".format( + self.time_connect_stats.mean() + ) + + "Mean response time: {:0.2f}\n".format(self.time_total_stats.mean()) + + "Throttle ratio: {:0.1%}\n".format(self.throttle_ratio()) + + "Attempts: {}\n".format(self.n_attempts) + + "Errors: {:0.1%}, fatal: {}, non fatal: {}\n".format( self.error_ratio(), self.n_fatal_errors, - self.n_errors - self.n_fatal_errors) + - "Successful URLs: {} of {}\n".format( - self.n_success, self.n_processed) + - "Success ratio: {:0.1%}\n".format(self.success_ratio()) + self.n_errors - self.n_fatal_errors, + ) + + "Successful URLs: {} of {}\n".format( + self.n_success, self.n_processed + ) + + "Success ratio: {:0.1%}\n".format(self.success_ratio()) ) @zero_on_division_error @@ -81,7 +90,7 @@ def 
success_ratio(self): @property def n_processed(self): - """ Total number of processed URLs """ + """Total number of processed URLs""" return self.n_success + self.n_fatal_errors diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 7767a9b..1b24a60 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -5,7 +5,7 @@ from .__version__ import __version__ -USER_AGENT = f'python-zyte-api/{__version__}' +USER_AGENT = f"python-zyte-api/{__version__}" def _guess_intype(file_name, lines): @@ -16,7 +16,7 @@ def _guess_intype(file_name, lines): if extension == "txt": return "txt" - if re.search(r'^\s*\{', lines[0]): + if re.search(r"^\s*\{", lines[0]): return "jl" return "txt" From bf6e9016ba45d77d0459523b172430070ef7bfea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 Mar 2024 09:48:34 +0100 Subject: [PATCH 072/126] Include basic usage examples in the README (#61) --- README.rst | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 2128fe7..703c3ad 100644 --- a/README.rst +++ b/README.rst @@ -38,7 +38,27 @@ Make sure you have an API key for the `Zyte API`_ service. You can set ``ZYTE_API_KEY`` environment variable with the key to avoid passing it around explicitly. -Read the `documentation `_ for more information. + +Basic usage +=========== + +Use the ``zyte-api`` command to send Zyte API requests from the command line: + +.. code-block:: shell + + zyte-api url-list.txt --output results.jsonl + +Or use the Python async API: + +.. code-block:: python + + from zyte_api import AsyncZyteAPI + + client = AsyncZyteAPI() + response = await client.get({"url": url, "httpResponseBody": True}) + +Read the `documentation `_ for more +information. License is BSD 3-clause. 
From 1b5a61b7ed5a27d54936416f9317e6cc6f748e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 19 Mar 2024 13:17:31 +0100 Subject: [PATCH 073/126] Cleanups (#65) --- docs/asyncio_api.rst | 7 +- pytest.ini | 2 +- setup.cfg | 4 +- tests/test_client.py | 4 +- tests/test_main.py | 20 +++--- zyte_api/__init__.py | 2 + zyte_api/__main__.py | 9 +-- zyte_api/_async.py | 4 +- zyte_api/_errors.py | 32 +++++++++ zyte_api/_retry.py | 130 +++++++++++++++++++++++++++++++++++++ zyte_api/aio/__init__.py | 15 +++++ zyte_api/aio/client.py | 10 --- zyte_api/aio/errors.py | 34 +--------- zyte_api/aio/retry.py | 137 +-------------------------------------- 14 files changed, 208 insertions(+), 202 deletions(-) create mode 100644 zyte_api/_errors.py create mode 100644 zyte_api/_retry.py diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index c9a1e4a..a77691e 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -4,8 +4,8 @@ asyncio API =========== -Create an instance of the ``AsyncZyteAPI`` to use the asyncio client API. -You can use the method ``request_raw`` to perform individual requests: +Create an instance of the ``AsyncZyteAPI`` to use the asyncio client API. You +can use the method ``get`` to perform individual requests: .. 
code-block:: python @@ -33,8 +33,7 @@ parallel, using multiple connections: import json import sys - from zyte_api import AsyncZyteAPI, create_session - from zyte_api.aio.errors import RequestError + from zyte_api import AsyncZyteAPI, RequestError, create_session async def extract_from(urls, n_conn): diff --git a/pytest.ini b/pytest.ini index 5260d78..c7cb0a7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,4 +1,4 @@ [pytest] filterwarnings = - ignore:The zyte_api\.aio\.client module is deprecated:DeprecationWarning + ignore:The zyte_api\.aio module is deprecated:DeprecationWarning diff --git a/setup.cfg b/setup.cfg index 5efe5f2..43b4ff1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,4 +9,6 @@ per-file-ignores = # F401: Ignore "imported but unused" errors in __init__ files, as those # imports are there to expose submodule functions so they can be imported # directly from that module - zyte_api/__init__.py:F401 \ No newline at end of file + zyte_api/__init__.py:F401 + zyte_api/aio/errors.py:F401 + zyte_api/aio/retry.py:F401 \ No newline at end of file diff --git a/tests/test_client.py b/tests/test_client.py index ae34726..5886ef2 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -1,6 +1,6 @@ import pytest -from zyte_api.aio.client import AsyncClient +from zyte_api import AsyncZyteAPI from zyte_api.utils import USER_AGENT @@ -18,5 +18,5 @@ ), ) def test_user_agent(user_agent, expected): - client = AsyncClient(api_key="123", api_url="http:\\test", user_agent=user_agent) + client = AsyncZyteAPI(api_key="123", api_url="http:\\test", user_agent=user_agent) assert client.user_agent == expected diff --git a/tests/test_main.py b/tests/test_main.py index 85573af..3960a5a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -91,33 +91,34 @@ async def fake_exception(value=True): ) @pytest.mark.asyncio async def test_run(queries, expected_response, store_errors, exception): - temporary_file = open("temporary_file.jsonl", "w") + tmp_path = 
"temporary_file.jsonl" + temporary_file = open(tmp_path, "w") n_conn = 5 stop_on_errors = False api_url = "https://example.com" api_key = "fake_key" retry_errors = True - # Create a mock for AsyncClient + # Create a mock for AsyncZyteAPI async_client_mock = Mock() - # Create a mock for the request_parallel_as_completed method + # Create a mock for the iter method request_parallel_mock = Mock() - async_client_mock.return_value.request_parallel_as_completed = request_parallel_mock + async_client_mock.return_value.iter = request_parallel_mock - # Patch the AsyncClient class in __main__ with the mock - with patch("zyte_api.__main__.AsyncClient", async_client_mock), patch( + # Patch the AsyncZyteAPI class in __main__ with the mock + with patch("zyte_api.__main__.AsyncZyteAPI", async_client_mock), patch( "zyte_api.__main__.create_session" ) as create_session_mock: # Mock create_session to return an AsyncMock create_session_mock.return_value = AsyncMock() - # Set up the AsyncClient instance to return the mocked iterator - async_client_mock.return_value.request_parallel_as_completed.return_value = [ + # Set up the AsyncZyteAPI instance to return the mocked iterator + async_client_mock.return_value.iter.return_value = [ exception(), ] - # Call the run function with the mocked AsyncClient + # Call the run function with the mocked AsyncZyteAPI await run( queries=queries, out=temporary_file, @@ -130,3 +131,4 @@ async def test_run(queries, expected_response, store_errors, exception): ) assert get_json_content(temporary_file) == expected_response + os.unlink(tmp_path) diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index ed9f58f..5dcf5c4 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -3,4 +3,6 @@ """ from ._async import AsyncZyteAPI +from ._errors import RequestError +from ._retry import RetryFactory, zyte_api_retrying from ._utils import create_session diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index b81dc63..6d8c821 100644 --- 
a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -10,8 +10,9 @@ import tqdm from tenacity import retry_if_exception -from zyte_api.aio.client import AsyncClient, create_session -from zyte_api.aio.retry import RetryFactory, _is_throttling_error +from zyte_api._async import AsyncZyteAPI +from zyte_api._retry import RetryFactory, _is_throttling_error +from zyte_api._utils import create_session from zyte_api.constants import API_URL, ENV_VARIABLE from zyte_api.utils import _guess_intype @@ -43,11 +44,11 @@ def write_output(content): pbar.update() retrying = None if retry_errors else DontRetryErrorsFactory().build() - client = AsyncClient( + client = AsyncZyteAPI( n_conn=n_conn, api_key=api_key, api_url=api_url, retrying=retrying ) async with create_session(connection_pool_size=n_conn) as session: - result_iter = client.request_parallel_as_completed( + result_iter = client.iter( queries=queries, session=session, ) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 6553c5c..9a74fb6 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -6,9 +6,9 @@ import aiohttp from tenacity import AsyncRetrying +from ._errors import RequestError +from ._retry import zyte_api_retrying from ._utils import _AIO_API_TIMEOUT -from .aio.errors import RequestError -from .aio.retry import zyte_api_retrying from .apikey import get_apikey from .constants import API_URL from .stats import AggStats, ResponseStats diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py new file mode 100644 index 0000000..072322d --- /dev/null +++ b/zyte_api/_errors.py @@ -0,0 +1,32 @@ +import logging + +from aiohttp import ClientResponseError + +from zyte_api.errors import ParsedError + +logger = logging.getLogger("zyte_api") + + +class RequestError(ClientResponseError): + """Exception which is raised when Request-level error is returned. + In contrast with ClientResponseError, it allows to inspect response + content. 
+ """ + + def __init__(self, *args, **kwargs): + self.response_content = kwargs.pop("response_content") + self.request_id = kwargs.pop("request_id", None) + if self.request_id is None: + self.request_id = kwargs.get("headers", {}).get("request-id") + super().__init__(*args, **kwargs) + + @property + def parsed(self): + return ParsedError.from_body(self.response_content) + + def __str__(self): + return ( + f"RequestError: {self.status}, message={self.message}, " + f"headers={self.headers}, body={self.response_content}, " + f"request_id={self.request_id}" + ) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py new file mode 100644 index 0000000..d570cf5 --- /dev/null +++ b/zyte_api/_retry.py @@ -0,0 +1,130 @@ +import asyncio +import logging + +from aiohttp import client_exceptions +from tenacity import ( + AsyncRetrying, + RetryCallState, + after_log, + before_log, + before_sleep_log, + retry_base, + retry_if_exception, + stop_after_attempt, + stop_after_delay, + wait_chain, + wait_fixed, + wait_random, + wait_random_exponential, +) +from tenacity.stop import stop_never + +from ._errors import RequestError + +logger = logging.getLogger(__name__) + + +_NETWORK_ERRORS = ( + asyncio.TimeoutError, # could happen while reading the response body + client_exceptions.ClientResponseError, + client_exceptions.ClientOSError, + client_exceptions.ServerConnectionError, + client_exceptions.ServerDisconnectedError, + client_exceptions.ServerTimeoutError, + client_exceptions.ClientPayloadError, + client_exceptions.ClientConnectorSSLError, + client_exceptions.ClientConnectorError, +) + + +def _is_network_error(exc: BaseException) -> bool: + if isinstance(exc, RequestError): + # RequestError is ClientResponseError, which is in the + # _NETWORK_ERRORS list, but it should be handled + # separately. 
+ return False + return isinstance(exc, _NETWORK_ERRORS) + + +def _is_throttling_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status in (429, 503) + + +def _is_temporary_download_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 520 + + +class RetryFactory: + """ + Build custom retry configuration + """ + + retry_condition: retry_base = ( + retry_if_exception(_is_throttling_error) + | retry_if_exception(_is_network_error) + | retry_if_exception(_is_temporary_download_error) + ) + # throttling + throttling_wait = wait_chain( + # always wait 20-40s first + wait_fixed(20) + wait_random(0, 20), + # wait 20-40s again + wait_fixed(20) + wait_random(0, 20), + # wait from 30 to 630s, with full jitter and exponentially + # increasing max wait time + wait_fixed(30) + wait_random_exponential(multiplier=1, max=600), + ) + + # connection errors, other client and server failures + network_error_wait = ( + # wait from 3s to ~1m + wait_random(3, 7) + + wait_random_exponential(multiplier=1, max=55) + ) + temporary_download_error_wait = network_error_wait + throttling_stop = stop_never + network_error_stop = stop_after_delay(15 * 60) + temporary_download_error_stop = stop_after_attempt(4) + + def wait(self, retry_state: RetryCallState) -> float: + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if _is_throttling_error(exc): + return self.throttling_wait(retry_state=retry_state) + elif _is_network_error(exc): + return self.network_error_wait(retry_state=retry_state) + elif _is_temporary_download_error(exc): + return self.temporary_download_error_wait(retry_state=retry_state) + else: + raise RuntimeError("Invalid retry state exception: %s" % exc) + + def stop(self, retry_state: RetryCallState) -> bool: + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, 
"Unexpected empty exception" + if _is_throttling_error(exc): + return self.throttling_stop(retry_state) + elif _is_network_error(exc): + return self.network_error_stop(retry_state) + elif _is_temporary_download_error(exc): + return self.temporary_download_error_stop(retry_state) + else: + raise RuntimeError("Invalid retry state exception: %s" % exc) + + def reraise(self) -> bool: + return True + + def build(self) -> AsyncRetrying: + return AsyncRetrying( + wait=self.wait, + retry=self.retry_condition, + stop=self.stop, + reraise=self.reraise(), + before=before_log(logger, logging.DEBUG), + after=after_log(logger, logging.DEBUG), + before_sleep=before_sleep_log(logger, logging.DEBUG), + ) + + +zyte_api_retrying: AsyncRetrying = RetryFactory().build() diff --git a/zyte_api/aio/__init__.py b/zyte_api/aio/__init__.py index 9833d38..6b8d6f1 100644 --- a/zyte_api/aio/__init__.py +++ b/zyte_api/aio/__init__.py @@ -1,3 +1,18 @@ """ Asyncio client for Zyte API """ + +from warnings import warn + +warn( + ( + "The zyte_api.aio module is deprecated. Replace " + "zyte_api.aio.client.AsyncClient with zyte_api.AsyncZyteAPI (note " + "that method names are different), zyte_api.aio.client.create_session " + "with zyte_api.create_session, zyte_api.aio.errors.RequestError with " + "zyte_api.RequestError, zyte_api.aio.retry.RetryFactory with " + "zyte_api.RetryFactory, and zyte_api.aio.retry.zyte_api_retrying with " + "zyte_api.zyte_api_retrying." + ), + DeprecationWarning, +) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index c2336db..7814f69 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -5,7 +5,6 @@ import asyncio import time from typing import Iterator, List, Optional -from warnings import warn import aiohttp from tenacity import AsyncRetrying @@ -19,15 +18,6 @@ from .errors import RequestError from .retry import zyte_api_retrying -warn( - ( - "The zyte_api.aio.client module is deprecated. 
Replace AsyncClient " - "with zyte_api.AsyncZyteAPI (note that method names are different) " - "and create_session with zyte_api.create_session." - ), - DeprecationWarning, -) - class AsyncClient: def __init__( diff --git a/zyte_api/aio/errors.py b/zyte_api/aio/errors.py index cf12e80..987d8ba 100644 --- a/zyte_api/aio/errors.py +++ b/zyte_api/aio/errors.py @@ -1,33 +1 @@ -# -*- coding: utf-8 -*- -import logging - -from aiohttp import ClientResponseError - -from zyte_api.errors import ParsedError - -logger = logging.getLogger(__name__) - - -class RequestError(ClientResponseError): - """Exception which is raised when Request-level error is returned. - In contrast with ClientResponseError, it allows to inspect response - content. - """ - - def __init__(self, *args, **kwargs): - self.response_content = kwargs.pop("response_content") - self.request_id = kwargs.pop("request_id", None) - if self.request_id is None: - self.request_id = kwargs.get("headers", {}).get("request-id") - super().__init__(*args, **kwargs) - - @property - def parsed(self): - return ParsedError.from_body(self.response_content) - - def __str__(self): - return ( - f"RequestError: {self.status}, message={self.message}, " - f"headers={self.headers}, body={self.response_content}, " - f"request_id={self.request_id}" - ) +from .._errors import RequestError diff --git a/zyte_api/aio/retry.py b/zyte_api/aio/retry.py index 39299b5..5cd22a7 100644 --- a/zyte_api/aio/retry.py +++ b/zyte_api/aio/retry.py @@ -1,136 +1 @@ -# -*- coding: utf-8 -*- -""" -Zyte API retrying logic. - -TODO: Implement retry logic for temparary errors (520) using the proposed retry-after header. 
-""" -import asyncio -import logging - -from aiohttp import client_exceptions -from tenacity import ( - AsyncRetrying, - RetryCallState, - after_log, - before_log, - before_sleep_log, - retry_base, - retry_if_exception, - stop_after_attempt, - stop_after_delay, - wait_chain, - wait_fixed, - wait_random, - wait_random_exponential, -) -from tenacity.stop import stop_never - -from .errors import RequestError - -logger = logging.getLogger(__name__) - - -_NETWORK_ERRORS = ( - asyncio.TimeoutError, # could happen while reading the response body - client_exceptions.ClientResponseError, - client_exceptions.ClientOSError, - client_exceptions.ServerConnectionError, - client_exceptions.ServerDisconnectedError, - client_exceptions.ServerTimeoutError, - client_exceptions.ClientPayloadError, - client_exceptions.ClientConnectorSSLError, - client_exceptions.ClientConnectorError, -) - - -def _is_network_error(exc: BaseException) -> bool: - if isinstance(exc, RequestError): - # RequestError is ClientResponseError, which is in the - # _NETWORK_ERRORS list, but it should be handled - # separately. 
- return False - return isinstance(exc, _NETWORK_ERRORS) - - -def _is_throttling_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status in (429, 503) - - -def _is_temporary_download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 520 - - -class RetryFactory: - """ - Build custom retry configuration - """ - - retry_condition: retry_base = ( - retry_if_exception(_is_throttling_error) - | retry_if_exception(_is_network_error) - | retry_if_exception(_is_temporary_download_error) - ) - # throttling - throttling_wait = wait_chain( - # always wait 20-40s first - wait_fixed(20) + wait_random(0, 20), - # wait 20-40s again - wait_fixed(20) + wait_random(0, 20), - # wait from 30 to 630s, with full jitter and exponentially - # increasing max wait time - wait_fixed(30) + wait_random_exponential(multiplier=1, max=600), - ) - - # connection errors, other client and server failures - network_error_wait = ( - # wait from 3s to ~1m - wait_random(3, 7) - + wait_random_exponential(multiplier=1, max=55) - ) - temporary_download_error_wait = network_error_wait - throttling_stop = stop_never - network_error_stop = stop_after_delay(15 * 60) - temporary_download_error_stop = stop_after_attempt(4) - - def wait(self, retry_state: RetryCallState) -> float: - assert retry_state.outcome, "Unexpected empty outcome" - exc = retry_state.outcome.exception() - assert exc, "Unexpected empty exception" - if _is_throttling_error(exc): - return self.throttling_wait(retry_state=retry_state) - elif _is_network_error(exc): - return self.network_error_wait(retry_state=retry_state) - elif _is_temporary_download_error(exc): - return self.temporary_download_error_wait(retry_state=retry_state) - else: - raise RuntimeError("Invalid retry state exception: %s" % exc) - - def stop(self, retry_state: RetryCallState) -> bool: - assert retry_state.outcome, "Unexpected empty outcome" - exc = retry_state.outcome.exception() - assert exc, 
"Unexpected empty exception" - if _is_throttling_error(exc): - return self.throttling_stop(retry_state) - elif _is_network_error(exc): - return self.network_error_stop(retry_state) - elif _is_temporary_download_error(exc): - return self.temporary_download_error_stop(retry_state) - else: - raise RuntimeError("Invalid retry state exception: %s" % exc) - - def reraise(self) -> bool: - return True - - def build(self) -> AsyncRetrying: - return AsyncRetrying( - wait=self.wait, - retry=self.retry_condition, - stop=self.stop, - reraise=self.reraise(), - before=before_log(logger, logging.DEBUG), - after=after_log(logger, logging.DEBUG), - before_sleep=before_sleep_log(logger, logging.DEBUG), - ) - - -zyte_api_retrying: AsyncRetrying = RetryFactory().build() +from .._retry import RetryFactory, zyte_api_retrying From 533a4177e12b4dda93dfeb853f6215b997d61162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Mar 2024 06:26:18 +0100 Subject: [PATCH 074/126] Add a sync API (#58) --- docs/index.rst | 5 +- docs/sync.rst | 8 +++ tests/test_sync.py | 51 ++++++++++++++++++ zyte_api/__init__.py | 1 + zyte_api/_sync.py | 115 +++++++++++++++++++++++++++++++++++++++++ zyte_api/aio/client.py | 10 +++- 6 files changed, 186 insertions(+), 4 deletions(-) create mode 100644 docs/sync.rst create mode 100644 tests/test_sync.py create mode 100644 zyte_api/_sync.py diff --git a/docs/index.rst b/docs/index.rst index e34d233..fee03f6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,9 +2,7 @@ python-zyte-api =============== -Python client libraries for `Zyte API`_. - -Command-line utility and asyncio-based library are provided by this package. +Command-line client and Python client library for `Zyte API`_. :ref:`license` is BSD 3-clause. @@ -14,6 +12,7 @@ Command-line utility and asyncio-based library are provided by this package. install command_line + sync asyncio_api .. 
toctree:: diff --git a/docs/sync.rst b/docs/sync.rst new file mode 100644 index 0000000..ba34f05 --- /dev/null +++ b/docs/sync.rst @@ -0,0 +1,8 @@ +.. _sync: + +======== +Sync API +======== + +.. autoclass:: zyte_api.ZyteAPI + :members: diff --git a/tests/test_sync.py b/tests/test_sync.py new file mode 100644 index 0000000..e7eb5c3 --- /dev/null +++ b/tests/test_sync.py @@ -0,0 +1,51 @@ +from types import GeneratorType + +import pytest + +from zyte_api import ZyteAPI +from zyte_api.apikey import NoApiKey + + +def test_api_key(): + ZyteAPI(api_key="a") + with pytest.raises(NoApiKey): + ZyteAPI() + + +def test_get(mockserver): + client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + expected_result = { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + } + actual_result = client.get({"url": "https://a.example", "httpResponseBody": True}) + assert actual_result == expected_result + + +def test_iter(mockserver): + client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + expected_results = [ + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + Exception, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + ] + actual_results = client.iter(queries) + assert isinstance(actual_results, GeneratorType) + actual_results_list = list(actual_results) + assert len(actual_results_list) == len(expected_results) + for actual_result in actual_results_list: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py 
index 5dcf5c4..d94af0e 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -5,4 +5,5 @@ from ._async import AsyncZyteAPI from ._errors import RequestError from ._retry import RetryFactory, zyte_api_retrying +from ._sync import ZyteAPI from ._utils import create_session diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py new file mode 100644 index 0000000..288fd9f --- /dev/null +++ b/zyte_api/_sync.py @@ -0,0 +1,115 @@ +import asyncio +from typing import Generator, List, Optional, Union + +from aiohttp import ClientSession +from tenacity import AsyncRetrying + +from ._async import AsyncZyteAPI +from .constants import API_URL + + +class ZyteAPI: + """Synchronous Zyte API client. + + To create an instance, pass your API key: + + .. code-block:: python + + client = ZyteAPI(api_key="YOUR_API_KEY") + + Or :ref:`use an environment variable ` and omit your API key: + + .. code-block:: python + + client = ZyteAPI() + + Use :meth:`get` and :meth:`iter` to send queries to Zyte API. + """ + + def __init__( + self, + *, + api_key=None, + api_url=API_URL, + n_conn=15, + retrying: Optional[AsyncRetrying] = None, + user_agent: Optional[str] = None, + ): + self._async_client = AsyncZyteAPI( + api_key=api_key, + api_url=api_url, + n_conn=n_conn, + retrying=retrying, + user_agent=user_agent, + ) + + def get( + self, + query: dict, + *, + endpoint: str = "extract", + session: Optional[ClientSession] = None, + handle_retries: bool = True, + retrying: Optional[AsyncRetrying] = None, + ) -> dict: + """Send a query to Zyte API and get the result. + + .. 
code-block:: python + + result = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) + """ + return asyncio.run( + self._async_client.get( + query=query, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, + ) + ) + + def iter( + self, + queries: List[dict], + *, + endpoint: str = "extract", + session: Optional[ClientSession] = None, + handle_retries: bool = True, + retrying: Optional[AsyncRetrying] = None, + ) -> Generator[Union[dict, Exception], None, None]: + """Send multiple queries to Zyte API in parallel and iterate over their + results as they come. + + .. code-block:: python + + queries = [ + {"url": "https://books.toscrape.com", "httpResponseBody": True}, + {"url": "https://quotes.toscrape.com", "httpResponseBody": True}, + ] + for result in client.iter(queries): + print(result) + + Results may come an a different order from the original list of + *queries*. You can use echoData_ to attach metadata to queries that you + can later use to restore their original order. + + .. _echoData: https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/echoData + + When exceptions occur, they are also yielded, not raised. 
+ """ + try: + loop = asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + for future in self._async_client.iter( + queries=queries, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, + ): + try: + yield loop.run_until_complete(future) + except Exception as exception: + yield exception diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 7814f69..50027c6 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -112,6 +112,8 @@ def request_parallel_as_completed( *, endpoint: str = "extract", session: Optional[aiohttp.ClientSession] = None, + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, ) -> Iterator[asyncio.Future]: """Send multiple requests to Zyte API in parallel. Return an `asyncio.as_completed` iterator. @@ -126,6 +128,12 @@ def request_parallel_as_completed( async def _request(query): async with sem: - return await self.request_raw(query, endpoint=endpoint, session=session) + return await self.request_raw( + query, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, + ) return asyncio.as_completed([_request(query) for query in queries]) From e14d5564e8f782247779a7652c568e67153b4e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Mar 2024 06:32:02 +0100 Subject: [PATCH 075/126] Simplify the iter example, provide a session-specific example later (#59) --- docs/asyncio_api.rst | 73 ++++++++++++++++++++++++++++++++------------ 1 file changed, 53 insertions(+), 20 deletions(-) diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index a77691e..a2353cf 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -15,11 +15,11 @@ can use the method ``get`` to perform individual requests: client = AsyncZyteAPI(api_key="YOUR_API_KEY") - async def single_request(url): - return await client.get({"url": url, "browserHtml": True}) + async def 
main(): + result = await client.get({"url": "https://toscrape.com", "httpResponseBody": True}) - response = asyncio.run(single_request("https://books.toscrape.com")) + asyncio.run(main()) .. tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment variable instead `. @@ -30,31 +30,64 @@ parallel, using multiple connections: .. code-block:: python import asyncio - import json - import sys from zyte_api import AsyncZyteAPI, RequestError, create_session - async def extract_from(urls, n_conn): - client = AsyncZyteAPI(n_conn=n_conn) - requests = [{"url": url, "browserHtml": True} for url in urls] - async with create_session(n_conn) as session: - res_iter = client.iter(requests, session=session) - for fut in res_iter: - try: - res = await fut - # do something with a result, e.g. - print(json.dumps(res)) - except RequestError as e: - print(e, file=sys.stderr) - raise + async def main(): + client = AsyncZyteAPI(api_key="YOUR_API_KEY") + queries = [ + {"url": "https://toscrape.com", "httpResponseBody": True}, + {"url": "https://books.toscrape.com", "httpResponseBody": True}, + ] + for future in client.iter(queries): + try: + result = await future + except RequestError as e: + ... + + asyncio.run(main()) - urls = ["https://toscrape.com", "https://books.toscrape.com"] - asyncio.run(extract_from(urls, n_conn=15)) ``iter`` yields results as they come, not necessarily in their original order. ``iter`` and ``get`` methods handle throttling (http 429 errors) and network errors, retrying a request in these cases. + +When using ``iter`` or multiple ``get`` calls, consider using a session: + +.. 
code-block:: python + + import asyncio + + from zyte_api import AsyncZyteAPI, create_session + + + async def main(): + client = AsyncZyteAPI(api_key="YOUR_API_KEY") + async with create_session(client.n_conn) as session: + queries = [ + {"url": "https://toscrape.com", "httpResponseBody": True}, + {"url": "https://books.toscrape.com", "httpResponseBody": True}, + ] + for future in client.iter(queries, session=session): + try: + result = await future + except RequestError as e: + ... + + + asyncio.run(main()) + +Sessions improve performance through a pool of reusable connections to the Zyte +API server. + +To send many queries with a concurrency limit, set ``n_conn`` in your client: + +.. code-block:: python + + client = AsyncZyteAPI(n_conn=15) + +Then use ``iter`` to send your queries. ``n_conn`` is not enforced when using +``get`` instead of ``iter``. From 55766c705f88698735c54369cbb605871fa8c1ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Mar 2024 06:54:42 +0100 Subject: [PATCH 076/126] Use a client semaphore (#63) --- docs/asyncio_api.rst | 8 +++--- tests/test_async.py | 23 ++++++++++++++++ zyte_api/_async.py | 62 ++++++++++++++++++++++---------------------- zyte_api/_sync.py | 31 ++++++++++++---------- 4 files changed, 76 insertions(+), 48 deletions(-) diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index a2353cf..44f7f71 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -83,11 +83,11 @@ When using ``iter`` or multiple ``get`` calls, consider using a session: Sessions improve performance through a pool of reusable connections to the Zyte API server. -To send many queries with a concurrency limit, set ``n_conn`` in your client: +To send many queries with a concurrency limit, set ``n_conn`` in your client +(default is ``15``): .. code-block:: python - client = AsyncZyteAPI(n_conn=15) + client = AsyncZyteAPI(n_conn=30) -Then use ``iter`` to send your queries. 
``n_conn`` is not enforced when using -``get`` instead of ``iter``. +``n_conn`` will be enforce across all your ``get`` and ``iter`` calls. diff --git a/tests/test_async.py b/tests/test_async.py index eb93054..91796d4 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -1,3 +1,6 @@ +import asyncio +from unittest.mock import AsyncMock + import pytest from zyte_api import AsyncZyteAPI @@ -55,3 +58,23 @@ async def test_iter(mockserver): assert Exception in expected_results else: assert actual_result in expected_results + + +@pytest.mark.asyncio +async def test_semaphore(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + client._semaphore = AsyncMock(wraps=client._semaphore) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + {"url": "https://c.example", "httpResponseBody": True}, + ] + futures = [ + client.get(queries[0]), + next(iter(client.iter(queries[1:2]))), + client.get(queries[2]), + ] + for future in asyncio.as_completed(futures): + await future + assert client._semaphore.__aenter__.call_count == len(queries) + assert client._semaphore.__aexit__.call_count == len(queries) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 9a74fb6..b199818 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -44,6 +44,7 @@ def __init__( self.agg_stats = AggStats() self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT + self._semaphore = asyncio.Semaphore(n_conn) async def get( self, @@ -74,26 +75,27 @@ async def request(): ) try: - async with post(**post_kwargs) as resp: - stats.record_connected(resp.status, self.agg_stats) - if resp.status >= 400: - content = await resp.read() - resp.release() - stats.record_read() - stats.record_request_error(content, self.agg_stats) - - raise RequestError( - request_info=resp.request_info, - history=resp.history, - status=resp.status, - message=resp.reason, - 
headers=resp.headers, - response_content=content, - ) - - response = await resp.json() - stats.record_read(self.agg_stats) - return response + async with self._semaphore: + async with post(**post_kwargs) as resp: + stats.record_connected(resp.status, self.agg_stats) + if resp.status >= 400: + content = await resp.read() + resp.release() + stats.record_read() + stats.record_request_error(content, self.agg_stats) + + raise RequestError( + request_info=resp.request_info, + history=resp.history, + status=resp.status, + message=resp.reason, + headers=resp.headers, + response_content=content, + ) + + response = await resp.json() + stats.record_read(self.agg_stats) + return response except Exception as e: if not isinstance(e, RequestError): self.agg_stats.n_errors += 1 @@ -137,16 +139,14 @@ def iter( Set the session TCPConnector limit to a value greater than the number of connections. """ - sem = asyncio.Semaphore(self.n_conn) - - async def _request(query): - async with sem: - return await self.get( - query, - endpoint=endpoint, - session=session, - handle_retries=handle_retries, - retrying=retrying, - ) + + def _request(query): + return self.get( + query, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, + ) return asyncio.as_completed([_request(query) for query in queries]) diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index 288fd9f..0d87eb4 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -8,6 +8,15 @@ from .constants import API_URL +def _get_loop(): + try: + return asyncio.get_event_loop() + except RuntimeError: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + return loop + + class ZyteAPI: """Synchronous Zyte API client. 
@@ -58,15 +67,15 @@ def get( result = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) """ - return asyncio.run( - self._async_client.get( - query=query, - endpoint=endpoint, - session=session, - handle_retries=handle_retries, - retrying=retrying, - ) + loop = _get_loop() + future = self._async_client.get( + query=query, + endpoint=endpoint, + session=session, + handle_retries=handle_retries, + retrying=retrying, ) + return loop.run_until_complete(future) def iter( self, @@ -97,11 +106,7 @@ def iter( When exceptions occur, they are also yielded, not raised. """ - try: - loop = asyncio.get_event_loop() - except RuntimeError: - loop = asyncio.new_event_loop() - asyncio.set_event_loop(loop) + loop = _get_loop() for future in self._async_client.iter( queries=queries, endpoint=endpoint, From b0d1ee76083d829012260a54a9aaaa19eacf6573 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 20 Mar 2024 22:36:46 +0100 Subject: [PATCH 077/126] Implement AsyncZyteAPI.session (#62) --- docs/asyncio_api.rst | 8 ++--- tests/test_async.py | 67 ++++++++++++++++++++++++++++++++++++++++ tests/test_client.py | 22 ------------- tests/test_sync.py | 59 +++++++++++++++++++++++++++++++++++ zyte_api/__init__.py | 1 - zyte_api/_async.py | 62 ++++++++++++++++++++++++++++++++++++- zyte_api/_sync.py | 62 +++++++++++++++++++++++++++++++++++++ zyte_api/_utils.py | 12 +++++++ zyte_api/aio/__init__.py | 4 +-- 9 files changed, 267 insertions(+), 30 deletions(-) delete mode 100644 tests/test_client.py diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst index 44f7f71..071ccfe 100644 --- a/docs/asyncio_api.rst +++ b/docs/asyncio_api.rst @@ -31,7 +31,7 @@ parallel, using multiple connections: import asyncio - from zyte_api import AsyncZyteAPI, RequestError, create_session + from zyte_api import AsyncZyteAPI, RequestError async def main(): @@ -61,17 +61,17 @@ When using ``iter`` or multiple ``get`` calls, consider using a session: import asyncio - from 
zyte_api import AsyncZyteAPI, create_session + from zyte_api import AsyncZyteAPI, RequestError async def main(): client = AsyncZyteAPI(api_key="YOUR_API_KEY") - async with create_session(client.n_conn) as session: + async with client.session() as session: queries = [ {"url": "https://toscrape.com", "httpResponseBody": True}, {"url": "https://books.toscrape.com", "httpResponseBody": True}, ] - for future in client.iter(queries, session=session): + for future in session.iter(queries): try: result = await future except RequestError as e: diff --git a/tests/test_async.py b/tests/test_async.py index 91796d4..4847945 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -5,6 +5,25 @@ from zyte_api import AsyncZyteAPI from zyte_api.apikey import NoApiKey +from zyte_api.utils import USER_AGENT + + +@pytest.mark.parametrize( + "user_agent,expected", + ( + ( + None, + USER_AGENT, + ), + ( + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", + f"scrapy-zyte-api/0.11.1 {USER_AGENT}", + ), + ), +) +def test_user_agent(user_agent, expected): + client = AsyncZyteAPI(api_key="123", api_url="http:\\test", user_agent=user_agent) + assert client.user_agent == expected def test_api_key(): @@ -78,3 +97,51 @@ async def test_semaphore(mockserver): await future assert client._semaphore.__aenter__.call_count == len(queries) assert client._semaphore.__aexit__.call_count == len(queries) + + +@pytest.mark.asyncio +async def test_session(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + expected_results = [ + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + Exception, + { + "url": "https://b.example", + "httpResponseBody": 
"PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + ] + actual_results = [] + async with client.session() as session: + assert session._context.connector.limit == client.n_conn + actual_results.append(await session.get(queries[0])) + for future in session.iter(queries[1:]): + try: + result = await future + except Exception as e: + result = e + actual_results.append(result) + aiohttp_session = session._context + assert not aiohttp_session.closed + assert aiohttp_session.closed + assert session._context is None + + with pytest.raises(RuntimeError): + await session.get(queries[0]) + + with pytest.raises(RuntimeError): + session.iter(queries[1:]) + + assert len(actual_results) == len(expected_results) + for actual_result in actual_results: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results diff --git a/tests/test_client.py b/tests/test_client.py deleted file mode 100644 index 5886ef2..0000000 --- a/tests/test_client.py +++ /dev/null @@ -1,22 +0,0 @@ -import pytest - -from zyte_api import AsyncZyteAPI -from zyte_api.utils import USER_AGENT - - -@pytest.mark.parametrize( - "user_agent,expected", - ( - ( - None, - USER_AGENT, - ), - ( - f"scrapy-zyte-api/0.11.1 {USER_AGENT}", - f"scrapy-zyte-api/0.11.1 {USER_AGENT}", - ), - ), -) -def test_user_agent(user_agent, expected): - client = AsyncZyteAPI(api_key="123", api_url="http:\\test", user_agent=user_agent) - assert client.user_agent == expected diff --git a/tests/test_sync.py b/tests/test_sync.py index e7eb5c3..6012d13 100644 --- a/tests/test_sync.py +++ b/tests/test_sync.py @@ -1,4 +1,5 @@ from types import GeneratorType +from unittest.mock import AsyncMock import pytest @@ -49,3 +50,61 @@ def test_iter(mockserver): assert Exception in expected_results else: assert actual_result in expected_results + + +def test_semaphore(mockserver): + client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + 
client._async_client._semaphore = AsyncMock(wraps=client._async_client._semaphore) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + {"url": "https://c.example", "httpResponseBody": True}, + ] + client.get(queries[0]) + next(iter(client.iter(queries[1:2]))) + client.get(queries[2]) + assert client._async_client._semaphore.__aenter__.call_count == len(queries) + assert client._async_client._semaphore.__aexit__.call_count == len(queries) + + +def test_session(mockserver): + client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + expected_results = [ + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + Exception, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + ] + actual_results = [] + with client.session() as session: + assert session._context.connector.limit == client._async_client.n_conn + actual_results.append(session.get(queries[0])) + for result in session.iter(queries[1:]): + actual_results.append(result) + aiohttp_session = session._context + assert not aiohttp_session.closed + assert aiohttp_session.closed + assert session._context is None + + with pytest.raises(RuntimeError): + session.get(queries[0]) + + with pytest.raises(RuntimeError): + session.iter(queries[1:]) + + assert len(actual_results) == len(expected_results) + for actual_result in actual_results: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index d94af0e..8c37dc3 100644 --- a/zyte_api/__init__.py +++ 
b/zyte_api/__init__.py @@ -6,4 +6,3 @@ from ._errors import RequestError from ._retry import RetryFactory, zyte_api_retrying from ._sync import ZyteAPI -from ._utils import create_session diff --git a/zyte_api/_async.py b/zyte_api/_async.py index b199818..57a661e 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -8,7 +8,7 @@ from ._errors import RequestError from ._retry import zyte_api_retrying -from ._utils import _AIO_API_TIMEOUT +from ._utils import _AIO_API_TIMEOUT, create_session from .apikey import get_apikey from .constants import API_URL from .stats import AggStats, ResponseStats @@ -28,6 +28,63 @@ def _post_func(session): return session.post +class _AsyncSession: + def __init__(self, client, **session_kwargs): + self._client = client + self._session = create_session(client.n_conn, **session_kwargs) + self._context = None + + async def __aenter__(self): + self._context = await self._session.__aenter__() + return self + + async def __aexit__(self, *exc_info): + result = await self._context.__aexit__(*exc_info) + self._context = None + return result + + def _check_context(self): + if self._context is None: + raise RuntimeError( + "Attempt to use session method on a session either not opened " + "or already closed." 
+ ) + + async def get( + self, + query: dict, + *, + endpoint: str = "extract", + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ): + self._check_context() + return await self._client.get( + query=query, + endpoint=endpoint, + handle_retries=handle_retries, + retrying=retrying, + session=self._context, + ) + + def iter( + self, + queries: List[dict], + *, + endpoint: str = "extract", + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ) -> Iterator[asyncio.Future]: + self._check_context() + return self._client.iter( + queries=queries, + endpoint=endpoint, + session=self._context, + handle_retries=handle_retries, + retrying=retrying, + ) + + class AsyncZyteAPI: def __init__( self, @@ -150,3 +207,6 @@ def _request(query): ) return asyncio.as_completed([_request(query) for query in queries]) + + def session(self, **kwargs): + return _AsyncSession(client=self, **kwargs) diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index 0d87eb4..749cd3e 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -17,6 +17,65 @@ def _get_loop(): return loop +class _Session: + def __init__(self, client, **session_kwargs): + self._client = client + self._session = client._async_client.session(**session_kwargs) + self._context = None + + def __enter__(self): + loop = _get_loop() + self._context = loop.run_until_complete(self._session.__aenter__())._context + return self + + def __exit__(self, *exc_info): + loop = _get_loop() + result = loop.run_until_complete(self._context.__aexit__(*exc_info)) + self._context = None + return result + + def _check_context(self): + if self._context is None: + raise RuntimeError( + "Attempt to use session method on a session either not opened " + "or already closed." 
+ ) + + def get( + self, + query: dict, + *, + endpoint: str = "extract", + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ): + self._check_context() + return self._client.get( + query=query, + endpoint=endpoint, + handle_retries=handle_retries, + retrying=retrying, + session=self._context, + ) + + def iter( + self, + queries: List[dict], + *, + endpoint: str = "extract", + handle_retries=True, + retrying: Optional[AsyncRetrying] = None, + ) -> Generator[Union[dict, Exception], None, None]: + self._check_context() + return self._client.iter( + queries=queries, + endpoint=endpoint, + session=self._context, + handle_retries=handle_retries, + retrying=retrying, + ) + + class ZyteAPI: """Synchronous Zyte API client. @@ -118,3 +177,6 @@ def iter( yield loop.run_until_complete(future) except Exception as exception: yield exception + + def session(self, **kwargs): + return _Session(client=self, **kwargs) diff --git a/zyte_api/_utils.py b/zyte_api/_utils.py index 116206f..86727bb 100644 --- a/zyte_api/_utils.py +++ b/zyte_api/_utils.py @@ -1,3 +1,5 @@ +from warnings import warn + import aiohttp from aiohttp import TCPConnector @@ -9,6 +11,16 @@ _AIO_API_TIMEOUT = aiohttp.ClientTimeout(total=API_TIMEOUT + 120) +def deprecated_create_session( + connection_pool_size=100, **kwargs +) -> aiohttp.ClientSession: + warn( + "zyte_api.create_session is deprecated, use AsyncZyteAPI.session instead.", + DeprecationWarning, + ) + return create_session(connection_pool_size=connection_pool_size, **kwargs) + + def create_session(connection_pool_size=100, **kwargs) -> aiohttp.ClientSession: """Create a session with parameters suited for Zyte API""" kwargs.setdefault("timeout", _AIO_API_TIMEOUT) diff --git a/zyte_api/aio/__init__.py b/zyte_api/aio/__init__.py index 6b8d6f1..16a6133 100644 --- a/zyte_api/aio/__init__.py +++ b/zyte_api/aio/__init__.py @@ -9,8 +9,8 @@ "The zyte_api.aio module is deprecated. 
Replace " "zyte_api.aio.client.AsyncClient with zyte_api.AsyncZyteAPI (note " "that method names are different), zyte_api.aio.client.create_session " - "with zyte_api.create_session, zyte_api.aio.errors.RequestError with " - "zyte_api.RequestError, zyte_api.aio.retry.RetryFactory with " + "with zyte_api.AsyncZyteAPI.session, zyte_api.aio.errors.RequestError " + "with zyte_api.RequestError, zyte_api.aio.retry.RetryFactory with " "zyte_api.RetryFactory, and zyte_api.aio.retry.zyte_api_retrying with " "zyte_api.zyte_api_retrying." ), From f36cab95489b5766fd60f97b2739a6c86ca590fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 22 Mar 2024 18:13:56 +0100 Subject: [PATCH 078/126] Complete test coverage (#66) --- .coveragerc | 6 + tests/mockserver.py | 77 +++++++++- tests/test_async.py | 322 ++++++++++++++++++++++++++++++++++++++--- tests/test_main.py | 144 +++++++++++++++++- tests/test_utils.py | 26 ++++ tox.ini | 3 +- zyte_api/__main__.py | 19 ++- zyte_api/_errors.py | 4 +- zyte_api/_retry.py | 16 +- zyte_api/_sync.py | 2 +- zyte_api/_utils.py | 5 +- zyte_api/aio/client.py | 142 +----------------- 12 files changed, 586 insertions(+), 180 deletions(-) diff --git a/.coveragerc b/.coveragerc index 101904c..4fdc383 100644 --- a/.coveragerc +++ b/.coveragerc @@ -1,2 +1,8 @@ [run] branch = true + +[report] +# https://github.com/nedbat/coveragepy/issues/831#issuecomment-517778185 +exclude_lines = + pragma: no cover + if TYPE_CHECKING: diff --git a/tests/mockserver.py b/tests/mockserver.py index 8f4330d..40b1d6e 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -10,8 +10,19 @@ from urllib.parse import urlparse from twisted.internet import reactor +from twisted.internet.task import deferLater from twisted.web.resource import Resource -from twisted.web.server import Site +from twisted.web.server import NOT_DONE_YET, Site + + +# https://github.com/scrapy/scrapy/blob/02b97f98e74a994ad3e4d74e7ed55207e508a576/tests/mockserver.py#L27C1-L33C19 
+def getarg(request, name, default=None, type=None): + if name in request.args: + value = request.args[name][0] + if type is not None: + value = type(value) + return value + return default def get_ephemeral_port(): @@ -20,6 +31,37 @@ def get_ephemeral_port(): return s.getsockname()[1] +class DropResource(Resource): + isLeaf = True + + def deferRequest(self, request, delay, f, *a, **kw): + def _cancelrequest(_): + # silence CancelledError + d.addErrback(lambda _: None) + d.cancel() + + d = deferLater(reactor, delay, f, *a, **kw) + request.notifyFinish().addErrback(_cancelrequest) + return d + + def render_POST(self, request): + request.setHeader(b"Content-Length", b"1024") + self.deferRequest(request, 0, self._delayedRender, request) + return NOT_DONE_YET + + def _delayedRender(self, request): + abort = getarg(request, b"abort", 0, type=int) + request.write(b"this connection will be dropped\n") + tr = request.channel.transport + try: + if abort and hasattr(tr, "abortConnection"): + tr.abortConnection() + else: + tr.loseConnection() + finally: + request.finish() + + class DefaultResource(Resource): request_count = 0 @@ -40,6 +82,18 @@ def render_POST(self, request): url = request_data["url"] domain = urlparse(url).netloc + if domain == "e429.example": + request.setResponseCode(429) + response_data = {"status": 429, "type": "/limits/over-user-limit"} + return json.dumps(response_data).encode() + if domain == "e520.example": + request.setResponseCode(520) + response_data = {"status": 520, "type": "/download/temporary-error"} + return json.dumps(response_data).encode() + if domain == "e521.example": + request.setResponseCode(521) + response_data = {"status": 521, "type": "/download/internal-error"} + return json.dumps(response_data).encode() if domain == "exception.example": request.setResponseCode(401) response_data = { @@ -49,15 +103,30 @@ def render_POST(self, request): "detail": "The authentication key is not valid or can't be matched.", } return 
json.dumps(response_data).encode() + if domain == "empty-body-exception.example": + request.setResponseCode(500) + return b"" + if domain == "nonjson.example": + request.setResponseCode(200) + return b"foo" + if domain == "nonjson-exception.example": + request.setResponseCode(500) + return b"foo" + if domain == "array-exception.example": + request.setResponseCode(500) + return b'["foo"]' response_data: Dict[str, Any] = { "url": url, } - assert "httpResponseBody" in request_data html = "Hello

World!

" - body = b64encode(html.encode()).decode() - response_data["httpResponseBody"] = body + if "httpResponseBody" in request_data: + body = b64encode(html.encode()).decode() + response_data["httpResponseBody"] = body + else: + assert "browserHtml" in request_data + response_data["browserHtml"] = html return json.dumps(response_data).encode() diff --git a/tests/test_async.py b/tests/test_async.py index 4847945..eb3b327 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -2,12 +2,25 @@ from unittest.mock import AsyncMock import pytest +from tenacity import AsyncRetrying -from zyte_api import AsyncZyteAPI +from zyte_api import AsyncZyteAPI, RequestError +from zyte_api._retry import RetryFactory +from zyte_api.aio.client import AsyncClient from zyte_api.apikey import NoApiKey +from zyte_api.errors import ParsedError from zyte_api.utils import USER_AGENT +from .mockserver import DropResource, MockServer + +@pytest.mark.parametrize( + ("client_cls",), + ( + (AsyncZyteAPI,), + (AsyncClient,), + ), +) @pytest.mark.parametrize( "user_agent,expected", ( @@ -21,33 +34,175 @@ ), ), ) -def test_user_agent(user_agent, expected): - client = AsyncZyteAPI(api_key="123", api_url="http:\\test", user_agent=user_agent) +def test_user_agent(client_cls, user_agent, expected): + client = client_cls(api_key="123", api_url="http:\\test", user_agent=user_agent) assert client.user_agent == expected -def test_api_key(): - AsyncZyteAPI(api_key="a") +@pytest.mark.parametrize( + ("client_cls",), + ( + (AsyncZyteAPI,), + (AsyncClient,), + ), +) +def test_api_key(client_cls): + client_cls(api_key="a") with pytest.raises(NoApiKey): - AsyncZyteAPI() + client_cls() +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) @pytest.mark.asyncio -async def test_get(mockserver): - client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) +async def test_get(client_cls, get_method, mockserver): + client = 
client_cls(api_key="a", api_url=mockserver.urljoin("/")) expected_result = { "url": "https://a.example", "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", } - actual_result = await client.get( + actual_result = await getattr(client, get_method)( {"url": "https://a.example", "httpResponseBody": True} ) assert actual_result == expected_result +UNSET = object() + + +class OutlierException(RuntimeError): + pass + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.parametrize( + ("value", "exception"), + ( + (UNSET, OutlierException), + (True, OutlierException), + (False, RequestError), + ), +) @pytest.mark.asyncio -async def test_iter(mockserver): - client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) +async def test_get_handle_retries(client_cls, get_method, value, exception, mockserver): + kwargs = {} + if value is not UNSET: + kwargs["handle_retries"] = value + + def broken_stop(_): + raise OutlierException + + retrying = AsyncRetrying(stop=broken_stop) + client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) + with pytest.raises(exception): + await getattr(client, get_method)( + {"url": "https://exception.example", "browserHtml": True}, + **kwargs, + ) + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_get_request_error(client_cls, get_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) + with pytest.raises(RequestError) as request_error_info: + await getattr(client, get_method)( + {"url": "https://exception.example", "browserHtml": True}, + ) + parsed_error = request_error_info.value.parsed + assert isinstance(parsed_error, ParsedError) + assert parsed_error.data == { + "detail": "The authentication key is not valid or can't be matched.", 
+ "status": 401, + "title": "Authentication Key Not Found", + "type": "/auth/key-not-found", + } + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_get_request_error_empty_body(client_cls, get_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) + with pytest.raises(RequestError) as request_error_info: + await getattr(client, get_method)( + {"url": "https://empty-body-exception.example", "browserHtml": True}, + ) + parsed_error = request_error_info.value.parsed + assert isinstance(parsed_error, ParsedError) + assert parsed_error.data is None + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_get_request_error_non_json(client_cls, get_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) + with pytest.raises(RequestError) as request_error_info: + await getattr(client, get_method)( + {"url": "https://nonjson-exception.example", "browserHtml": True}, + ) + parsed_error = request_error_info.value.parsed + assert isinstance(parsed_error, ParsedError) + assert parsed_error.data is None + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_get_request_error_unexpected_json(client_cls, get_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) + with pytest.raises(RequestError) as request_error_info: + await getattr(client, get_method)( + {"url": "https://array-exception.example", "browserHtml": True}, + ) + parsed_error = request_error_info.value.parsed + assert isinstance(parsed_error, ParsedError) + assert parsed_error.data is None + + +@pytest.mark.parametrize( + ("client_cls", "iter_method"), + ( + (AsyncZyteAPI, 
"iter"), + (AsyncClient, "request_parallel_as_completed"), + ), +) +@pytest.mark.asyncio +async def test_iter(client_cls, iter_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) queries = [ {"url": "https://a.example", "httpResponseBody": True}, {"url": "https://exception.example", "httpResponseBody": True}, @@ -65,7 +220,7 @@ async def test_iter(mockserver): }, ] actual_results = [] - for future in client.iter(queries): + for future in getattr(client, iter_method)(queries): try: actual_result = await future except Exception as exception: @@ -79,9 +234,142 @@ async def test_iter(mockserver): assert actual_result in expected_results +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.parametrize( + ("subdomain", "waiter"), + ( + ("e429", "throttling"), + ("e520", "temporary_download_error"), + ), +) @pytest.mark.asyncio -async def test_semaphore(mockserver): - client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) +async def test_retry_wait(client_cls, get_method, subdomain, waiter, mockserver): + def broken_wait(self, retry_state): + raise OutlierException + + class CustomRetryFactory(RetryFactory): + pass + + setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) + + retrying = CustomRetryFactory().build() + client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) + with pytest.raises(OutlierException): + await getattr(client, get_method)( + {"url": f"https://{subdomain}.example", "browserHtml": True}, + ) + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_retry_wait_network_error(client_cls, get_method): + waiter = "network_error" + + def broken_wait(self, retry_state): + raise OutlierException + + class CustomRetryFactory(RetryFactory): + pass + + setattr(CustomRetryFactory, 
f"{waiter}_wait", broken_wait) + + retrying = CustomRetryFactory().build() + with MockServer(resource=DropResource) as mockserver: + client = client_cls( + api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying + ) + with pytest.raises(OutlierException): + await getattr(client, get_method)( + {"url": "https://example.com", "browserHtml": True}, + ) + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.parametrize( + ("subdomain", "stopper"), + ( + ("e429", "throttling"), + ("e520", "temporary_download_error"), + ), +) +@pytest.mark.asyncio +async def test_retry_stop(client_cls, get_method, subdomain, stopper, mockserver): + def broken_stop(self, retry_state): + raise OutlierException + + class CustomRetryFactory(RetryFactory): + def wait(self, retry_state): + return None + + setattr(CustomRetryFactory, f"{stopper}_stop", broken_stop) + + retrying = CustomRetryFactory().build() + client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) + with pytest.raises(OutlierException): + await getattr(client, get_method)( + {"url": f"https://{subdomain}.example", "browserHtml": True}, + ) + + +@pytest.mark.parametrize( + ("client_cls", "get_method"), + ( + (AsyncZyteAPI, "get"), + (AsyncClient, "request_raw"), + ), +) +@pytest.mark.asyncio +async def test_retry_stop_network_error(client_cls, get_method): + stopper = "network_error" + + def broken_stop(self, retry_state): + raise OutlierException + + class CustomRetryFactory(RetryFactory): + def wait(self, retry_state): + return None + + setattr(CustomRetryFactory, f"{stopper}_stop", broken_stop) + + retrying = CustomRetryFactory().build() + with MockServer(resource=DropResource) as mockserver: + client = client_cls( + api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying + ) + with pytest.raises(OutlierException): + await getattr(client, get_method)( + {"url": "https://example.com", 
"browserHtml": True}, + ) + + +@pytest.mark.parametrize( + ("client_cls", "get_method", "iter_method"), + ( + (AsyncZyteAPI, "get", "iter"), + (AsyncClient, "request_raw", "request_parallel_as_completed"), + ), +) +@pytest.mark.asyncio +async def test_semaphore(client_cls, get_method, iter_method, mockserver): + client = client_cls(api_key="a", api_url=mockserver.urljoin("/")) client._semaphore = AsyncMock(wraps=client._semaphore) queries = [ {"url": "https://a.example", "httpResponseBody": True}, @@ -89,9 +377,9 @@ async def test_semaphore(mockserver): {"url": "https://c.example", "httpResponseBody": True}, ] futures = [ - client.get(queries[0]), - next(iter(client.iter(queries[1:2]))), - client.get(queries[2]), + getattr(client, get_method)(queries[0]), + next(iter(getattr(client, iter_method)(queries[1:2]))), + getattr(client, get_method)(queries[2]), ] for future in asyncio.as_completed(futures): await future diff --git a/tests/test_main.py b/tests/test_main.py index 3960a5a..b18c25f 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -1,14 +1,17 @@ import json import os +import subprocess from json import JSONDecodeError +from tempfile import NamedTemporaryFile from unittest.mock import AsyncMock, Mock, patch import pytest from zyte_api.__main__ import run +from zyte_api.aio.errors import RequestError -class RequestError(Exception): +class MockRequestError(Exception): @property def parsed(self): mock = Mock( @@ -50,7 +53,7 @@ def forbidden_domain_response(): async def fake_exception(value=True): # Simulating an error condition if value: - raise RequestError() + raise MockRequestError() create_session_mock = AsyncMock() return await create_session_mock.coroutine() @@ -94,7 +97,6 @@ async def test_run(queries, expected_response, store_errors, exception): tmp_path = "temporary_file.jsonl" temporary_file = open(tmp_path, "w") n_conn = 5 - stop_on_errors = False api_url = "https://example.com" api_key = "fake_key" retry_errors = True @@ -123,7 +125,6 @@ 
async def test_run(queries, expected_response, store_errors, exception): queries=queries, out=temporary_file, n_conn=n_conn, - stop_on_errors=stop_on_errors, api_url=api_url, api_key=api_key, retry_errors=retry_errors, @@ -132,3 +133,138 @@ async def test_run(queries, expected_response, store_errors, exception): assert get_json_content(temporary_file) == expected_response os.unlink(tmp_path) + + +@pytest.mark.asyncio +async def test_run_stop_on_errors_false(mockserver): + queries = [{"url": "https://exception.example", "httpResponseBody": True}] + with NamedTemporaryFile("w") as output_file: + with pytest.warns( + DeprecationWarning, match=r"^The stop_on_errors parameter is deprecated\.$" + ): + await run( + queries=queries, + out=output_file, + n_conn=1, + api_url=mockserver.urljoin("/"), + api_key="a", + stop_on_errors=False, + ) + + +@pytest.mark.asyncio +async def test_run_stop_on_errors_true(mockserver): + queries = [{"url": "https://exception.example", "httpResponseBody": True}] + with NamedTemporaryFile("w") as output_file: + with pytest.warns( + DeprecationWarning, match=r"^The stop_on_errors parameter is deprecated\.$" + ): + with pytest.raises(RequestError): + await run( + queries=queries, + out=output_file, + n_conn=1, + api_url=mockserver.urljoin("/"), + api_key="a", + stop_on_errors=True, + ) + + +def _run(*, input, mockserver, cli_params=None): + cli_params = cli_params or tuple() + with NamedTemporaryFile("w") as url_list: + url_list.write(input) + url_list.flush() + # Note: Using “python -m zyte_api” instead of “zyte-api” enables + # coverage tracking to work. 
+ result = subprocess.run( + [ + "python", + "-m", + "zyte_api", + "--api-key", + "a", + "--api-url", + mockserver.urljoin("/"), + url_list.name, + *cli_params, + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + return result + + +def test_empty_input(mockserver): + result = _run(input="", mockserver=mockserver) + assert result.returncode + assert result.stdout == b"" + assert result.stderr == b"No input queries found. Is the input file empty?\n" + + +def test_intype_txt_implicit(mockserver): + result = _run(input="https://a.example", mockserver=mockserver) + assert not result.returncode + assert ( + result.stdout + == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' + ) + + +def test_intype_txt_explicit(mockserver): + result = _run( + input="https://a.example", mockserver=mockserver, cli_params=["--intype", "txt"] + ) + assert not result.returncode + assert ( + result.stdout + == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' + ) + + +def test_intype_jsonl_implicit(mockserver): + result = _run( + input='{"url": "https://a.example", "browserHtml": true}', mockserver=mockserver + ) + assert not result.returncode + assert ( + result.stdout + == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' + ) + + +def test_intype_jsonl_explicit(mockserver): + result = _run( + input='{"url": "https://a.example", "browserHtml": true}', + mockserver=mockserver, + cli_params=["--intype", "jl"], + ) + assert not result.returncode + assert ( + result.stdout + == b'{"url": "https://a.example", "browserHtml": "Hello

World!

"}\n' + ) + + +@pytest.mark.flaky(reruns=16) +def test_limit_and_shuffle(mockserver): + result = _run( + input="https://a.example\nhttps://b.example", + mockserver=mockserver, + cli_params=["--limit", "1", "--shuffle"], + ) + assert not result.returncode + assert ( + result.stdout + == b'{"url": "https://b.example", "browserHtml": "Hello

World!

"}\n' + ) + + +def test_run_non_json_response(mockserver): + result = _run( + input="https://nonjson.example", + mockserver=mockserver, + ) + assert not result.returncode + assert result.stdout == b"" + assert b"json.decoder.JSONDecodeError" in result.stderr diff --git a/tests/test_utils.py b/tests/test_utils.py index 03efd60..912ed41 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -1,9 +1,20 @@ import pytest +from aiohttp import TCPConnector from pytest import raises +from zyte_api._utils import create_session from zyte_api.utils import _guess_intype, _process_query +@pytest.mark.asyncio +async def test_create_session_custom_connector(): + # Declare a connector with a random parameter to avoid it matching the + # default one. + custom_connector = TCPConnector(limit=1850) + session = create_session(connector=custom_connector) + assert session.connector == custom_connector + + @pytest.mark.parametrize( "file_name,first_line,expected", ( @@ -85,6 +96,11 @@ def test_guess_intype(file_name, first_line, expected): {"url": "https://example.com#a"}, {"url": "https://example.com#a"}, ), + # If no URL is passed, nothing is done. + ( + {"a": "b"}, + {"a": "b"}, + ), # NOTE: We use w3lib.url.safe_url_string for escaping. Tests covering # the URL escaping logic exist upstream. 
), @@ -96,3 +112,13 @@ def test_process_query(input, output): def test_process_query_bytes(): with raises(ValueError): _process_query({"url": b"https://example.com"}) + + +def test_deprecated_create_session(): + from zyte_api.aio.client import create_session as _create_session + + with pytest.warns( + DeprecationWarning, + match=r"^zyte_api\.aio\.client\.create_session is deprecated", + ): + _create_session() diff --git a/tox.ini b/tox.ini index fec688d..cb2b6c2 100644 --- a/tox.ini +++ b/tox.ini @@ -6,13 +6,14 @@ deps = pytest pytest-asyncio pytest-cov + pytest-rerunfailures pytest-twisted responses twisted commands = py.test \ - --cov-report=term --cov-report=html --cov-report=xml --cov=zyte_api \ + --cov-report=term-missing --cov-report=html --cov-report=xml --cov=zyte_api \ --doctest-modules \ {posargs:zyte_api tests} diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 6d8c821..dc36330 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -6,6 +6,7 @@ import logging import random import sys +from warnings import warn import tqdm from tenacity import retry_if_exception @@ -31,12 +32,21 @@ async def run( out, *, n_conn, - stop_on_errors, + stop_on_errors=_UNSET, api_url, api_key=None, retry_errors=True, store_errors=None, ): + if stop_on_errors is not _UNSET: + warn( + "The stop_on_errors parameter is deprecated.", + DeprecationWarning, + stacklevel=2, + ) + else: + stop_on_errors = False + def write_output(content): json.dump(content, out, ensure_ascii=False) out.write("\n") @@ -83,6 +93,8 @@ def write_output(content): def read_input(input_fp, intype): assert intype in {"txt", "jl", _UNSET} lines = input_fp.readlines() + if not lines: + return [] if intype is _UNSET: intype = _guess_intype(input_fp.name, lines) if intype == "txt": @@ -171,6 +183,10 @@ def _main(program_name="zyte-api"): logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) queries = read_input(args.input, args.intype) + if not queries: + print("No 
input queries found. Is the input file empty?", file=sys.stderr) + sys.exit(-1) + if args.shuffle: random.shuffle(queries) if args.limit: @@ -186,7 +202,6 @@ def _main(program_name="zyte-api"): queries, out=args.output, n_conn=args.n_conn, - stop_on_errors=False, api_url=args.api_url, api_key=args.api_key, retry_errors=not args.dont_retry_errors, diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index 072322d..d445f67 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -15,9 +15,7 @@ class RequestError(ClientResponseError): def __init__(self, *args, **kwargs): self.response_content = kwargs.pop("response_content") - self.request_id = kwargs.pop("request_id", None) - if self.request_id is None: - self.request_id = kwargs.get("headers", {}).get("request-id") + self.request_id = kwargs.get("headers", {}).get("request-id") super().__init__(*args, **kwargs) @property diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index d570cf5..6da3649 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -92,12 +92,10 @@ def wait(self, retry_state: RetryCallState) -> float: assert exc, "Unexpected empty exception" if _is_throttling_error(exc): return self.throttling_wait(retry_state=retry_state) - elif _is_network_error(exc): + if _is_network_error(exc): return self.network_error_wait(retry_state=retry_state) - elif _is_temporary_download_error(exc): - return self.temporary_download_error_wait(retry_state=retry_state) - else: - raise RuntimeError("Invalid retry state exception: %s" % exc) + assert _is_temporary_download_error(exc) # See retry_condition + return self.temporary_download_error_wait(retry_state=retry_state) def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" @@ -105,12 +103,10 @@ def stop(self, retry_state: RetryCallState) -> bool: assert exc, "Unexpected empty exception" if _is_throttling_error(exc): return self.throttling_stop(retry_state) - elif _is_network_error(exc): + if 
_is_network_error(exc): return self.network_error_stop(retry_state) - elif _is_temporary_download_error(exc): - return self.temporary_download_error_stop(retry_state) - else: - raise RuntimeError("Invalid retry state exception: %s" % exc) + assert _is_temporary_download_error(exc) # See retry_condition + return self.temporary_download_error_stop(retry_state) def reraise(self) -> bool: return True diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index 749cd3e..c9029b9 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -11,7 +11,7 @@ def _get_loop(): try: return asyncio.get_event_loop() - except RuntimeError: + except RuntimeError: # pragma: no cover (tests always have a running loop) loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) return loop diff --git a/zyte_api/_utils.py b/zyte_api/_utils.py index 86727bb..e090724 100644 --- a/zyte_api/_utils.py +++ b/zyte_api/_utils.py @@ -15,7 +15,10 @@ def deprecated_create_session( connection_pool_size=100, **kwargs ) -> aiohttp.ClientSession: warn( - "zyte_api.create_session is deprecated, use AsyncZyteAPI.session instead.", + ( + "zyte_api.aio.client.create_session is deprecated, use " + "ZyteAPI.session or AsyncZyteAPI.session instead." 
+ ), DeprecationWarning, ) return create_session(connection_pool_size=connection_pool_size, **kwargs) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 50027c6..208cb38 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -1,139 +1,7 @@ -""" -Asyncio client for Zyte API -""" +from .._async import AsyncZyteAPI +from .._utils import deprecated_create_session as create_session # noqa: F401 -import asyncio -import time -from typing import Iterator, List, Optional -import aiohttp -from tenacity import AsyncRetrying - -from .._async import _post_func -from .._utils import create_session # noqa: F401 -from ..apikey import get_apikey -from ..constants import API_URL -from ..stats import AggStats, ResponseStats -from ..utils import USER_AGENT, _process_query -from .errors import RequestError -from .retry import zyte_api_retrying - - -class AsyncClient: - def __init__( - self, - *, - api_key=None, - api_url=API_URL, - n_conn=15, - retrying: Optional[AsyncRetrying] = None, - user_agent: Optional[str] = None, - ): - self.api_key = get_apikey(api_key) - self.api_url = api_url - self.n_conn = n_conn - self.agg_stats = AggStats() - self.retrying = retrying or zyte_api_retrying - self.user_agent = user_agent or USER_AGENT - - async def request_raw( - self, - query: dict, - *, - endpoint: str = "extract", - session=None, - handle_retries=True, - retrying: Optional[AsyncRetrying] = None, - ): - retrying = retrying or self.retrying - post = _post_func(session) - auth = aiohttp.BasicAuth(self.api_key) - headers = {"User-Agent": self.user_agent, "Accept-Encoding": "br"} - - response_stats = [] - start_global = time.perf_counter() - - async def request(): - stats = ResponseStats.create(start_global) - self.agg_stats.n_attempts += 1 - - post_kwargs = dict( - url=self.api_url + endpoint, - json=_process_query(query), - auth=auth, - headers=headers, - ) - - try: - async with post(**post_kwargs) as resp: - stats.record_connected(resp.status, self.agg_stats) - 
if resp.status >= 400: - content = await resp.read() - resp.release() - stats.record_read() - stats.record_request_error(content, self.agg_stats) - - raise RequestError( - request_info=resp.request_info, - history=resp.history, - status=resp.status, - message=resp.reason, - headers=resp.headers, - response_content=content, - ) - - response = await resp.json() - stats.record_read(self.agg_stats) - return response - except Exception as e: - if not isinstance(e, RequestError): - self.agg_stats.n_errors += 1 - stats.record_exception(e, agg_stats=self.agg_stats) - raise - finally: - response_stats.append(stats) - - if handle_retries: - request = retrying.wraps(request) - - try: - # Try to make a request - result = await request() - self.agg_stats.n_success += 1 - except Exception: - self.agg_stats.n_fatal_errors += 1 - raise - - return result - - def request_parallel_as_completed( - self, - queries: List[dict], - *, - endpoint: str = "extract", - session: Optional[aiohttp.ClientSession] = None, - handle_retries=True, - retrying: Optional[AsyncRetrying] = None, - ) -> Iterator[asyncio.Future]: - """Send multiple requests to Zyte API in parallel. - Return an `asyncio.as_completed` iterator. - - ``queries`` is a list of requests to process (dicts). - - ``session`` is an optional aiohttp.ClientSession object. - Set the session TCPConnector limit to a value greater than - the number of connections. 
- """ - sem = asyncio.Semaphore(self.n_conn) - - async def _request(query): - async with sem: - return await self.request_raw( - query, - endpoint=endpoint, - session=session, - handle_retries=handle_retries, - retrying=retrying, - ) - - return asyncio.as_completed([_request(query) for query in queries]) +class AsyncClient(AsyncZyteAPI): + request_raw = AsyncZyteAPI.get + request_parallel_as_completed = AsyncZyteAPI.iter From 914d5bebb4db199a05cb2f2eceb36596e2d135af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 12:20:26 +0200 Subject: [PATCH 079/126] Refactor docs after recent changes (#67) --- README.rst | 89 +++++++--- docs/_ext/__init__.py | 43 +++++ docs/_templates/custom-class-template.rst | 35 ---- docs/_templates/custom-module-template.rst | 70 -------- docs/api_reference.rst | 13 -- docs/asyncio_api.rst | 93 ----------- docs/command_line.rst | 98 ----------- docs/conf.py | 12 +- docs/index.rst | 31 ++-- docs/install.rst | 22 --- docs/intro/basic.rst | 9 + docs/intro/install.rst | 9 + docs/license.rst | 7 - docs/ref/api.rst | 39 +++++ docs/ref/cli.rst | 13 ++ docs/requirements.txt | 3 +- docs/sync.rst | 8 - docs/use/api.rst | 181 +++++++++++++++++++++ docs/use/cli.rst | 114 +++++++++++++ docs/use/key.rst | 49 ++++++ tests/test_retry.py | 7 + zyte_api/__init__.py | 8 +- zyte_api/__main__.py | 80 +++++---- zyte_api/_async.py | 29 ++-- zyte_api/_errors.py | 15 +- zyte_api/_retry.py | 57 ++++++- zyte_api/_sync.py | 79 ++++++--- zyte_api/errors.py | 15 +- 28 files changed, 763 insertions(+), 465 deletions(-) create mode 100644 docs/_ext/__init__.py delete mode 100644 docs/_templates/custom-class-template.rst delete mode 100644 docs/_templates/custom-module-template.rst delete mode 100644 docs/api_reference.rst delete mode 100644 docs/asyncio_api.rst delete mode 100644 docs/command_line.rst delete mode 100644 docs/install.rst create mode 100644 docs/intro/basic.rst create mode 100644 docs/intro/install.rst delete mode 100644 
docs/license.rst create mode 100644 docs/ref/api.rst create mode 100644 docs/ref/cli.rst delete mode 100644 docs/sync.rst create mode 100644 docs/use/api.rst create mode 100644 docs/use/cli.rst create mode 100644 docs/use/key.rst create mode 100644 tests/test_retry.py diff --git a/README.rst b/README.rst index 703c3ad..4035e57 100644 --- a/README.rst +++ b/README.rst @@ -18,52 +18,101 @@ python-zyte-api :target: https://codecov.io/gh/zytedata/zyte-api :alt: Coverage report -Python client libraries for `Zyte API`_. +.. description-start -Command-line utility and asyncio-based library are provided by this package. +Command-line client and Python client library for `Zyte API`_. + +.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html + +.. description-end Installation ============ -:: - - pip install zyte-api +.. install-start -``zyte-api`` requires Python 3.8+. +.. code-block:: shell -API key -======= + pip install zyte-api -Make sure you have an API key for the `Zyte API`_ service. -You can set ``ZYTE_API_KEY`` environment -variable with the key to avoid passing it around explicitly. +.. note:: Python 3.8+ is required. +.. install-end Basic usage =========== -Use the ``zyte-api`` command to send Zyte API requests from the command line: +.. basic-start + +Set your API key +---------------- + +.. key-get-start + +After you `sign up for a Zyte API account +`_, copy `your API key +`_. + +.. key-get-end + + +Use the command-line client +--------------------------- + +Then you can use the zyte-api command-line client to send Zyte API requests. +First create a text file with a list of URLs: + +.. code-block:: none + + https://books.toscrape.com + https://quotes.toscrape.com + +And then call ``zyte-api`` from your shell: .. 
code-block:: shell - zyte-api url-list.txt --output results.jsonl + zyte-api url-list.txt --api-key YOUR_API_KEY --output results.jsonl -Or use the Python async API: + +Use the Python sync API +----------------------- + +For very basic Python scripts, use the sync API: + +.. code-block:: python + + from zyte_api import ZyteAPI + + client = ZyteAPI(api_key="YOUR_API_KEY") + response = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) + + +Use the Python async API +------------------------ + +For asyncio code, use the async API: .. code-block:: python + import asyncio + from zyte_api import AsyncZyteAPI - client = AsyncZyteAPI() - response = await client.get({"url": url, "httpResponseBody": True}) + + async def main(): + client = AsyncZyteAPI(api_key="YOUR_API_KEY") + response = await client.get( + {"url": "https://toscrape.com", "httpResponseBody": True} + ) + + + asyncio.run(main()) + +.. basic-end Read the `documentation `_ for more information. -License is BSD 3-clause. - * Documentation: https://python-zyte-api.readthedocs.io * Source code: https://github.com/zytedata/python-zyte-api * Issue tracker: https://github.com/zytedata/python-zyte-api/issues - -.. 
_Zyte API: https://docs.zyte.com/zyte-api/get-started.html diff --git a/docs/_ext/__init__.py b/docs/_ext/__init__.py new file mode 100644 index 0000000..ee080d2 --- /dev/null +++ b/docs/_ext/__init__.py @@ -0,0 +1,43 @@ +import re + +from docutils import nodes +from docutils.parsers.rst.roles import set_classes + + +def http_api_reference_role( + name, rawtext, text, lineno, inliner, options={}, content=[] +): + match = re.search( + r"(?s)^(.+?)\s*<\s*((?:request|response):[a-zA-Z.]+)\s*>\s*$", text + ) + if match: + display_text = match[1] + reference = match[2] + else: + display_text = None + reference = text + if reference.startswith("request:"): + request_or_response = "request" + elif reference.startswith("response:"): + request_or_response = "response/200" + else: + raise ValueError( + f":http: directive reference must start with request: or " + f"response:, got {reference} from {text!r}." + ) + + field = reference.split(":", maxsplit=1)[1] + if not display_text: + display_text = field + refuri = ( + f"https://docs.zyte.com/zyte-api/usage/reference.html" + f"#operation/extract/{request_or_response}/{field}" + ) + set_classes(options) + node = nodes.reference(rawtext, display_text, refuri=refuri, **options) + return [node], [] + + +def setup(app): + # https://github.com/scrapy-plugins/scrapy-zyte-api/blob/2bfb2bef2e43293a62f47781914331bc4fa08f06/docs/_ext/__init__.py#L42 + app.add_role("http", http_api_reference_role) diff --git a/docs/_templates/custom-class-template.rst b/docs/_templates/custom-class-template.rst deleted file mode 100644 index b30e1a0..0000000 --- a/docs/_templates/custom-class-template.rst +++ /dev/null @@ -1,35 +0,0 @@ -.. - Template based in the original one, with some changes - proposed on https://stackoverflow.com/a/62613202/3887420 - -{{ fullname | escape | underline}} - -.. currentmodule:: {{ module }} - -.. autoclass:: {{ objname }} - :members: - :show-inheritance: - - {% block methods %} - .. 
automethod:: __init__ - - {% if methods %} - .. rubric:: {{ _('Methods') }} - - .. autosummary:: - {% for item in methods %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block attributes %} - {% if attributes %} - .. rubric:: {{ _('Attributes') }} - - .. autosummary:: - {% for item in attributes %} - ~{{ name }}.{{ item }} - {%- endfor %} - {% endif %} - {% endblock %} diff --git a/docs/_templates/custom-module-template.rst b/docs/_templates/custom-module-template.rst deleted file mode 100644 index f4d4155..0000000 --- a/docs/_templates/custom-module-template.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. - Template based in the original one, with some changes - proposed on https://stackoverflow.com/a/62613202/3887420 - -{{ fullname | escape | underline}} - -.. automodule:: {{ fullname }} - - {% block attributes %} - {% if attributes %} - .. rubric:: {{ _('Module Attributes') }} - - .. autosummary:: - :toctree: - {% for item in attributes %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block functions %} - {% if functions %} - .. rubric:: {{ _('Functions') }} - - .. autosummary:: - :toctree: - {% for item in functions %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block classes %} - {% if classes %} - .. rubric:: {{ _('Classes') }} - - .. autosummary:: - :toctree: - :template: custom-class-template.rst - {% for item in classes %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - - {% block exceptions %} - {% if exceptions %} - .. rubric:: {{ _('Exceptions') }} - - .. autosummary:: - :toctree: - {% for item in exceptions %} - {{ item }} - {%- endfor %} - {% endif %} - {% endblock %} - -{% block modules %} -{% if modules %} -.. rubric:: Modules - -.. 
autosummary:: - :toctree: - :template: custom-module-template.rst - :recursive: -{% for item in modules %} - {{ item }} -{%- endfor %} -{% endif %} -{% endblock %} diff --git a/docs/api_reference.rst b/docs/api_reference.rst deleted file mode 100644 index 3b18395..0000000 --- a/docs/api_reference.rst +++ /dev/null @@ -1,13 +0,0 @@ -============= -API Reference -============= - -.. - Based on ideas found on https://stackoverflow.com/a/62613202/3887420 - -.. autosummary:: - :toctree: _autosummary - :template: custom-module-template.rst - :recursive: - - zyte_api diff --git a/docs/asyncio_api.rst b/docs/asyncio_api.rst deleted file mode 100644 index 071ccfe..0000000 --- a/docs/asyncio_api.rst +++ /dev/null @@ -1,93 +0,0 @@ -.. _`asyncio_api`: - -=========== -asyncio API -=========== - -Create an instance of the ``AsyncZyteAPI`` to use the asyncio client API. You -can use the method ``get`` to perform individual requests: - -.. code-block:: python - - import asyncio - from zyte_api import AsyncZyteAPI - - client = AsyncZyteAPI(api_key="YOUR_API_KEY") - - - async def main(): - result = await client.get({"url": "https://toscrape.com", "httpResponseBody": True}) - - - asyncio.run(main()) - -.. tip:: You can skip the ``api_key`` parameter if you :ref:`use an environment - variable instead `. - -There is also an ``iter`` method, which allows to process many URLs in -parallel, using multiple connections: - -.. code-block:: python - - import asyncio - - from zyte_api import AsyncZyteAPI, RequestError - - - async def main(): - client = AsyncZyteAPI(api_key="YOUR_API_KEY") - queries = [ - {"url": "https://toscrape.com", "httpResponseBody": True}, - {"url": "https://books.toscrape.com", "httpResponseBody": True}, - ] - for future in client.iter(queries): - try: - result = await future - except RequestError as e: - ... - - - asyncio.run(main()) - - -``iter`` yields results as they come, not necessarily in their original order. 
- -``iter`` and ``get`` methods handle throttling (http 429 errors) and network -errors, retrying a request in these cases. - -When using ``iter`` or multiple ``get`` calls, consider using a session: - -.. code-block:: python - - import asyncio - - from zyte_api import AsyncZyteAPI, RequestError - - - async def main(): - client = AsyncZyteAPI(api_key="YOUR_API_KEY") - async with client.session() as session: - queries = [ - {"url": "https://toscrape.com", "httpResponseBody": True}, - {"url": "https://books.toscrape.com", "httpResponseBody": True}, - ] - for future in session.iter(queries): - try: - result = await future - except RequestError as e: - ... - - - asyncio.run(main()) - -Sessions improve performance through a pool of reusable connections to the Zyte -API server. - -To send many queries with a concurrency limit, set ``n_conn`` in your client -(default is ``15``): - -.. code-block:: python - - client = AsyncZyteAPI(n_conn=30) - -``n_conn`` will be enforce across all your ``get`` and ``iter`` calls. diff --git a/docs/command_line.rst b/docs/command_line.rst deleted file mode 100644 index c8cd39b..0000000 --- a/docs/command_line.rst +++ /dev/null @@ -1,98 +0,0 @@ -.. _`command_line`: - -====================== -Command-line interface -====================== - -The most basic way to use the client is from a command line. - -First, create a file with urls, an URL per line (e.g. ``urls.txt``). - -Second, set ``ZYTE_API_KEY`` env variable with your -API key (you can also pass API key as ``--api-key`` script -argument). - -Then run a script, to get the results: - -.. code-block:: shell - - zyte-api urls.txt --output res.jsonl - -.. note:: You may use ``python -m zyte_api`` instead of ``zyte-api``. - -Requests to get browser HTML from those input URLs will be sent to Zyte API, -using up to 20 parallel connections, and the API responses will be stored in -the ``res.jsonl`` `JSON Lines`_ file, 1 response per line. - -.. 
_JSON Lines: https://jsonlines.org/ - -The results may be stored in an order which is different from the input order. -If you need to match the output results to the input URLs, the best way is to -use the ``echoData`` field (see below); it is passed through, and returned -as-is in the ``echoData`` attribute. By default it will contain the input URL -the content belongs to. - -If you need more flexibility, you can customize the requests by creating -a JSON Lines file with queries: a JSON object per line. You can pass any -`Zyte API`_ options there. For example, you could create the following -``requests.jsonl`` file: - -.. code-block:: json - - {"url": "https://example.com", "browserHtml": true, "geolocation": "GB", "echoData": "homepage"} - {"url": "https://example.com/foo", "browserHtml": true, "javascript": false} - {"url": "https://example.com/bar", "browserHtml": true, "geolocation": "US"} - -See `API docs`_ for a description of all supported parameters. - -.. _API docs: https://docs.zyte.com/zyte-api/openapi.html -.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html - -To get results for this ``requests.jsonl`` file, run: - -.. code-block:: shell - - zyte-api requests.jsonl --output res.jsonl - -Processing speed -~~~~~~~~~~~~~~~~ - -Each API key has a limit on RPS. To get your URLs processed faster you can -increase the number concurrent connections. - -Best options depend on the RPS limit and on websites you're extracting -data from. For example, if your API key has a limit of 3RPS, and average -response time you observe for your websites is 10s, then to get to these -3RPS you may set the number of concurrent connections to 30. - -To set these options in the CLI, use the ``--n-conn`` argument: - -.. code-block:: shell - - zyte-api urls.txt --n-conn 30 --output res.jsonl - -If too many requests are being processed in parallel, you'll be getting -throttling errors. 
They are handled by CLI automatically, but they make -extraction less efficient; please tune the concurrency options to -not hit the throttling errors (HTTP 429) often. - -You may be also limited by the website speed. The Zyte API tries not to hit any -individual website too hard, but it could be better to limit this on a client -side as well. If you're extracting data from a single website, it could make -sense to decrease the amount of parallel requests; it can ensure higher success -ratio overall. - -If you're extracting data from multiple websites, it makes sense to spread the -load across time: if you have websites A, B and C, don't send requests in -AAAABBBBCCCC order, send them in ABCABCABCABC order instead. - -To do so, you can change the order of the queries in your input file. -Alternatively, you can pass ``--shuffle`` options; it randomly shuffles -input queries before sending them to the API: - -.. code-block:: shell - - zyte-api urls.txt --shuffle --output res.jsonl - -Run ``zyte-api --help`` to get description of all supported -options. diff --git a/docs/conf.py b/docs/conf.py index 5002ec5..de0bb8f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -12,6 +12,7 @@ # import os import sys +from pathlib import Path import sphinx_rtd_theme @@ -39,17 +40,19 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. +sys.path.insert(0, str(Path(__file__).parent.absolute())) # _ext extensions = [ + "_ext", "sphinx.ext.autodoc", "sphinx.ext.intersphinx", "sphinx.ext.ifconfig", "sphinx.ext.viewcode", "sphinx.ext.githubpages", - "sphinx.ext.autosummary", + "sphinxarg.ext", ] # Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] +# templates_path = ["_templates"] # The suffix(es) of source filenames. 
# You can specify multiple suffix as a list of string: @@ -208,6 +211,10 @@ "https://tenacity.readthedocs.io/en/latest/", None, ), + "zyte": ( + "https://docs.zyte.com", + None, + ), } autodoc_default_options = { @@ -217,4 +224,3 @@ } add_module_names = False -autosummary_generate = True diff --git a/docs/index.rst b/docs/index.rst index fee03f6..7fe3951 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -2,26 +2,35 @@ python-zyte-api =============== -Command-line client and Python client library for `Zyte API`_. - -:ref:`license` is BSD 3-clause. +.. include:: ../README.rst + :start-after: description-start + :end-before: description-end .. toctree:: :caption: Getting started :maxdepth: 1 - install - command_line - sync - asyncio_api + intro/install + intro/basic + +.. toctree:: + :caption: Usage + :maxdepth: 1 + + use/key + use/cli + use/api + +.. toctree:: + :caption: Reference + :maxdepth: 1 + + ref/cli + ref/api .. toctree:: :caption: All the rest :maxdepth: 1 - api_reference contributing changelog - license - -.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html \ No newline at end of file diff --git a/docs/install.rst b/docs/install.rst deleted file mode 100644 index d034b04..0000000 --- a/docs/install.rst +++ /dev/null @@ -1,22 +0,0 @@ -.. _`install`: - -============ -Installation -============ - -:: - - pip install zyte-api - -``zyte-api`` requires Python 3.8+. - -.. _api-key: - -API key -======= - -Make sure you have an API key for the `Zyte API`_ service. -You can set ``ZYTE_API_KEY`` environment -variable with the key to avoid passing it around explicitly. - -.. _Zyte API: https://docs.zyte.com/zyte-api/get-started.html diff --git a/docs/intro/basic.rst b/docs/intro/basic.rst new file mode 100644 index 0000000..32ff015 --- /dev/null +++ b/docs/intro/basic.rst @@ -0,0 +1,9 @@ +.. _basic: + +=========== +Basic usage +=========== + +.. 
include:: /../README.rst + :start-after: basic-start + :end-before: basic-end diff --git a/docs/intro/install.rst b/docs/intro/install.rst new file mode 100644 index 0000000..be46008 --- /dev/null +++ b/docs/intro/install.rst @@ -0,0 +1,9 @@ +.. _install: + +============ +Installation +============ + +.. include:: /../README.rst + :start-after: install-start + :end-before: install-end diff --git a/docs/license.rst b/docs/license.rst deleted file mode 100644 index e6a41ca..0000000 --- a/docs/license.rst +++ /dev/null @@ -1,7 +0,0 @@ -.. _`license`: - -======= -License -======= - -.. include:: ../LICENSE diff --git a/docs/ref/api.rst b/docs/ref/api.rst new file mode 100644 index 0000000..5a6c4ab --- /dev/null +++ b/docs/ref/api.rst @@ -0,0 +1,39 @@ +.. _api-ref: + +============= +API reference +============= + +.. module:: zyte_api + +Sync API +======== + +.. autoclass:: ZyteAPI + :members: + + +Async API +========= + +.. autoclass:: AsyncZyteAPI + :members: + + +Retries +======= + +.. autodata:: zyte_api_retrying + :no-value: + +.. autoclass:: RetryFactory + + +Errors +====== + +.. autoexception:: RequestError + :members: + +.. autoclass:: ParsedError + :members: diff --git a/docs/ref/cli.rst b/docs/ref/cli.rst new file mode 100644 index 0000000..7afd4f5 --- /dev/null +++ b/docs/ref/cli.rst @@ -0,0 +1,13 @@ +.. _cli-ref: + +============= +CLI reference +============= + +zyte-api +======== + +.. argparse:: + :ref: zyte_api.__main__._get_argument_parser + :prog: zyte-api + :nodefault: diff --git a/docs/requirements.txt b/docs/requirements.txt index 8861680..164cfa9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,4 +1,5 @@ -tenacity aiohttp >= 3.6.0 Sphinx >= 4.2.0 +sphinx-argparse sphinx-rtd-theme >= 0.4 +tenacity diff --git a/docs/sync.rst b/docs/sync.rst deleted file mode 100644 index ba34f05..0000000 --- a/docs/sync.rst +++ /dev/null @@ -1,8 +0,0 @@ -.. _sync: - -======== -Sync API -======== - -.. 
autoclass:: zyte_api.ZyteAPI - :members: diff --git a/docs/use/api.rst b/docs/use/api.rst new file mode 100644 index 0000000..4f1522a --- /dev/null +++ b/docs/use/api.rst @@ -0,0 +1,181 @@ +.. _api: + +.. currentmodule:: zyte_api + +===================== +Python client library +===================== + +Once you have :ref:`installed python-zyte-api ` and :ref:`configured +your API key `, you can use one of its APIs from Python code: + +- The :ref:`sync API ` can be used to build simple, proof-of-concept or + debugging Python scripts. + +- The :ref:`async API ` can be used from :ref:`coroutines + `, and is meant for production usage, as well as for asyncio + environments like `Jupyter notebooks`_. + + .. _Jupyter notebooks: https://jupyter.org/ + +.. _sync: + +Sync API +======== + +Create a :class:`ZyteAPI` object, and use its +:meth:`~ZyteAPI.get` method to perform a single request: + +.. code-block:: python + + from zyte_api import ZyteAPI + + client = ZyteAPI() + result = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) + +To perform multiple requests, use a :meth:`~ZyteAPI.session` for +better performance, and use :meth:`~ZyteAPI.iter` to send multiple +requests in parallel: + +.. code-block:: python + + from zyte_api import ZyteAPI, RequestError + + client = ZyteAPI() + with client.session() as session: + queries = [ + {"url": "https://toscrape.com", "httpResponseBody": True}, + {"url": "https://books.toscrape.com", "httpResponseBody": True}, + ] + for result_or_exception in session.iter(queries): + if isinstance(result_or_exception, dict): + ... + elif isinstance(result_or_exception, RequestError): + ... + else: + assert isinstance(result_or_exception, Exception) + ... + +.. tip:: :meth:`~ZyteAPI.iter` yields results as they come, not + necessarily in their original order. Use :http:`request:echoData` to track + the source request. + +.. 
_asyncio_api: + +Async API +========= + +Create an :class:`AsyncZyteAPI` object, and use its +:meth:`~AsyncZyteAPI.get` method to perform a single request: + +.. code-block:: python + + import asyncio + + from zyte_api import AsyncZyteAPI + + + async def main(): + client = AsyncZyteAPI() + result = await client.get({"url": "https://toscrape.com", "httpResponseBody": True}) + + + asyncio.run(main()) + +To perform multiple requests, use a :meth:`~AsyncZyteAPI.session` for +better performance, and use :meth:`~AsyncZyteAPI.iter` to send +multiple requests in parallel: + +.. code-block:: python + + import asyncio + + from zyte_api import AsyncZyteAPI, RequestError + + + async def main(): + client = AsyncZyteAPI() + async with client.session() as session: + queries = [ + {"url": "https://toscrape.com", "httpResponseBody": True}, + {"url": "https://books.toscrape.com", "httpResponseBody": True}, + ] + for future in session.iter(queries): + try: + result = await future + except RequestError as e: + ... + except Exception as e: + ... + + + asyncio.run(main()) + +.. tip:: :meth:`~AsyncZyteAPI.iter` yields results as they come, not + necessarily in their original order. Use :http:`request:echoData` to track + the source request. + + +.. _api-optimize: + +Optimization +============ + +:class:`ZyteAPI` and :class:`AsyncZyteAPI` use 15 +concurrent connections by default. + +To change that, use the ``n_conn`` parameter when creating your client object: + +.. code-block:: python + + client = ZyteAPI(n_conn=30) + +The number of concurrent connections is enforced across all method calls, +including different sessions of the same client. + +For guidelines on how to choose the optimal value for you, and other +optimization tips, see :ref:`zyte-api-optimize`. + + +Errors and retries +================== + +Methods of :class:`ZyteAPI` and :class:`AsyncZyteAPI` automatically handle +retries for :ref:`rate-limiting ` and :ref:`unsuccessful +` responses, as well as network errors. + +..
_retry-policy: +.. _default-retry-policy: + +The default retry policy, :data:`~zyte_api.zyte_api_retrying`, does the +following: + +- Retries :ref:`rate-limiting responses ` forever. + +- Retries :ref:`unsuccessful responses ` up + to 3 times. + +- Retries network errors for up to 15 minutes. + +All retries are done with an exponential backoff algorithm. + +To customize the retry policy, create your own :class:`~tenacity.AsyncRetrying` +object, e.g. using a custom subclass of :data:`~zyte_api.RetryFactory`, and +pass it when creating your client object: + +.. code-block:: python + + client = ZyteAPI(retrying=custom_retry_policy) + +When retries are exceeded for a given request, an exception is raised. Except +for the :meth:`~ZyteAPI.iter` method of the :ref:`sync API `, which +yields exceptions instead of raising them, to prevent exceptions from +interrupting the entire iteration. + +The type of exception depends on the issue that caused the final request +attempt to fail. Unsuccessful responses trigger a :exc:`RequestError` and +network errors trigger :ref:`aiohttp exceptions `. +Other exceptions could be raised; for example, from a custom retry policy. + + +.. seealso:: :ref:`api-ref` diff --git a/docs/use/cli.rst b/docs/use/cli.rst new file mode 100644 index 0000000..49f4a9a --- /dev/null +++ b/docs/use/cli.rst @@ -0,0 +1,114 @@ +.. _command_line: + +=================== +Command-line client +=================== + +Once you have :ref:`installed python-zyte-api ` and :ref:`configured +your API key `, you can use the ``zyte-api`` command-line client. + +To use ``zyte-api``, pass an :ref:`input file ` as the first +parameter and specify an :ref:`output file ` with ``--output``. +For example: + +.. code-block:: shell + + zyte-api urls.txt --output result.jsonl + +.. _input-file: + +Input file +========== + +The input file can be either of the following: + +- A plain-text file with a list of target URLs, one per line. For example: + + .. 
code-block:: none + + https://books.toscrape.com + https://quotes.toscrape.com + + For each URL, a Zyte API request will be sent with + :http:`request:browserHtml` set to ``True``. + +- A `JSON Lines `_ file with an object of :ref:`Zyte + API request parameters ` per line. For example: + + .. code-block:: json + + {"url": "https://a.example", "browserHtml": true, "geolocation": "GB"} + {"url": "https://b.example", "httpResponseBody": true} + {"url": "https://books.toscrape.com", "productNavigation": true} + + +.. _output-file: + +Output file +=========== + +You can specify the path to an output file with the ``--output``/``-o`` switch. +If not specified, the output is printed on the standard output. + +.. warning:: The output path is overwritten. + +The output file is in `JSON Lines`_ format. Each line contains a JSON object +with a response from Zyte API. + +By default, ``zyte-api`` uses multiple concurrent connections for +:ref:`performance reasons ` and, as a result, the order of +responses will probably not match the order of the source requests from the +:ref:`input file `. If you need to match the output results to the +input requests, the best way is to use :http:`request:echoData`. By default, +``zyte-api`` fills :http:`request:echoData` with the input URL. + + +.. _cli-optimize: + +Optimization +============ + +By default, ``zyte-api`` uses 20 concurrent connections for requests. Use the +``--n-conn`` switch to change that: + +.. code-block:: shell + + zyte-api --n-conn 40 … + +The ``--shuffle`` option can be useful if you target multiple websites and your +:ref:`input file ` is sorted by website, to randomize the request +order and hence distribute the load somewhat evenly: + +.. code-block:: shell + + zyte-api urls.txt --shuffle … + +For guidelines on how to choose the optimal ``--n-conn`` value for you, and +other optimization tips, see :ref:`zyte-api-optimize`.
+ + +Errors and retries +================== + +``zyte-api`` automatically handles retries for :ref:`rate-limiting +` and :ref:`unsuccessful +` responses, as well as network errors, +following the :ref:`default retry policy `. + +Use ``--dont-retry-errors`` to disable the retrying of error responses, and +retry only :ref:`rate-limiting responses `: + +.. code-block:: shell + + zyte-api --dont-retry-errors … + +By default, errors are only logged in the standard error output (``stderr``). +If you want to include error responses in the output file, use +``--store-errors``: + +.. code-block:: shell + + zyte-api --store-errors … + + +.. seealso:: :ref:`cli-ref` diff --git a/docs/use/key.rst b/docs/use/key.rst new file mode 100644 index 0000000..c632895 --- /dev/null +++ b/docs/use/key.rst @@ -0,0 +1,49 @@ +.. _api-key: + +======= +API key +======= + +.. include:: /../README.rst + :start-after: key-get-start + :end-before: key-get-end + +It is recommended to configure your API key through an environment variable, so +that it can be picked up by both the :ref:`command-line client ` and +the :ref:`Python client library `: + +- On Windows: + + .. code-block:: shell + + > set ZYTE_API_KEY=YOUR_API_KEY + +- On macOS and Linux: + + .. code-block:: shell + + $ export ZYTE_API_KEY=YOUR_API_KEY + +Alternatively, you may pass your API key to the clients directly: + +- To pass your API key directly to the command-line client, use the + ``--api-key`` switch: + + .. code-block:: shell + + zyte-api --api-key YOUR_API_KEY … + +- To pass your API key directly to the Python client classes, use the + ``api_key`` parameter when creating a client object: + + .. code-block:: python + + from zyte_api import ZyteAPI + + client = ZyteAPI(api_key="YOUR_API_KEY") + + ..
code-block:: python + + from zyte_api import AsyncZyteAPI + + client = AsyncZyteAPI(api_key="YOUR_API_KEY") diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 0000000..21fc41d --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,7 @@ +def test_deprecated_imports(): + from zyte_api import RetryFactory, zyte_api_retrying + from zyte_api.aio.retry import RetryFactory as DeprecatedRetryFactory + from zyte_api.aio.retry import zyte_api_retrying as deprecated_zyte_api_retrying + + assert RetryFactory is DeprecatedRetryFactory + assert zyte_api_retrying is deprecated_zyte_api_retrying diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 8c37dc3..347c509 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,5 +4,11 @@ from ._async import AsyncZyteAPI from ._errors import RequestError -from ._retry import RetryFactory, zyte_api_retrying +from ._retry import RetryFactory +from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI +from .errors import ParsedError + +# We re-define the variable here for Sphinx to pick the documentation. +#: :ref:`Default retry policy `. +zyte_api_retrying = _zyte_api_retrying diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index dc36330..776f9c1 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -14,7 +14,7 @@ from zyte_api._async import AsyncZyteAPI from zyte_api._retry import RetryFactory, _is_throttling_error from zyte_api._utils import create_session -from zyte_api.constants import API_URL, ENV_VARIABLE +from zyte_api.constants import API_URL from zyte_api.utils import _guess_intype @@ -109,80 +109,94 @@ def read_input(input_fp, intype): return records -def _main(program_name="zyte-api"): - """Process urls from input file through Zyte API""" +def _get_argument_parser(program_name="zyte-api"): p = argparse.ArgumentParser( prog=program_name, - description=""" - Process input URLs from a file using Zyte API. 
- """, + description="Send Zyte API requests.", ) p.add_argument( - "input", + "INPUT", type=argparse.FileType("r", encoding="utf8"), - help="Input file with urls, url per line by default. The " - "Format can be changed using `--intype` argument.", + help=( + "Path to an input file (see 'Command-line client > Input file' in " + "the docs for details)." + ), ) p.add_argument( "--intype", default=_UNSET, choices=["txt", "jl"], - help="Type of the input file. " - "Allowed values are 'txt' (1 URL per line) and 'jl' " - "(JSON Lines file, each object describing the " - "parameters of a request). " - "If not specified, the input type is guessed based on " - "the input file name extension (.jl, .jsonl, .txt) or " - "content, and assumed to be txt if guessing fails.", - ) - p.add_argument( - "--limit", type=int, help="Max number of URLs to take from the input" + help=( + "Type of the input file, either 'txt' (plain text) or 'jl' (JSON " + "Lines).\n" + "\n" + "If not specified, the input type is guessed based on the input " + "file extension ('.jl', '.jsonl', or '.txt'), or in its content, " + "with 'txt' as fallback." + ), ) + p.add_argument("--limit", type=int, help="Maximum number of requests to send.") p.add_argument( "--output", "-o", default=sys.stdout, type=argparse.FileType("w", encoding="utf8"), - help=".jsonlines file to store extracted data. " - "By default, results are printed to stdout.", + help=( + "Path for the output file. Results are written into the output " + "file in JSON Lines format.\n" + "\n" + "If not specified, results are printed to the standard output." + ), ) p.add_argument( "--n-conn", type=int, default=20, - help="number of connections to the API server " "(default: %(default)s)", + help=("Number of concurrent connections to use (default: %(default)s)."), ) p.add_argument( "--api-key", - help="Zyte API key. " - "You can also set %s environment variable instead " - "of using this option." 
% ENV_VARIABLE, + help="Zyte API key.", ) p.add_argument( - "--api-url", help="Zyte API endpoint (default: %(default)s)", default=API_URL + "--api-url", help="Zyte API endpoint (default: %(default)s).", default=API_URL ) p.add_argument( "--loglevel", "-L", default="INFO", choices=["DEBUG", "INFO", "WARNING", "ERROR"], - help="log level (default: %(default)s)", + help="Log level (default: %(default)s).", + ) + p.add_argument( + "--shuffle", + help="Shuffle request order.", + action="store_true", ) - p.add_argument("--shuffle", help="Shuffle input URLs", action="store_true") p.add_argument( "--dont-retry-errors", - help="Don't retry request and network errors", + help="Do not retry unsuccessful responses and network errors, only rate-limiting responses.", action="store_true", ) p.add_argument( "--store-errors", - help="when set to true, it includes all types of responses, and when set to false," - " it includes only error-free responses in the output.", + help=( + "Store error responses in the output file.\n" + "\n" + "If omitted, only successful responses are stored." + ), + action="store_true", ) + return p + + +def _main(program_name="zyte-api"): + """Process urls from input file through Zyte API""" + p = _get_argument_parser(program_name=program_name) args = p.parse_args() logging.basicConfig(stream=sys.stderr, level=getattr(logging, args.loglevel)) - queries = read_input(args.input, args.intype) + queries = read_input(args.INPUT, args.intype) if not queries: print("No input queries found. 
Is the input file empty?", file=sys.stderr) sys.exit(-1) @@ -193,7 +207,7 @@ def _main(program_name="zyte-api"): queries = queries[: args.limit] logger.info( - f"Loaded {len(queries)} urls from {args.input.name}; shuffled: {args.shuffle}" + f"Loaded {len(queries)} urls from {args.INPUT.name}; shuffled: {args.shuffle}" ) logger.info(f"Running Zyte API (connections: {args.n_conn})") diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 57a661e..48e3970 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,5 +1,6 @@ import asyncio import time +from asyncio import Future from functools import partial from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional @@ -15,9 +16,9 @@ from .utils import USER_AGENT, _process_query if TYPE_CHECKING: - _ResponseFuture = asyncio.Future[Dict[str, Any]] + _ResponseFuture = Future[Dict[str, Any]] else: - _ResponseFuture = asyncio.Future # Python 3.8 support + _ResponseFuture = Future # Python 3.8 support def _post_func(session): @@ -74,7 +75,7 @@ def iter( endpoint: str = "extract", handle_retries=True, retrying: Optional[AsyncRetrying] = None, - ) -> Iterator[asyncio.Future]: + ) -> Iterator[Future]: self._check_context() return self._client.iter( queries=queries, @@ -86,6 +87,11 @@ def iter( class AsyncZyteAPI: + """:ref:`Asynchronous Zyte API client `. + + Parameters work the same as for :class:`ZyteAPI`. + """ + def __init__( self, *, @@ -111,7 +117,8 @@ async def get( session=None, handle_retries=True, retrying: Optional[AsyncRetrying] = None, - ): + ) -> _ResponseFuture: + """Asynchronous equivalent to :meth:`ZyteAPI.get`.""" retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) @@ -183,18 +190,10 @@ def iter( handle_retries=True, retrying: Optional[AsyncRetrying] = None, ) -> Iterator[_ResponseFuture]: - """Send multiple requests to Zyte API in parallel, and return an - iterator of futures for responses. 
- - `Responses are iterated in arrival order - `__, - i.e. response order may not match the order in the original query. - - ``queries`` is a list of requests to process (dicts). + """Asynchronous equivalent to :meth:`ZyteAPI.iter`. - ``session`` is an optional aiohttp.ClientSession object. - Set the session TCPConnector limit to a value greater than - the number of connections. + .. note:: Yielded futures, when awaited, do raise their exceptions, + instead of only returning them. """ def _request(query): diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index d445f67..ec5c43a 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -1,4 +1,5 @@ import logging +from typing import Optional from aiohttp import ClientResponseError @@ -8,18 +9,20 @@ class RequestError(ClientResponseError): - """Exception which is raised when Request-level error is returned. - In contrast with ClientResponseError, it allows to inspect response - content. - """ + """Exception raised upon receiving a :ref:`rate-limiting + ` or :ref:`unsuccessful + ` response from Zyte API.""" def __init__(self, *args, **kwargs): - self.response_content = kwargs.pop("response_content") - self.request_id = kwargs.get("headers", {}).get("request-id") + #: Response body. + self.response_content: Optional[bytes] = kwargs.pop("response_content") + #: Request ID. 
+ self.request_id: Optional[str] = kwargs.get("headers", {}).get("request-id") super().__init__(*args, **kwargs) @property def parsed(self): + """Response as a :class:`ParsedError` object.""" return ParsedError.from_body(self.response_content) def __str__(self): diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 6da3649..bd169b6 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -55,8 +55,61 @@ def _is_temporary_download_error(exc: BaseException) -> bool: class RetryFactory: - """ - Build custom retry configuration + """Factory class that builds the :class:`tenacity.AsyncRetrying` object + that defines the :ref:`default retry policy `. + + To create a custom retry policy, you can subclass this factory class, + modify it as needed, and then call :meth:`build` on your subclass to get + the corresponding :class:`tenacity.AsyncRetrying` object. + + For example, to increase the maximum number of attempts for :ref:`temporary + download errors ` from 4 (i.e. 3 + retries) to 10 (i.e. 9 retries): + + .. code-block:: python + + from tenacity import stop_after_attempt + from zyte_api import RetryFactory + + + class CustomRetryFactory(RetryFactory): + temporary_download_error_stop = stop_after_attempt(10) + + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + + To retry :ref:`permanent download errors + `, treating them the same as + :ref:`temporary download errors `: + + .. 
code-block:: python + + from tenacity import RetryCallState, retry_if_exception, stop_after_attempt + from zyte_api import RequestError, RetryFactory + + + def is_permanent_download_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status == 521 + + + class CustomRetryFactory(RetryFactory): + + retry_condition = RetryFactory.retry_condition | retry_if_exception( + is_permanent_download_error + ) + + def wait(self, retry_state: RetryCallState) -> float: + if is_permanent_download_error(retry_state.outcome.exception()): + return self.temporary_download_error_wait(retry_state=retry_state) + return super().wait(retry_state) + + def stop(self, retry_state: RetryCallState) -> bool: + if is_permanent_download_error(retry_state.outcome.exception()): + return self.temporary_download_error_stop(retry_state) + return super().stop(retry_state) + + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() """ retry_condition: retry_base = ( diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index c9029b9..f522824 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -77,21 +77,24 @@ def iter( class ZyteAPI: - """Synchronous Zyte API client. + """:ref:`Synchronous Zyte API client `. - To create an instance, pass your API key: + *api_key* is your Zyte API key. If not specified, it is read from the + ``ZYTE_API_KEY`` environment variable. See :ref:`api-key`. - .. code-block:: python + *api_url* is the Zyte API base URL. - client = ZyteAPI(api_key="YOUR_API_KEY") + *n_conn* is the maximum number of concurrent requests to use. See + :ref:`api-optimize`. - Or :ref:`use an environment variable ` and omit your API key: + *retrying* is the retry policy for requests. Defaults to + :data:`~zyte_api.zyte_api_retrying`. - .. code-block:: python + *user_agent* is the user agent string reported to Zyte API. Defaults to + ``python-zyte-api/``. - client = ZyteAPI() - - Use :meth:`get` and :meth:`iter` to send queries to Zyte API. + .. 
tip:: To change the ``User-Agent`` header sent to a target website, use + :http:`request:customHttpRequestHeaders` instead. """ def __init__( @@ -120,11 +123,20 @@ def get( handle_retries: bool = True, retrying: Optional[AsyncRetrying] = None, ) -> dict: - """Send a query to Zyte API and get the result. + """Send *query* to Zyte API and return the result. + + *endpoint* is the Zyte API endpoint path relative to the client object + *api_url*. + + *session* is the network session to use. Consider using + :meth:`session` instead of this parameter. - .. code-block:: python + *handle_retries* determines whether or not a :ref:`retry policy + ` should be used. - result = client.get({"url": "https://toscrape.com", "httpResponseBody": True}) + *retrying* is the :ref:`retry policy ` to use, provided + *handle_retries* is ``True``. If not specified, the :ref:`default retry + policy ` is used. """ loop = _get_loop() future = self._async_client.get( @@ -145,25 +157,20 @@ def iter( handle_retries: bool = True, retrying: Optional[AsyncRetrying] = None, ) -> Generator[Union[dict, Exception], None, None]: - """Send multiple queries to Zyte API in parallel and iterate over their - results as they come. + """Send multiple *queries* to Zyte API in parallel and iterate over + their results as they come. - .. code-block:: python - - queries = [ - {"url": "https://books.toscrape.com", "httpResponseBody": True}, - {"url": "https://quotes.toscrape.com", "httpResponseBody": True}, - ] - for result in client.iter(queries): - print(result) + The number of *queries* can exceed the *n_conn* parameter set on the + client object. Extra queries will be queued, there will be only up to + *n_conn* requests being processed in parallel at a time. Results may come an a different order from the original list of - *queries*. You can use echoData_ to attach metadata to queries that you - can later use to restore their original order. + *queries*. 
You can use :http:`request:echoData` to attach metadata to + queries, and later use that metadata to restore their original order. - .. _echoData: https://docs.zyte.com/zyte-api/usage/reference.html#operation/extract/request/echoData + When exceptions occur, they are yielded, not raised. - When exceptions occur, they are also yielded, not raised. + The remaining parameters work the same as in :meth:`get`. """ loop = _get_loop() for future in self._async_client.iter( @@ -179,4 +186,24 @@ def iter( yield exception def session(self, **kwargs): + """:ref:`Context manager ` to create a contextual + session. + + A contextual session is an object that has the same API as the client + object, except: + + - :meth:`get` and :meth:`iter` do not have a *session* parameter, + the contextual session creates an :class:`aiohttp.ClientSession` + object and passes it to :meth:`get` and :meth:`iter` automatically. + + - It does not have a :meth:`session` method. + + Using the same :class:`aiohttp.ClientSession` object for all Zyte API + requests improves performance by keeping a pool of reusable connections + to Zyte API. + + The :class:`aiohttp.ClientSession` object is created with sane defaults + for Zyte API, but you can use *kwargs* to pass additional parameters to + :class:`aiohttp.ClientSession` and even override those sane defaults. + """ return _Session(client=self, **kwargs) diff --git a/zyte_api/errors.py b/zyte_api/errors.py index 8088b54..0dce512 100644 --- a/zyte_api/errors.py +++ b/zyte_api/errors.py @@ -6,14 +6,25 @@ @attr.s(auto_attribs=True) class ParsedError: - """Parsed error from Zyte API""" + """Parsed error response body from Zyte API.""" + #: Raw response body from Zyte API. response_body: bytes + + #: JSON-decoded response body. + #: + #: If ``None``, :data:`parse_error` indicates the reason. 
data: Optional[dict] + + #: If :data:`data` is ``None``, this indicates whether the reason is that + #: :data:`response_body` is not valid JSON (``"bad_json"``) or that it is + #: not a JSON object (``"bad_format"``). parse_error: Optional[str] @classmethod def from_body(cls, response_body: bytes) -> "ParsedError": + """Return a :class:`ParsedError` object built out of the specified + error response body.""" data = None parse_error = None @@ -30,4 +41,6 @@ def from_body(cls, response_body: bytes) -> "ParsedError": @property def type(self) -> Optional[str]: + """ID of the error type, e.g. ``"/limits/over-user-limit"`` or + ``"/download/temporary-error"``.""" return (self.data or {}).get("type", None) From 3d5d0fd604c1a1e9737f7851910675de56f98b2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 14:21:23 +0200 Subject: [PATCH 080/126] Release notes (#68) --- CHANGES.rst | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index a39a513..f7f4504 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,31 @@ Changes ======= +0.5.0 (unreleased) +------------------ + +* Removed Python 3.7 support. + +* Added :class:`~zyte_api.ZyteAPI` and :class:`~zyte_api.AsyncZyteAPI` to + provide both sync and async Python interfaces with a cleaner API. + +* Deprecated ``zyte_api.aio``: + + * Replace ``zyte_api.aio.client.AsyncClient`` with the new + :class:`~zyte_api.AsyncZyteAPI` class. + + * Replace ``zyte_api.aio.client.create_session`` with the new + :meth:`AsyncZyteAPI.session ` method. + + * Import ``zyte_api.aio.errors.RequestError``, + ``zyte_api.aio.retry.RetryFactory`` and + ``zyte_api.aio.retry.zyte_api_retrying`` directly from ``zyte_api`` now. + +* When using the command-line interface, you can now use ``--store-errors`` to + have error responses be stored alongside successful responses. + +* Improved the documentation. 
+ 0.4.8 (2023-11-02) ------------------ From f7ba12ec4644b7f6146f19cb01090be7ed319348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 14:22:47 +0200 Subject: [PATCH 081/126] Set the release date for 0.5.0 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index f7f4504..71310ae 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.5.0 (unreleased) +0.5.0 (2024-04-05) ------------------ * Removed Python 3.7 support. From 85e9ab587ba29056bd50d5f4224bef37a06e5e17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 14:24:27 +0200 Subject: [PATCH 082/126] =?UTF-8?q?Bump=20version:=200.4.8=20=E2=86=92=200?= =?UTF-8?q?.5.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 5a0b6bd..ef07e3a 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.4.8 +current_version = 0.5.0 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index de0bb8f..5a318e8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ # The short X.Y version version = "" # The full version, including alpha/beta/rc tags -release = "0.4.8" +release = "0.5.0" # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index a3a9bd5..3d18726 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = "0.4.8" +__version__ = "0.5.0" From 396f26001c512e28c24d5281581a18cf1efc1fe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 14:34:21 +0200 Subject: [PATCH 083/126] Remove the changelog from the PyPI 
description --- setup.py | 2 +- tox.ini | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 7383672..310eb5e 100755 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ def get_version(): name="zyte-api", version=get_version(), description="Python interface to Zyte API", - long_description=open("README.rst").read() + "\n\n" + open("CHANGES.rst").read(), + long_description=open("README.rst").read(), long_description_content_type="text/x-rst", author="Zyte Group Ltd", author_email="opensource@zyte.com", diff --git a/tox.ini b/tox.ini index cb2b6c2..21023e8 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38,py39,py310,py311,mypy,docs +envlist = py38,py39,py310,py311,mypy,docs,twine [testenv] deps = @@ -36,3 +36,11 @@ commands = [testenv:pre-commit] deps = pre-commit commands = pre-commit run --all-files --show-diff-on-failure + +[testenv:twine] +deps = + twine==4.0.2 + build==1.0.3 +commands = + python setup.py sdist + twine check dist/* From 9968ac494c0917e2d77063ffd88a234430a4a8f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 5 Apr 2024 14:37:50 +0200 Subject: [PATCH 084/126] ReadTheDocs: do not fail on warnings --- .readthedocs.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.readthedocs.yml b/.readthedocs.yml index ead29a8..de19d2d 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -2,7 +2,6 @@ version: 2 formats: all sphinx: configuration: docs/conf.py - fail_on_warning: true build: os: ubuntu-22.04 tools: From 1baecba4015eb4c320c7173ff05d4fb6735cd7f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 16 Apr 2024 09:03:30 +0200 Subject: [PATCH 085/126] Add session.close(), remove internal _context (#69) --- tests/test_async.py | 59 +++++++++++++++++++++++++++++++++++++++++---- tests/test_sync.py | 50 ++++++++++++++++++++++++++++++++++---- tests/test_utils.py | 3 ++- zyte_api/_async.py | 34 ++++++++++++++------------ 
zyte_api/_sync.py | 52 +++++++++++++++++++++------------------ 5 files changed, 149 insertions(+), 49 deletions(-) diff --git a/tests/test_async.py b/tests/test_async.py index eb3b327..2110f63 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -388,7 +388,7 @@ async def test_semaphore(client_cls, get_method, iter_method, mockserver): @pytest.mark.asyncio -async def test_session(mockserver): +async def test_session_context_manager(mockserver): client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) queries = [ {"url": "https://a.example", "httpResponseBody": True}, @@ -408,7 +408,7 @@ async def test_session(mockserver): ] actual_results = [] async with client.session() as session: - assert session._context.connector.limit == client.n_conn + assert session._session.connector.limit == client.n_conn actual_results.append(await session.get(queries[0])) for future in session.iter(queries[1:]): try: @@ -416,16 +416,65 @@ async def test_session(mockserver): except Exception as e: result = e actual_results.append(result) - aiohttp_session = session._context + aiohttp_session = session._session assert not aiohttp_session.closed assert aiohttp_session.closed - assert session._context is None with pytest.raises(RuntimeError): await session.get(queries[0]) with pytest.raises(RuntimeError): - session.iter(queries[1:]) + future = next(iter(session.iter(queries[1:]))) + await future + + assert len(actual_results) == len(expected_results) + for actual_result in actual_results: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results + + +@pytest.mark.asyncio +async def test_session_no_context_manager(mockserver): + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + 
expected_results = [ + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + Exception, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + ] + actual_results = [] + session = client.session() + assert session._session.connector.limit == client.n_conn + actual_results.append(await session.get(queries[0])) + for future in session.iter(queries[1:]): + try: + result = await future + except Exception as e: + result = e + actual_results.append(result) + aiohttp_session = session._session + assert not aiohttp_session.closed + await session.close() + assert aiohttp_session.closed + + with pytest.raises(RuntimeError): + await session.get(queries[0]) + + with pytest.raises(RuntimeError): + future = next(iter(session.iter(queries[1:]))) + await future assert len(actual_results) == len(expected_results) for actual_result in actual_results: diff --git a/tests/test_sync.py b/tests/test_sync.py index 6012d13..8e014e2 100644 --- a/tests/test_sync.py +++ b/tests/test_sync.py @@ -67,7 +67,7 @@ def test_semaphore(mockserver): assert client._async_client._semaphore.__aexit__.call_count == len(queries) -def test_session(mockserver): +def test_session_context_manager(mockserver): client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) queries = [ {"url": "https://a.example", "httpResponseBody": True}, @@ -87,20 +87,60 @@ def test_session(mockserver): ] actual_results = [] with client.session() as session: - assert session._context.connector.limit == client._async_client.n_conn + assert session._session.connector.limit == client._async_client.n_conn actual_results.append(session.get(queries[0])) for result in session.iter(queries[1:]): actual_results.append(result) - aiohttp_session = session._context + aiohttp_session = session._session assert not aiohttp_session.closed assert aiohttp_session.closed - assert session._context is 
None with pytest.raises(RuntimeError): session.get(queries[0]) + assert isinstance(next(iter(session.iter(queries[1:]))), RuntimeError) + + assert len(actual_results) == len(expected_results) + for actual_result in actual_results: + if isinstance(actual_result, Exception): + assert Exception in expected_results + else: + assert actual_result in expected_results + + +def test_session_no_context_manager(mockserver): + client = ZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + queries = [ + {"url": "https://a.example", "httpResponseBody": True}, + {"url": "https://exception.example", "httpResponseBody": True}, + {"url": "https://b.example", "httpResponseBody": True}, + ] + expected_results = [ + { + "url": "https://a.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + Exception, + { + "url": "https://b.example", + "httpResponseBody": "PGh0bWw+PGJvZHk+SGVsbG88aDE+V29ybGQhPC9oMT48L2JvZHk+PC9odG1sPg==", + }, + ] + actual_results = [] + session = client.session() + assert session._session.connector.limit == client._async_client.n_conn + actual_results.append(session.get(queries[0])) + for result in session.iter(queries[1:]): + actual_results.append(result) + aiohttp_session = session._session + assert not aiohttp_session.closed + session.close() + assert aiohttp_session.closed + with pytest.raises(RuntimeError): - session.iter(queries[1:]) + session.get(queries[0]) + + assert isinstance(next(iter(session.iter(queries[1:]))), RuntimeError) assert len(actual_results) == len(expected_results) for actual_result in actual_results: diff --git a/tests/test_utils.py b/tests/test_utils.py index 912ed41..9228c57 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -114,7 +114,8 @@ def test_process_query_bytes(): _process_query({"url": b"https://example.com"}) -def test_deprecated_create_session(): +@pytest.mark.asyncio # https://github.com/aio-libs/aiohttp/pull/1468 +async def test_deprecated_create_session(): 
from zyte_api.aio.client import create_session as _create_session with pytest.warns( diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 48e3970..0c948f3 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -33,23 +33,15 @@ class _AsyncSession: def __init__(self, client, **session_kwargs): self._client = client self._session = create_session(client.n_conn, **session_kwargs) - self._context = None async def __aenter__(self): - self._context = await self._session.__aenter__() return self async def __aexit__(self, *exc_info): - result = await self._context.__aexit__(*exc_info) - self._context = None - return result + await self._session.close() - def _check_context(self): - if self._context is None: - raise RuntimeError( - "Attempt to use session method on a session either not opened " - "or already closed." - ) + async def close(self): + await self._session.close() async def get( self, @@ -59,13 +51,12 @@ async def get( handle_retries=True, retrying: Optional[AsyncRetrying] = None, ): - self._check_context() return await self._client.get( query=query, endpoint=endpoint, handle_retries=handle_retries, retrying=retrying, - session=self._context, + session=self._session, ) def iter( @@ -76,11 +67,10 @@ def iter( handle_retries=True, retrying: Optional[AsyncRetrying] = None, ) -> Iterator[Future]: - self._check_context() return self._client.iter( queries=queries, endpoint=endpoint, - session=self._context, + session=self._session, handle_retries=handle_retries, retrying=retrying, ) @@ -208,4 +198,18 @@ def _request(query): return asyncio.as_completed([_request(query) for query in queries]) def session(self, **kwargs): + """Asynchronous equivalent to :meth:`ZyteAPI.session`. + + You do not need to use :meth:`~AsyncZyteAPI.session` as an async + context manager as long as you await ``close()`` on the object it + returns when you are done: + + .. code-block:: python + + session = client.session() + try: + ... 
+ finally: + await session.close() + """ return _AsyncSession(client=self, **kwargs) diff --git a/zyte_api/_sync.py b/zyte_api/_sync.py index f522824..413618d 100644 --- a/zyte_api/_sync.py +++ b/zyte_api/_sync.py @@ -20,26 +20,24 @@ def _get_loop(): class _Session: def __init__(self, client, **session_kwargs): self._client = client - self._session = client._async_client.session(**session_kwargs) - self._context = None - def __enter__(self): + # https://github.com/aio-libs/aiohttp/pull/1468 + async def create_session(): + return client._async_client.session(**session_kwargs)._session + loop = _get_loop() - self._context = loop.run_until_complete(self._session.__aenter__())._context + self._session = loop.run_until_complete(create_session()) + + def __enter__(self): return self def __exit__(self, *exc_info): loop = _get_loop() - result = loop.run_until_complete(self._context.__aexit__(*exc_info)) - self._context = None - return result + loop.run_until_complete(self._session.close()) - def _check_context(self): - if self._context is None: - raise RuntimeError( - "Attempt to use session method on a session either not opened " - "or already closed." 
- ) + def close(self): + loop = _get_loop() + loop.run_until_complete(self._session.close()) def get( self, @@ -49,13 +47,12 @@ def get( handle_retries=True, retrying: Optional[AsyncRetrying] = None, ): - self._check_context() return self._client.get( query=query, endpoint=endpoint, handle_retries=handle_retries, retrying=retrying, - session=self._context, + session=self._session, ) def iter( @@ -66,11 +63,10 @@ def iter( handle_retries=True, retrying: Optional[AsyncRetrying] = None, ) -> Generator[Union[dict, Exception], None, None]: - self._check_context() return self._client.iter( queries=queries, endpoint=endpoint, - session=self._context, + session=self._session, handle_retries=handle_retries, retrying=retrying, ) @@ -186,15 +182,14 @@ def iter( yield exception def session(self, **kwargs): - """:ref:`Context manager ` to create a contextual - session. + """:ref:`Context manager ` to create a session. - A contextual session is an object that has the same API as the client - object, except: + A session is an object that has the same API as the client object, + except: - :meth:`get` and :meth:`iter` do not have a *session* parameter, - the contextual session creates an :class:`aiohttp.ClientSession` - object and passes it to :meth:`get` and :meth:`iter` automatically. + the session creates an :class:`aiohttp.ClientSession` object and + passes it to :meth:`get` and :meth:`iter` automatically. - It does not have a :meth:`session` method. @@ -205,5 +200,16 @@ def session(self, **kwargs): The :class:`aiohttp.ClientSession` object is created with sane defaults for Zyte API, but you can use *kwargs* to pass additional parameters to :class:`aiohttp.ClientSession` and even override those sane defaults. + + You do not need to use :meth:`session` as a context manager as long as + you call ``close()`` on the object it returns when you are done: + + .. code-block:: python + + session = client.session() + try: + ... 
+ finally: + session.close() """ return _Session(client=self, **kwargs) From ecfbb5bb79c1c193982fa7587f7b59a78111a53c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 16 Apr 2024 10:00:37 +0200 Subject: [PATCH 086/126] Release notes for 0.5.1 (#70) --- CHANGES.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 71310ae..9f01021 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,13 @@ Changes ======= +0.5.1 (unreleased) +------------------ + +* :class:`~zyte_api.ZyteAPI` and :class:`~zyte_api.AsyncZyteAPI` sessions no + longer need to be used as context managers, and can instead be closed with a + ``close()`` method. + 0.5.0 (2024-04-05) ------------------ From 7bba2eadffc24f460e0b6637652eb467c4a18ca4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 16 Apr 2024 10:01:14 +0200 Subject: [PATCH 087/126] Set the release date of 0.5.1 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index 9f01021..365f632 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.5.1 (unreleased) +0.5.1 (2024-04-16) ------------------ * :class:`~zyte_api.ZyteAPI` and :class:`~zyte_api.AsyncZyteAPI` sessions no From 22d044464a6615450a2a1a37f37f86ccd7bf0300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 16 Apr 2024 10:01:20 +0200 Subject: [PATCH 088/126] =?UTF-8?q?Bump=20version:=200.5.0=20=E2=86=92=200?= =?UTF-8?q?.5.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index ef07e3a..8d0a0ab 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.0 +current_version = 0.5.1 commit = True tag = True tag_name = {new_version} diff 
--git a/docs/conf.py b/docs/conf.py index 5a318e8..3385e1f 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ # The short X.Y version version = "" # The full version, including alpha/beta/rc tags -release = "0.5.0" +release = "0.5.1" # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 3d18726..dd9b22c 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = "0.5.0" +__version__ = "0.5.1" From e91d3a477d7b94ee596019bf6596fad4c93a6308 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 26 Apr 2024 12:46:06 +0200 Subject: [PATCH 089/126] Add conservative_retrying --- docs/ref/api.rst | 5 +++++ docs/use/api.rst | 14 ++++++++---- zyte_api/__init__.py | 12 ++++++++-- zyte_api/_retry.py | 53 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/docs/ref/api.rst b/docs/ref/api.rst index 5a6c4ab..b7b9229 100644 --- a/docs/ref/api.rst +++ b/docs/ref/api.rst @@ -26,8 +26,13 @@ Retries .. autodata:: zyte_api_retrying :no-value: +.. autodata:: conservative_retrying + :no-value: + .. autoclass:: RetryFactory +.. autoclass:: ConservativeRetryFactory + Errors ====== diff --git a/docs/use/api.rst b/docs/use/api.rst index 4f1522a..c96b618 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -159,13 +159,19 @@ following: All retries are done with an exponential backoff algorithm. -To customize the retry policy, create your own :class:`~tenacity.AsyncRetrying` -object, e.g. using a custom subclass of :data:`~zyte_api.RetryFactory`, and -pass it when creating your client object: +If some :ref:`unsuccessful responses ` exceed +maximum retries with the default retry policy, try using +:data:`~zyte_api.conservative_retrying` instead. 
Alternatively, the reference +documentation of :class:`~zyte_api.RetryFactory` and +:class:`~zyte_api.ConvervativeRetryFactory` features some examples of custom +retry policies, and you can always build your own +:class:`~tenacity.AsyncRetrying` object from scratch. + +To use a custom retry policy, pass it when creating your client object: .. code-block:: python - client = ZyteAPI(retrying=custom_retry_policy) + client = ZyteAPI(retrying=custom_retrying) When retries are exceeded for a given request, an exception is raised. Except for the :meth:`~ZyteAPI.iter` method of the :ref:`sync API `, which diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 347c509..2dc853d 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,11 +4,19 @@ from ._async import AsyncZyteAPI from ._errors import RequestError -from ._retry import RetryFactory +from ._retry import ConservativeRetryFactory, RetryFactory +from ._retry import conservative_retrying as _conservative_retrying from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI from .errors import ParsedError -# We re-define the variable here for Sphinx to pick the documentation. +# We re-define the variables here for Sphinx to pick the documentation. + #: :ref:`Default retry policy `. zyte_api_retrying = _zyte_api_retrying + +#: Alternative :ref:`retry policy ` that builds on top of +#: :data:`zyte_api_retrying`, but increases the number of attempts for +#: temporary download errors from 4 to 16, and retries as temporary download +#: errors any 5xx HTTP status code other than 503 (retried as rate-limiting). 
+conservative_retrying = _conservative_retrying diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index bd169b6..ee227a4 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -177,3 +177,56 @@ def build(self) -> AsyncRetrying: zyte_api_retrying: AsyncRetrying = RetryFactory().build() + + +def is_maybe_temporary_error(exc: BaseException) -> bool: + return ( + isinstance(exc, RequestError) + and exc.status >= 500 + and exc.status not in {503, 520} + ) + + +class ConservativeRetryFactory(RetryFactory): + """Alternative factory class that builds :data:`conservative_retrying`. + + To create a custom retry policy based on :data:`conservative_retrying`, you + can subclass this factory class, modify it as needed, and then call + :meth:`build` on your subclass to get the corresponding + :class:`tenacity.AsyncRetrying` object. + + For example, to increase the maximum number of attempts for errors treated + as temporary download errors by :data:`conservative_retrying` from 16 (i.e. + 15 retries) to 32 (i.e. 31 retries): + + .. 
code-block:: python + + from tenacity import stop_after_attempt + from zyte_api import ConservativeRetryFactory + + + class CustomRetryFactory(ConservativeRetryFactory): + temporary_download_error_stop = stop_after_attempt(32) + + + CUSTOM_RETRY_POLICY = CustomRetryFactory().build() + """ + + retry_condition = RetryFactory.retry_condition | retry_if_exception( + is_maybe_temporary_error + ) + + temporary_download_error_stop = stop_after_attempt(16) + + def stop(self, retry_state: RetryCallState) -> bool: + if is_maybe_temporary_error(retry_state.outcome.exception()): + return self.temporary_download_error_stop(retry_state) + return super().stop(retry_state) + + def wait(self, retry_state: RetryCallState) -> float: + if is_maybe_temporary_error(retry_state.outcome.exception()): + return self.temporary_download_error_wait(retry_state=retry_state) + return super().wait(retry_state) + + +conservative_retrying = ConservativeRetryFactory().build() From 413dd4bf42f30db4ac618df296fb030c6fcb7a15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 26 Apr 2024 12:53:13 +0200 Subject: [PATCH 090/126] Keep mypy happy --- zyte_api/_retry.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index ee227a4..ad695bf 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -219,12 +219,18 @@ class CustomRetryFactory(ConservativeRetryFactory): temporary_download_error_stop = stop_after_attempt(16) def stop(self, retry_state: RetryCallState) -> bool: - if is_maybe_temporary_error(retry_state.outcome.exception()): + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if is_maybe_temporary_error(exc): return self.temporary_download_error_stop(retry_state) return super().stop(retry_state) def wait(self, retry_state: RetryCallState) -> float: - if is_maybe_temporary_error(retry_state.outcome.exception()): + 
assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if is_maybe_temporary_error(exc): return self.temporary_download_error_wait(retry_state=retry_state) return super().wait(retry_state) From 48acc006ba217056fdbd5791926438fb212cc40d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Apr 2024 16:28:41 +0200 Subject: [PATCH 091/126] =?UTF-8?q?Conservative=20=E2=86=92=20Aggresive?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/ref/api.rst | 4 ++-- docs/use/api.rst | 2 +- zyte_api/__init__.py | 6 +++--- zyte_api/_retry.py | 14 +++++++------- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/ref/api.rst b/docs/ref/api.rst index b7b9229..2a5efb1 100644 --- a/docs/ref/api.rst +++ b/docs/ref/api.rst @@ -26,12 +26,12 @@ Retries .. autodata:: zyte_api_retrying :no-value: -.. autodata:: conservative_retrying +.. autodata:: aggresive_retrying :no-value: .. autoclass:: RetryFactory -.. autoclass:: ConservativeRetryFactory +.. autoclass:: AggresiveRetryFactory Errors diff --git a/docs/use/api.rst b/docs/use/api.rst index c96b618..301090e 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -161,7 +161,7 @@ All retries are done with an exponential backoff algorithm. If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using -:data:`~zyte_api.conservative_retrying` instead. Alternatively, the reference +:data:`~zyte_api.aggresive_retrying` instead. 
Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` and :class:`~zyte_api.ConvervativeRetryFactory` features some examples of custom retry policies, and you can always build your own diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 2dc853d..0fa9b80 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,8 +4,8 @@ from ._async import AsyncZyteAPI from ._errors import RequestError -from ._retry import ConservativeRetryFactory, RetryFactory -from ._retry import conservative_retrying as _conservative_retrying +from ._retry import AggresiveRetryFactory, RetryFactory +from ._retry import aggresive_retrying as _aggresive_retrying from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI from .errors import ParsedError @@ -19,4 +19,4 @@ #: :data:`zyte_api_retrying`, but increases the number of attempts for #: temporary download errors from 4 to 16, and retries as temporary download #: errors any 5xx HTTP status code other than 503 (retried as rate-limiting). -conservative_retrying = _conservative_retrying +aggresive_retrying = _aggresive_retrying diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index ad695bf..6f178fa 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -187,25 +187,25 @@ def is_maybe_temporary_error(exc: BaseException) -> bool: ) -class ConservativeRetryFactory(RetryFactory): - """Alternative factory class that builds :data:`conservative_retrying`. +class AggresiveRetryFactory(RetryFactory): + """Alternative factory class that builds :data:`aggresive_retrying`. - To create a custom retry policy based on :data:`conservative_retrying`, you + To create a custom retry policy based on :data:`aggresive_retrying`, you can subclass this factory class, modify it as needed, and then call :meth:`build` on your subclass to get the corresponding :class:`tenacity.AsyncRetrying` object. 
For example, to increase the maximum number of attempts for errors treated - as temporary download errors by :data:`conservative_retrying` from 16 (i.e. + as temporary download errors by :data:`aggresive_retrying` from 16 (i.e. 15 retries) to 32 (i.e. 31 retries): .. code-block:: python from tenacity import stop_after_attempt - from zyte_api import ConservativeRetryFactory + from zyte_api import AggresiveRetryFactory - class CustomRetryFactory(ConservativeRetryFactory): + class CustomRetryFactory(AggresiveRetryFactory): temporary_download_error_stop = stop_after_attempt(32) @@ -235,4 +235,4 @@ def wait(self, retry_state: RetryCallState) -> float: return super().wait(retry_state) -conservative_retrying = ConservativeRetryFactory().build() +aggresive_retrying = AggresiveRetryFactory().build() From 985827edf8cd4e157474e66094906bb8567f4391 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Apr 2024 16:29:39 +0200 Subject: [PATCH 092/126] =?UTF-8?q?is=5Fmaybe=5Ftemporary=5Ferror=20?= =?UTF-8?q?=E2=86=92=20=5Fmaybe=5Ftemporary=5Ferror?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/_retry.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 6f178fa..344dd2d 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -179,7 +179,7 @@ def build(self) -> AsyncRetrying: zyte_api_retrying: AsyncRetrying = RetryFactory().build() -def is_maybe_temporary_error(exc: BaseException) -> bool: +def _maybe_temporary_error(exc: BaseException) -> bool: return ( isinstance(exc, RequestError) and exc.status >= 500 @@ -213,7 +213,7 @@ class CustomRetryFactory(AggresiveRetryFactory): """ retry_condition = RetryFactory.retry_condition | retry_if_exception( - is_maybe_temporary_error + _maybe_temporary_error ) temporary_download_error_stop = stop_after_attempt(16) @@ -222,7 +222,7 @@ def stop(self, retry_state: RetryCallState) -> bool: 
assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if is_maybe_temporary_error(exc): + if _maybe_temporary_error(exc): return self.temporary_download_error_stop(retry_state) return super().stop(retry_state) @@ -230,7 +230,7 @@ def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if is_maybe_temporary_error(exc): + if _maybe_temporary_error(exc): return self.temporary_download_error_wait(retry_state=retry_state) return super().wait(retry_state) From aca5fc63caae54692263c2daee7ce49f956f1e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Apr 2024 17:12:54 +0200 Subject: [PATCH 093/126] Fix the description of the default retry policy --- docs/use/api.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index 301090e..847fa6a 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -152,8 +152,8 @@ following: - Retries :ref:`rate-limiting responses ` forever. -- Retries :ref:`unsuccessful responses ` up - to 3 times. +- Retries :ref:`temporary download errors + ` up to 3 times. - Retries network errors for up to 15 minutes. 
From 2a7d19e9f741f4bd0ecc4847ea08a6bca1d0095a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Apr 2024 17:20:20 +0200 Subject: [PATCH 094/126] Add tests for the attempt-based limits of the default retry policy --- tests/test_retry.py | 95 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/tests/test_retry.py b/tests/test_retry.py index 21fc41d..612e69b 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -1,3 +1,11 @@ +from collections import deque +from copy import copy + +import pytest + +from zyte_api import RequestError, zyte_api_retrying + + def test_deprecated_imports(): from zyte_api import RetryFactory, zyte_api_retrying from zyte_api.aio.retry import RetryFactory as DeprecatedRetryFactory @@ -5,3 +13,90 @@ def test_deprecated_imports(): assert RetryFactory is DeprecatedRetryFactory assert zyte_api_retrying is deprecated_zyte_api_retrying + + +def mock_request_error(*, status=200): + return RequestError( + history=None, + request_info=None, + response_content=None, + status=status, + ) + + +# Number of times to test request errors that must be retried forever. 
+FOREVER_TIMES = 100 + + +@pytest.mark.parametrize( + ("retrying", "exceptions", "exhausted"), + ( + *( + (zyte_api_retrying, exceptions, exhausted) + for exceptions, exhausted in ( + ( + (mock_request_error(status=429),) * FOREVER_TIMES, + False, + ), + ( + (mock_request_error(status=503),) * FOREVER_TIMES, + False, + ), + ( + (mock_request_error(status=520),) * 3, + False, + ), + ( + (mock_request_error(status=520),) * 4, + True, + ), + ( + ( + mock_request_error(status=429), + mock_request_error(status=429), + mock_request_error(status=520), + ), + False, + ), + ( + ( + mock_request_error(status=429), + mock_request_error(status=429), + mock_request_error(status=429), + mock_request_error(status=520), + ), + True, + ), + ) + ), + ), +) +@pytest.mark.asyncio +async def test_retrying_attempt_based_stop(retrying, exceptions, exhausted): + """Test retry stops based on a number of attempts (as opposed to those + based on time passed).""" + last_exception = exceptions[-1] + exceptions = deque(exceptions) + + def wait(retry_state): + return 0.0 + + retrying = copy(retrying) + retrying.wait = wait + + async def run(): + try: + exception = exceptions.popleft() + except IndexError: + return + else: + raise exception + + run = retrying.wraps(run) + try: + await run() + except Exception as exception: + assert exhausted + assert exception == last_exception + else: + assert not exhausted From 9ed623d922f5027bce06d2b7df58a96d5c64a717 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 30 Apr 2024 17:53:47 +0200 Subject: [PATCH 095/126] Reconfigure Codecov --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 08c211b..0d0ce2d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -33,7 +33,9 @@ jobs: tox -e py - name: coverage if: ${{ success() }} - run: bash <(curl -s https://codecov.io/bash) + uses: 
codecov/codecov-action@v4.0.1 + with: + token: ${{ secrets.CODECOV_TOKEN }} check: runs-on: ubuntu-latest From b7ff4194eebcaa450b9c091ea7477744635732ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 May 2024 22:10:20 +0200 Subject: [PATCH 096/126] Lower aggresive retrying of temporary download errors from 16 to 8 attempts --- tests/test_retry.py | 37 ++++++++++++++++++++++++++++++++++++- zyte_api/_retry.py | 2 +- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index 612e69b..f45a8ba 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -3,7 +3,7 @@ import pytest -from zyte_api import RequestError, zyte_api_retrying +from zyte_api import RequestError, aggresive_retrying, zyte_api_retrying def test_deprecated_imports(): @@ -69,6 +69,41 @@ def mock_request_error(*, status=200): ), ) ), + *( + (aggresive_retrying, exceptions, exhausted) + for exceptions, exhausted in ( + ( + (mock_request_error(status=429),) * FOREVER_TIMES, + False, + ), + ( + (mock_request_error(status=503),) * FOREVER_TIMES, + False, + ), + ( + (mock_request_error(status=520),) * 7, + False, + ), + ( + (mock_request_error(status=520),) * 8, + True, + ), + ( + ( + *(mock_request_error(status=429),) * 6, + mock_request_error(status=520), + ), + False, + ), + ( + ( + *(mock_request_error(status=429),) * 7, + mock_request_error(status=520), + ), + True, + ), + ) + ), ), ) @pytest.mark.asyncio diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 344dd2d..607753e 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -216,7 +216,7 @@ class CustomRetryFactory(AggresiveRetryFactory): _maybe_temporary_error ) - temporary_download_error_stop = stop_after_attempt(16) + temporary_download_error_stop = stop_after_attempt(8) def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" From 41a8d6916b5c1945204701d515a55f3f9602b1c4 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 7 May 2024 23:00:16 +0200 Subject: [PATCH 097/126] Ignore rate-limiting errors when counting max temporary download errors --- tests/test_retry.py | 47 ++++++++++++++++++++++++++++++++++++++++----- zyte_api/_retry.py | 29 ++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index f45a8ba..e60a1eb 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -52,19 +52,36 @@ def mock_request_error(*, status=200): ), ( ( - mock_request_error(status=429), - mock_request_error(status=429), + *(mock_request_error(status=429),) * 2, mock_request_error(status=520), ), False, ), ( ( - mock_request_error(status=429), - mock_request_error(status=429), - mock_request_error(status=429), + *(mock_request_error(status=429),) * 3, mock_request_error(status=520), ), + False, + ), + ( + ( + *( + mock_request_error(status=429), + mock_request_error(status=520), + ) + * 3, + ), + False, + ), + ( + ( + *( + mock_request_error(status=429), + mock_request_error(status=520), + ) + * 4, + ), True, ), ) @@ -100,6 +117,26 @@ def mock_request_error(*, status=200): *(mock_request_error(status=429),) * 7, mock_request_error(status=520), ), + False, + ), + ( + ( + *( + mock_request_error(status=429), + mock_request_error(status=520), + ) + * 7, + ), + False, + ), + ( + ( + *( + mock_request_error(status=429), + mock_request_error(status=520), + ) + * 8, + ), True, ), ) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 607753e..35c0520 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -1,5 +1,6 @@ import asyncio import logging +from collections import Counter from aiohttp import client_exceptions from tenacity import ( @@ -10,14 +11,13 @@ before_sleep_log, retry_base, retry_if_exception, - stop_after_attempt, stop_after_delay, wait_chain, wait_fixed, wait_random, wait_random_exponential, ) -from tenacity.stop import stop_never +from tenacity.stop import 
stop_base, stop_never from ._errors import RequestError @@ -54,6 +54,27 @@ def _is_temporary_download_error(exc: BaseException) -> bool: return isinstance(exc, RequestError) and exc.status == 520 +class stop_on_count(stop_base): + """Keep a call count with the specified counter name, and stop after the + specified number os calls. + + Unlike stop_after_attempt, this callable does not take into account + attempts for which a different stop callable was used. + """ + + def __init__(self, max_count: int, counter_name: str) -> None: + self._max_count = max_count - 1 + self._counter_name = counter_name + + def __call__(self, retry_state: "RetryCallState") -> bool: + if not hasattr(retry_state, "counter"): + retry_state.counter = Counter() + if retry_state.counter[self._counter_name] >= self._max_count: + return True + retry_state.counter[self._counter_name] += 1 + return False + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`default retry policy `. 
@@ -137,7 +158,7 @@ def stop(self, retry_state: RetryCallState) -> bool: temporary_download_error_wait = network_error_wait throttling_stop = stop_never network_error_stop = stop_after_delay(15 * 60) - temporary_download_error_stop = stop_after_attempt(4) + temporary_download_error_stop = stop_on_count(4, "temporary_download_error") def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" @@ -216,7 +237,7 @@ class CustomRetryFactory(AggresiveRetryFactory): _maybe_temporary_error ) - temporary_download_error_stop = stop_after_attempt(8) + temporary_download_error_stop = stop_on_count(8, "temporary_download_error") def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" From 439cac9e9975c5c23ecb4157975b30258bb6ec10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 May 2024 21:09:19 +0200 Subject: [PATCH 098/126] Only stop retries for network errors after 15 uninterrupted minutes of network errors --- docs/ref/api.rst | 4 +- docs/use/api.rst | 2 +- tests/test_retry.py | 147 +++++++++++++++++++++++++++++++++++-------- zyte_api/__init__.py | 6 +- zyte_api/_retry.py | 68 +++++++++++++++++--- 5 files changed, 187 insertions(+), 40 deletions(-) diff --git a/docs/ref/api.rst b/docs/ref/api.rst index 2a5efb1..70164ed 100644 --- a/docs/ref/api.rst +++ b/docs/ref/api.rst @@ -26,12 +26,12 @@ Retries .. autodata:: zyte_api_retrying :no-value: -.. autodata:: aggresive_retrying +.. autodata:: aggressive_retrying :no-value: .. autoclass:: RetryFactory -.. autoclass:: AggresiveRetryFactory +.. autoclass:: AggressiveRetryFactory Errors diff --git a/docs/use/api.rst b/docs/use/api.rst index 847fa6a..b7bc424 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -161,7 +161,7 @@ All retries are done with an exponential backoff algorithm. 
If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using -:data:`~zyte_api.aggresive_retrying` instead. Alternatively, the reference +:data:`~zyte_api.aggressive_retrying` instead. Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` and :class:`~zyte_api.ConvervativeRetryFactory` features some examples of custom retry policies, and you can always build your own diff --git a/tests/test_retry.py b/tests/test_retry.py index e60a1eb..3d13869 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -1,9 +1,11 @@ from collections import deque from copy import copy +from unittest.mock import patch import pytest +from aiohttp.client_exceptions import ServerConnectionError -from zyte_api import RequestError, aggresive_retrying, zyte_api_retrying +from zyte_api import RequestError, aggressive_retrying, zyte_api_retrying def test_deprecated_imports(): @@ -28,12 +30,20 @@ def mock_request_error(*, status=200): FOREVER_TIMES = 100 +class fast_forward: + def __init__(self, time): + self.time = time + + @pytest.mark.parametrize( - ("retrying", "exceptions", "exhausted"), + ("retrying", "outcomes", "exhausted"), ( + # Shared behaviors of all retry policies *( - (zyte_api_retrying, exceptions, exhausted) - for exceptions, exhausted in ( + (retrying, outcomes, exhausted) + for retrying in (zyte_api_retrying, aggressive_retrying) + for outcomes, exhausted in ( + # Rate limiting is retried forever. ( (mock_request_error(status=429),) * FOREVER_TIMES, False, @@ -42,6 +52,92 @@ def mock_request_error(*, status=200): (mock_request_error(status=503),) * FOREVER_TIMES, False, ), + # Network errors are retried until there have only been network + # errors (of any kind) for 15 minutes straight or more. 
+ ( + ( + ServerConnectionError(), + fast_forward(15 * 60 - 1), + ServerConnectionError(), + ), + False, + ), + ( + ( + ServerConnectionError(), + fast_forward(15 * 60), + ServerConnectionError(), + ), + True, + ), + ( + ( + mock_request_error(status=429), + fast_forward(15 * 60 - 1), + ServerConnectionError(), + ), + False, + ), + ( + ( + mock_request_error(status=429), + fast_forward(15 * 60), + ServerConnectionError(), + ), + False, + ), + ( + ( + ServerConnectionError(), + fast_forward(7 * 60), + mock_request_error(status=429), + fast_forward(8 * 60 - 1), + ServerConnectionError(), + ), + False, + ), + ( + ( + ServerConnectionError(), + fast_forward(7 * 60), + mock_request_error(status=429), + fast_forward(8 * 60), + ServerConnectionError(), + ), + False, + ), + ( + ( + ServerConnectionError(), + fast_forward(7 * 60), + mock_request_error(status=429), + fast_forward(8 * 60), + ServerConnectionError(), + fast_forward(15 * 60 - 1), + ServerConnectionError(), + ), + False, + ), + ( + ( + ServerConnectionError(), + fast_forward(7 * 60), + mock_request_error(status=429), + fast_forward(8 * 60), + ServerConnectionError(), + fast_forward(15 * 60), + ServerConnectionError(), + ), + True, + ), + ) + ), + # Behaviors specific to the default retry policy + *( + (zyte_api_retrying, outcomes, exhausted) + for outcomes, exhausted in ( + # Temporary download errors are retried until they have + # happened 4 times in total. ( (mock_request_error(status=520),) * 3, False, @@ -86,17 +182,12 @@ def mock_request_error(*, status=200): ), ) ), + # Behaviors specific to the aggressive retry policy *( - (aggresive_retrying, exceptions, exhausted) - for exceptions, exhausted in ( - ( - (mock_request_error(status=429),) * FOREVER_TIMES, - False, - ), - ( - (mock_request_error(status=503),) * FOREVER_TIMES, - False, - ), + (aggressive_retrying, outcomes, exhausted) + for outcomes, exhausted in ( + # Temporary download errors are retried until they have + # happened 8 times in total. 
( (mock_request_error(status=520),) * 7, False, @@ -144,11 +235,13 @@ def mock_request_error(*, status=200): ), ) @pytest.mark.asyncio -async def test_retrying_attempt_based_stop(retrying, exceptions, exhausted): +@patch("time.monotonic") +async def test_retrying(monotonic_mock, retrying, outcomes, exhausted): """Test retry stops based on a number of attempts (as opposed to those based on time passed).""" - last_exception = exceptions[-1] - exceptions = deque(exceptions) + monotonic_mock.return_value = 0 + last_outcome = outcomes[-1] + outcomes = deque(outcomes) def wait(retry_state): return 0.0 @@ -157,18 +250,22 @@ def wait(retry_state): retrying.wait = wait async def run(): - try: - exception = exceptions.popleft() - except IndexError: - return - else: - raise exception + while True: + try: + outcome = outcomes.popleft() + except IndexError: + return + else: + if isinstance(outcome, fast_forward): + monotonic_mock.return_value += outcome.time + continue + raise outcome run = retrying.wraps(run) try: await run() - except Exception as exception: + except Exception as outcome: assert exhausted - assert exception == last_exception + assert outcome == last_outcome else: assert not exhausted diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 0fa9b80..f9da695 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,8 +4,8 @@ from ._async import AsyncZyteAPI from ._errors import RequestError -from ._retry import AggresiveRetryFactory, RetryFactory -from ._retry import aggresive_retrying as _aggresive_retrying +from ._retry import AggressiveRetryFactory, RetryFactory +from ._retry import aggressive_retrying as _aggressive_retrying from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI from .errors import ParsedError @@ -19,4 +19,4 @@ #: :data:`zyte_api_retrying`, but increases the number of attempts for #: temporary download errors from 4 to 16, and retries as temporary download #: errors any 5xx HTTP status code 
other than 503 (retried as rate-limiting). -aggresive_retrying = _aggresive_retrying +aggressive_retrying = _aggressive_retrying diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 35c0520..211ea68 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -1,6 +1,8 @@ import asyncio import logging from collections import Counter +from datetime import timedelta +from typing import Union from aiohttp import client_exceptions from tenacity import ( @@ -11,7 +13,6 @@ before_sleep_log, retry_base, retry_if_exception, - stop_after_delay, wait_chain, wait_fixed, wait_random, @@ -75,6 +76,55 @@ def __call__(self, retry_state: "RetryCallState") -> bool: return False +time_unit_type = Union[int, float, timedelta] + + +def to_seconds(time_unit: time_unit_type) -> float: + return float( + time_unit.total_seconds() if isinstance(time_unit, timedelta) else time_unit + ) + + +class stop_after_uninterrumpted_delay(stop_base): + """Stop when this stop callable has been called for the specified time + uninterrupted, i.e. without calls to different stop callables. + + Unlike stop_after_delay, this callable resets its timer after any attempt + for which a different stop callable was used. + """ + + def __init__(self, max_delay: time_unit_type, timer_name: str) -> None: + self._max_delay = to_seconds(max_delay) + self._timer_name = timer_name + + def __call__(self, retry_state: "RetryCallState") -> bool: + if not hasattr(retry_state, "uninterrupted_start_times"): + retry_state.uninterrupted_start_times = {} + if self._timer_name not in retry_state.uninterrupted_start_times: + # First time. + retry_state.uninterrupted_start_times[self._timer_name] = [ + retry_state.attempt_number, + retry_state.outcome_timestamp, + ] + return False + attempt_number, start_time = retry_state.uninterrupted_start_times[ + self._timer_name + ] + if retry_state.attempt_number - attempt_number > 1: + # There was a different stop reason since the last attempt, + # resetting the timer. 
+ retry_state.uninterrupted_start_times[self._timer_name] = [ + retry_state.attempt_number, + retry_state.outcome_timestamp, + ] + return False + if retry_state.outcome_timestamp - start_time < self._max_delay: + # Within time, do not stop, only increase the attempt count. + retry_state.uninterrupted_start_times[self._timer_name][0] += 1 + return False + return True + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`default retry policy `. @@ -157,7 +207,7 @@ def stop(self, retry_state: RetryCallState) -> bool: ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never - network_error_stop = stop_after_delay(15 * 60) + network_error_stop = stop_after_uninterrumpted_delay(15 * 60, "network_error") temporary_download_error_stop = stop_on_count(4, "temporary_download_error") def wait(self, retry_state: RetryCallState) -> float: @@ -208,25 +258,25 @@ def _maybe_temporary_error(exc: BaseException) -> bool: ) -class AggresiveRetryFactory(RetryFactory): - """Alternative factory class that builds :data:`aggresive_retrying`. +class AggressiveRetryFactory(RetryFactory): + """Alternative factory class that builds :data:`aggressive_retrying`. - To create a custom retry policy based on :data:`aggresive_retrying`, you + To create a custom retry policy based on :data:`aggressive_retrying`, you can subclass this factory class, modify it as needed, and then call :meth:`build` on your subclass to get the corresponding :class:`tenacity.AsyncRetrying` object. For example, to increase the maximum number of attempts for errors treated - as temporary download errors by :data:`aggresive_retrying` from 16 (i.e. + as temporary download errors by :data:`aggressive_retrying` from 16 (i.e. 15 retries) to 32 (i.e. 31 retries): .. 
code-block:: python from tenacity import stop_after_attempt - from zyte_api import AggresiveRetryFactory + from zyte_api import AggressiveRetryFactory - class CustomRetryFactory(AggresiveRetryFactory): + class CustomRetryFactory(AggressiveRetryFactory): temporary_download_error_stop = stop_after_attempt(32) @@ -256,4 +306,4 @@ def wait(self, retry_state: RetryCallState) -> float: return super().wait(retry_state) -aggresive_retrying = AggresiveRetryFactory().build() +aggressive_retrying = AggressiveRetryFactory().build() From 9878e108c0c5bf1c63fefc62c36b750f90f8570a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 8 May 2024 21:39:24 +0200 Subject: [PATCH 099/126] Implement a custom download error handling for the aggressive retry policy --- tests/test_retry.py | 45 ++++++++++++++++++++++++++++++++++++-- zyte_api/_retry.py | 53 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 87 insertions(+), 11 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index 3d13869..7a54ec4 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -187,7 +187,8 @@ def __init__(self, time): (aggressive_retrying, outcomes, exhausted) for outcomes, exhausted in ( # Temporary download errors are retried until they have - # happened 8 times in total. + # happened 8 times in total. Permanent download errors also + # count towards that limit. 
( (mock_request_error(status=520),) * 7, False, @@ -230,6 +231,46 @@ def __init__(self, time): ), True, ), + ( + ( + *(mock_request_error(status=520),) * 5, + *(mock_request_error(status=521),) * 1, + *(mock_request_error(status=520),) * 1, + ), + False, + ), + ( + ( + *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=521),) * 1, + *(mock_request_error(status=520),) * 1, + ), + True, + ), + ( + ( + *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=521),) * 1, + ), + False, + ), + ( + ( + *(mock_request_error(status=520),) * 7, + *(mock_request_error(status=521),) * 1, + ), + True, + ), + # Permanent download errors are retried until they have + # happened 4 times in total. + ( + (*(mock_request_error(status=521),) * 3,), + False, + ), + ( + (*(mock_request_error(status=521),) * 4,), + True, + ), ) ), ), @@ -266,6 +307,6 @@ async def run(): await run() except Exception as outcome: assert exhausted - assert outcome == last_outcome + assert outcome is last_outcome else: assert not exhausted diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 211ea68..dd54242 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -64,15 +64,15 @@ class stop_on_count(stop_base): """ def __init__(self, max_count: int, counter_name: str) -> None: - self._max_count = max_count - 1 + self._max_count = max_count self._counter_name = counter_name def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "counter"): retry_state.counter = Counter() + retry_state.counter[self._counter_name] += 1 if retry_state.counter[self._counter_name] >= self._max_count: return True - retry_state.counter[self._counter_name] += 1 return False @@ -250,14 +250,42 @@ def build(self) -> AsyncRetrying: zyte_api_retrying: AsyncRetrying = RetryFactory().build() -def _maybe_temporary_error(exc: BaseException) -> bool: +def _download_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status in {520, 521} + + 
+def _undocumented_error(exc: BaseException) -> bool: return ( isinstance(exc, RequestError) and exc.status >= 500 - and exc.status not in {503, 520} + and exc.status not in {503, 520, 521} ) +class stop_on_download_error(stop_base): + """Stop after the specified max numbers of total or permanent download + errors.""" + + def __init__(self, max_total: int, max_permanent: int) -> None: + self._max_total = max_total + self._max_permanent = max_permanent + + def __call__(self, retry_state: "RetryCallState") -> bool: + if not hasattr(retry_state, "counter"): + retry_state.counter = Counter() + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if exc.status == 521: + retry_state.counter["permanent_download_error"] += 1 + if retry_state.counter["permanent_download_error"] >= self._max_permanent: + return True + retry_state.counter["download_error"] += 1 + if retry_state.counter["download_error"] >= self._max_total: + return True + return False + + class AggressiveRetryFactory(RetryFactory): """Alternative factory class that builds :data:`aggressive_retrying`. 
@@ -283,17 +311,22 @@ class CustomRetryFactory(AggressiveRetryFactory): CUSTOM_RETRY_POLICY = CustomRetryFactory().build() """ - retry_condition = RetryFactory.retry_condition | retry_if_exception( - _maybe_temporary_error + retry_condition = ( + RetryFactory.retry_condition + | retry_if_exception(_download_error) + | retry_if_exception(_undocumented_error) ) - temporary_download_error_stop = stop_on_count(8, "temporary_download_error") + download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) + download_error_wait = RetryFactory.temporary_download_error_wait def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if _maybe_temporary_error(exc): + if _download_error(exc): + return self.download_error_stop(retry_state) + if _undocumented_error(exc): return self.temporary_download_error_stop(retry_state) return super().stop(retry_state) @@ -301,7 +334,9 @@ def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if _maybe_temporary_error(exc): + if _download_error(exc): + return self.download_error_wait(retry_state) + if _undocumented_error(exc): return self.temporary_download_error_wait(retry_state=retry_state) return super().wait(retry_state) From 4d4688df8feb4a08b7157624c333c9984b7365d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 9 May 2024 09:17:39 +0200 Subject: [PATCH 100/126] Retry undocumented 5xx errors up to 4 times, not counting rate-limiting responses or network errors as interruptions --- tests/test_retry.py | 57 +++++++++++++++++++++++++++++++++++++++++++++ zyte_api/_retry.py | 37 ++++++++++++++++++++++++++--- 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index 7a54ec4..709d7a6 100644 
--- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -271,6 +271,63 @@ def __init__(self, time): (*(mock_request_error(status=521),) * 4,), True, ), + # Undocumented 5xx errors are retried until they have happened + # 4 times in a row, not counting rate-limiting responses or + # network errors. + *( + scenario + for status in ( + 500, + 502, + 504, + ) + for scenario in ( + ( + (*(mock_request_error(status=status),) * 3,), + False, + ), + ( + (*(mock_request_error(status=status),) * 4,), + True, + ), + ( + ( + *(mock_request_error(status=status),) * 2, + mock_request_error(status=429), + mock_request_error(status=503), + ServerConnectionError(), + mock_request_error(status=status), + ), + False, + ), + ( + ( + *(mock_request_error(status=status),) * 3, + mock_request_error(status=429), + mock_request_error(status=503), + ServerConnectionError(), + mock_request_error(status=status), + ), + True, + ), + ( + ( + mock_request_error(status=status), + mock_request_error(status=555), + *(mock_request_error(status=status),) * 3, + ), + False, + ), + ( + ( + mock_request_error(status=status), + mock_request_error(status=555), + *(mock_request_error(status=status),) * 4, + ), + True, + ), + ) + ), ) ), ), diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index dd54242..3b5e890 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -2,7 +2,7 @@ import logging from collections import Counter from datetime import timedelta -from typing import Union +from typing import Set, Union from aiohttp import client_exceptions from tenacity import ( @@ -286,6 +286,29 @@ def __call__(self, retry_state: "RetryCallState") -> bool: return False +class stop_on_uninterrupted_status(stop_base): + """Stop after the specified max number of error responses with the same + status code in a row.""" + + def __init__(self, _max: int, ignore_status: Set[int]) -> None: + self._max = _max + self._ignore_status = ignore_status + + def __call__(self, retry_state: "RetryCallState") -> bool: + 
assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + count = 0 + for status in reversed(retry_state.status_history): + if status == exc.status: + count += 1 + if count >= self._max: + return True + elif status not in self._ignore_status: + return False + return False + + class AggressiveRetryFactory(RetryFactory): """Alternative factory class that builds :data:`aggressive_retrying`. @@ -320,14 +343,22 @@ class CustomRetryFactory(AggressiveRetryFactory): download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) download_error_wait = RetryFactory.temporary_download_error_wait + undocumented_error_stop = stop_on_uninterrupted_status( + 4, ignore_status={-1, 429, 503} + ) + undocumented_error_wait = RetryFactory.temporary_download_error_wait + def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" + if not hasattr(retry_state, "status_history"): + retry_state.status_history = [] + retry_state.status_history.append(getattr(exc, "status", -1)) if _download_error(exc): return self.download_error_stop(retry_state) if _undocumented_error(exc): - return self.temporary_download_error_stop(retry_state) + return self.undocumented_error_stop(retry_state) return super().stop(retry_state) def wait(self, retry_state: RetryCallState) -> float: @@ -337,7 +368,7 @@ def wait(self, retry_state: RetryCallState) -> float: if _download_error(exc): return self.download_error_wait(retry_state) if _undocumented_error(exc): - return self.temporary_download_error_wait(retry_state=retry_state) + return self.undocumented_error_wait(retry_state=retry_state) return super().wait(retry_state) From b71edb0cf15ed549f062b9afd4ff6689c42968ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 9 May 2024 09:29:16 +0200 Subject: [PATCH 101/126] Ignore 
mypy complaints about RetryCallState custom attributes added at run time --- zyte_api/_retry.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 3b5e890..6e6e8ba 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -69,9 +69,9 @@ def __init__(self, max_count: int, counter_name: str) -> None: def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "counter"): - retry_state.counter = Counter() - retry_state.counter[self._counter_name] += 1 - if retry_state.counter[self._counter_name] >= self._max_count: + retry_state.counter = Counter() # type: ignore + retry_state.counter[self._counter_name] += 1 # type: ignore + if retry_state.counter[self._counter_name] >= self._max_count: # type: ignore return True return False @@ -99,28 +99,28 @@ def __init__(self, max_delay: time_unit_type, timer_name: str) -> None: def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "uninterrupted_start_times"): - retry_state.uninterrupted_start_times = {} - if self._timer_name not in retry_state.uninterrupted_start_times: + retry_state.uninterrupted_start_times = {} # type: ignore + if self._timer_name not in retry_state.uninterrupted_start_times: # type: ignore # First time. - retry_state.uninterrupted_start_times[self._timer_name] = [ + retry_state.uninterrupted_start_times[self._timer_name] = [ # type: ignore retry_state.attempt_number, retry_state.outcome_timestamp, ] return False - attempt_number, start_time = retry_state.uninterrupted_start_times[ + attempt_number, start_time = retry_state.uninterrupted_start_times[ # type: ignore self._timer_name ] if retry_state.attempt_number - attempt_number > 1: # There was a different stop reason since the last attempt, # resetting the timer. 
- retry_state.uninterrupted_start_times[self._timer_name] = [ + retry_state.uninterrupted_start_times[self._timer_name] = [ # type: ignore retry_state.attempt_number, retry_state.outcome_timestamp, ] return False if retry_state.outcome_timestamp - start_time < self._max_delay: # Within time, do not stop, only increase the attempt count. - retry_state.uninterrupted_start_times[self._timer_name][0] += 1 + retry_state.uninterrupted_start_times[self._timer_name][0] += 1 # type: ignore return False return True @@ -272,16 +272,16 @@ def __init__(self, max_total: int, max_permanent: int) -> None: def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "counter"): - retry_state.counter = Counter() + retry_state.counter = Counter() # type: ignore assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if exc.status == 521: - retry_state.counter["permanent_download_error"] += 1 - if retry_state.counter["permanent_download_error"] >= self._max_permanent: + if exc.status == 521: # type: ignore + retry_state.counter["permanent_download_error"] += 1 # type: ignore + if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore return True - retry_state.counter["download_error"] += 1 - if retry_state.counter["download_error"] >= self._max_total: + retry_state.counter["download_error"] += 1 # type: ignore + if retry_state.counter["download_error"] >= self._max_total: # type: ignore return True return False @@ -299,8 +299,8 @@ def __call__(self, retry_state: "RetryCallState") -> bool: exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" count = 0 - for status in reversed(retry_state.status_history): - if status == exc.status: + for status in reversed(retry_state.status_history): # type: ignore + if status == exc.status: # type: ignore count += 1 if count >= self._max: return True @@ -353,8 +353,8 @@ def stop(self, 
retry_state: RetryCallState) -> bool: exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" if not hasattr(retry_state, "status_history"): - retry_state.status_history = [] - retry_state.status_history.append(getattr(exc, "status", -1)) + retry_state.status_history = [] # type: ignore + retry_state.status_history.append(getattr(exc, "status", -1)) # type: ignore if _download_error(exc): return self.download_error_stop(retry_state) if _undocumented_error(exc): From bba313d96ff564536fd183f1c3ac1cc4c5ddd4cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Thu, 9 May 2024 10:12:26 +0200 Subject: [PATCH 102/126] Clean up APIs for easier subclassing, and update retry docs --- docs/use/api.rst | 30 +++++++++++--- zyte_api/__init__.py | 6 +++ zyte_api/_retry.py | 99 ++++++++++++++++---------------------------- 3 files changed, 66 insertions(+), 69 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index b7bc424..ed5c1e8 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -155,23 +155,41 @@ following: - Retries :ref:`temporary download errors ` up to 3 times. -- Retries network errors for up to 15 minutes. +- Retries network errors until they have happened for 15 minutes straight. All retries are done with an exponential backoff algorithm. +.. _aggressive-retry-policy: + If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using -:data:`~zyte_api.aggressive_retrying` instead. Alternatively, the reference -documentation of :class:`~zyte_api.RetryFactory` and -:class:`~zyte_api.ConvervativeRetryFactory` features some examples of custom +:data:`~zyte_api.aggressive_retrying` instead, which modifies the default retry +policy as follows: + +- Temporary download error are retried 7 times. :ref:`Permanent download + errors ` also count towards this retry + limit. + +- Retries permanent download errors up to 3 times. 
+ +- Retries error responses with an HTTP status code in the 500-599 range (503, + 520 and 521 excluded) until they have happened 4 times in a row, not + counting rate-limiting responses or network errors that may happen in + between. + +Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` +and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom retry policies, and you can always build your own :class:`~tenacity.AsyncRetrying` object from scratch. -To use a custom retry policy, pass it when creating your client object: +To use :data:`~zyte_api.aggressive_retrying` or a custom retry policy, pass it +when creating your client object: .. code-block:: python - client = ZyteAPI(retrying=custom_retrying) + from zyte_api import ZyteAPI, aggressive_retrying + + client = ZyteAPI(retrying=aggressive_retrying) When retries are exceeded for a given request, an exception is raised. Except for the :meth:`~ZyteAPI.iter` method of the :ref:`sync API `, which diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index f9da695..da61ae6 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -6,6 +6,12 @@ from ._errors import RequestError from ._retry import AggressiveRetryFactory, RetryFactory from ._retry import aggressive_retrying as _aggressive_retrying +from ._retry import ( + stop_after_uninterrumpted_delay, + stop_on_count, + stop_on_download_error, + stop_on_uninterrupted_status, +) from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI from .errors import ParsedError diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 6e6e8ba..a8f5241 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -2,7 +2,7 @@ import logging from collections import Counter from datetime import timedelta -from typing import Set, Union +from typing import Union from aiohttp import client_exceptions from tenacity import ( @@ -63,9 +63,9 @@ class stop_on_count(stop_base): attempts for which a different 
stop callable was used. """ - def __init__(self, max_count: int, counter_name: str) -> None: + def __init__(self, max_count: int) -> None: self._max_count = max_count - self._counter_name = counter_name + self._counter_name = id(self) def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "counter"): @@ -93,9 +93,9 @@ class stop_after_uninterrumpted_delay(stop_base): for which a different stop callable was used. """ - def __init__(self, max_delay: time_unit_type, timer_name: str) -> None: + def __init__(self, max_delay: time_unit_type) -> None: self._max_delay = to_seconds(max_delay) - self._timer_name = timer_name + self._timer_name = id(self) def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "uninterrupted_start_times"): @@ -133,51 +133,22 @@ class RetryFactory: modify it as needed, and then call :meth:`build` on your subclass to get the corresponding :class:`tenacity.AsyncRetrying` object. - For example, to increase the maximum number of attempts for :ref:`temporary - download errors ` from 4 (i.e. 3 - retries) to 10 (i.e. 9 retries): + For example, to double the number of attempts for :ref:`temporary + download errors ` and the time network + errors are retried: .. code-block:: python - from tenacity import stop_after_attempt - from zyte_api import RetryFactory - - - class CustomRetryFactory(RetryFactory): - temporary_download_error_stop = stop_after_attempt(10) - - - CUSTOM_RETRY_POLICY = CustomRetryFactory().build() - - To retry :ref:`permanent download errors - `, treating them the same as - :ref:`temporary download errors `: - - .. 
code-block:: python - - from tenacity import RetryCallState, retry_if_exception, stop_after_attempt - from zyte_api import RequestError, RetryFactory - - - def is_permanent_download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 521 + from zyte_api import ( + RetryFactory, + stop_after_uninterrumpted_delay, + stop_on_count, + ) class CustomRetryFactory(RetryFactory): - - retry_condition = RetryFactory.retry_condition | retry_if_exception( - is_permanent_download_error - ) - - def wait(self, retry_state: RetryCallState) -> float: - if is_permanent_download_error(retry_state.outcome.exception()): - return self.temporary_download_error_wait(retry_state=retry_state) - return super().wait(retry_state) - - def stop(self, retry_state: RetryCallState) -> bool: - if is_permanent_download_error(retry_state.outcome.exception()): - return self.temporary_download_error_stop(retry_state) - return super().stop(retry_state) + network_error_stop = stop_after_uninterrumpted_delay(30 * 60) + temporary_download_error_stop = stop_on_count(8) CUSTOM_RETRY_POLICY = CustomRetryFactory().build() @@ -207,8 +178,8 @@ def stop(self, retry_state: RetryCallState) -> bool: ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never - network_error_stop = stop_after_uninterrumpted_delay(15 * 60, "network_error") - temporary_download_error_stop = stop_on_count(4, "temporary_download_error") + network_error_stop = stop_after_uninterrumpted_delay(15 * 60) + temporary_download_error_stop = stop_on_count(4) def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" @@ -290,9 +261,8 @@ class stop_on_uninterrupted_status(stop_base): """Stop after the specified max number of error responses with the same status code in a row.""" - def __init__(self, _max: int, ignore_status: Set[int]) -> None: + def __init__(self, _max: int) -> None: self._max = _max - self._ignore_status = ignore_status 
def __call__(self, retry_state: "RetryCallState") -> bool: assert retry_state.outcome, "Unexpected empty outcome" @@ -304,31 +274,36 @@ def __call__(self, retry_state: "RetryCallState") -> bool: count += 1 if count >= self._max: return True - elif status not in self._ignore_status: + elif status not in {-1, 429, 503}: return False return False class AggressiveRetryFactory(RetryFactory): - """Alternative factory class that builds :data:`aggressive_retrying`. + """Factory class that builds the :class:`tenacity.AsyncRetrying` object + that defines the :ref:`aggressive retry policy `. - To create a custom retry policy based on :data:`aggressive_retrying`, you - can subclass this factory class, modify it as needed, and then call - :meth:`build` on your subclass to get the corresponding - :class:`tenacity.AsyncRetrying` object. + To create a custom retry policy, you can subclass this factory class, + modify it as needed, and then call :meth:`build` on your subclass to get + the corresponding :class:`tenacity.AsyncRetrying` object. - For example, to increase the maximum number of attempts for errors treated - as temporary download errors by :data:`aggressive_retrying` from 16 (i.e. - 15 retries) to 32 (i.e. 31 retries): + For example, to double the maximum number of attempts for all error + responses and double the time network errors are retried: .. 
code-block:: python - from tenacity import stop_after_attempt - from zyte_api import AggressiveRetryFactory + from zyte_api import ( + AggressiveRetryFactory, + stop_after_uninterrumpted_delay, + stop_on_download_error, + stop_on_uninterrupted_status, + ) class CustomRetryFactory(AggressiveRetryFactory): - temporary_download_error_stop = stop_after_attempt(32) + download_error_stop = stop_on_download_error(max_total=16, max_permanent=8) + network_error_stop = stop_after_uninterrumpted_delay(30 * 60) + undocumented_error_stop = stop_on_uninterrupted_status(8) CUSTOM_RETRY_POLICY = CustomRetryFactory().build() @@ -343,9 +318,7 @@ class CustomRetryFactory(AggressiveRetryFactory): download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) download_error_wait = RetryFactory.temporary_download_error_wait - undocumented_error_stop = stop_on_uninterrupted_status( - 4, ignore_status={-1, 429, 503} - ) + undocumented_error_stop = stop_on_uninterrupted_status(4) undocumented_error_wait = RetryFactory.temporary_download_error_wait def stop(self, retry_state: RetryCallState) -> bool: From 5f4770281c787770e0388297233637c76fec7332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 10:31:41 +0200 Subject: [PATCH 103/126] Implement RequestError.query --- tests/test_main.py | 6 ++++-- zyte_api/_async.py | 4 +++- zyte_api/_errors.py | 14 +++++++++++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/test_main.py b/tests/test_main.py index b18c25f..9fbb3bd 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -154,12 +154,13 @@ async def test_run_stop_on_errors_false(mockserver): @pytest.mark.asyncio async def test_run_stop_on_errors_true(mockserver): - queries = [{"url": "https://exception.example", "httpResponseBody": True}] + query = {"url": "https://exception.example", "httpResponseBody": True} + queries = [query] with NamedTemporaryFile("w") as output_file: with pytest.warns( DeprecationWarning, 
match=r"^The stop_on_errors parameter is deprecated\.$" ): - with pytest.raises(RequestError): + with pytest.raises(RequestError) as exc_info: await run( queries=queries, out=output_file, @@ -168,6 +169,7 @@ async def test_run_stop_on_errors_true(mockserver): api_key="a", stop_on_errors=True, ) + assert exc_info.value.query == query def _run(*, input, mockserver, cli_params=None): diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 0c948f3..630133e 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -121,9 +121,10 @@ async def request(): stats = ResponseStats.create(start_global) self.agg_stats.n_attempts += 1 + safe_query = _process_query(query) post_kwargs = dict( url=self.api_url + endpoint, - json=_process_query(query), + json=safe_query, auth=auth, headers=headers, ) @@ -145,6 +146,7 @@ async def request(): message=resp.reason, headers=resp.headers, response_content=content, + query=safe_query, ) response = await resp.json() diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index ec5c43a..ea589ae 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -1,5 +1,5 @@ import logging -from typing import Optional +from typing import Any, Dict, Optional from aiohttp import ClientResponseError @@ -14,10 +14,18 @@ class RequestError(ClientResponseError): ` response from Zyte API.""" def __init__(self, *args, **kwargs): - #: Response body. - self.response_content: Optional[bytes] = kwargs.pop("response_content") + #: Query sent to Zyte API. + #: + #: May be slightly different from the input query due to + #: pre-processing logic on the client side. + self.query: Dict[str, Any] = kwargs.pop("query") + #: Request ID. self.request_id: Optional[str] = kwargs.get("headers", {}).get("request-id") + + #: Response body. 
+ self.response_content: Optional[bytes] = kwargs.pop("response_content") + super().__init__(*args, **kwargs) @property From 7cf84238bceb5ca296a84af255601a0a3f346f6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 14:52:22 +0200 Subject: [PATCH 104/126] Improve the wording around custom retry policy usage --- docs/use/api.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index ed5c1e8..435a2a4 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -182,8 +182,9 @@ and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom retry policies, and you can always build your own :class:`~tenacity.AsyncRetrying` object from scratch. -To use :data:`~zyte_api.aggressive_retrying` or a custom retry policy, pass it -when creating your client object: +To use :data:`~zyte_api.aggressive_retrying` or a custom retry policy, pass an +instance of your :class:`~tenacity.AsyncRetrying` subclass when creating your +client object: .. 
code-block:: python From d5d174062cd19f72650c8c32eac9c4b8ff550584 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 15:33:54 +0200 Subject: [PATCH 105/126] Raise ValueError if retrying does not get an instance of AsyncRetrying --- tests/test_async.py | 9 ++++++++- zyte_api/_async.py | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/test_async.py b/tests/test_async.py index 2110f63..088177c 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -4,7 +4,7 @@ import pytest from tenacity import AsyncRetrying -from zyte_api import AsyncZyteAPI, RequestError +from zyte_api import AggressiveRetryFactory, AsyncZyteAPI, RequestError from zyte_api._retry import RetryFactory from zyte_api.aio.client import AsyncClient from zyte_api.apikey import NoApiKey @@ -482,3 +482,10 @@ async def test_session_no_context_manager(mockserver): assert Exception in expected_results else: assert actual_result in expected_results + + +def test_retrying_class(): + """A descriptive exception is raised when creating a client with an + AsyncRetrying subclass or similar instead of an instance of it.""" + with pytest.raises(ValueError): + AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 0c948f3..5d83414 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -91,6 +91,11 @@ def __init__( retrying: Optional[AsyncRetrying] = None, user_agent: Optional[str] = None, ): + if retrying is not None and not isinstance(retrying, AsyncRetrying): + raise ValueError( + "The retrying parameter, if defined, must be an instance of " + "AsyncRetrying." 
+ ) self.api_key = get_apikey(api_key) self.api_url = api_url self.n_conn = n_conn From adf4f109d61669aba267add1b659976651aca7ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 15:35:15 +0200 Subject: [PATCH 106/126] =?UTF-8?q?Fix=20typo:=20uninterrumpted=20?= =?UTF-8?q?=E2=86=92=20uninterrupted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/__init__.py | 2 +- zyte_api/_retry.py | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index da61ae6..95ce4a7 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -7,7 +7,7 @@ from ._retry import AggressiveRetryFactory, RetryFactory from ._retry import aggressive_retrying as _aggressive_retrying from ._retry import ( - stop_after_uninterrumpted_delay, + stop_after_uninterrupted_delay, stop_on_count, stop_on_download_error, stop_on_uninterrupted_status, diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index a8f5241..8a87268 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -85,7 +85,7 @@ def to_seconds(time_unit: time_unit_type) -> float: ) -class stop_after_uninterrumpted_delay(stop_base): +class stop_after_uninterrupted_delay(stop_base): """Stop when this stop callable has been called for the specified time uninterrupted, i.e. without calls to different stop callables. 
@@ -141,13 +141,13 @@ class RetryFactory: from zyte_api import ( RetryFactory, - stop_after_uninterrumpted_delay, + stop_after_uninterrupted_delay, stop_on_count, ) class CustomRetryFactory(RetryFactory): - network_error_stop = stop_after_uninterrumpted_delay(30 * 60) + network_error_stop = stop_after_uninterrupted_delay(30 * 60) temporary_download_error_stop = stop_on_count(8) @@ -178,7 +178,7 @@ class CustomRetryFactory(RetryFactory): ) temporary_download_error_wait = network_error_wait throttling_stop = stop_never - network_error_stop = stop_after_uninterrumpted_delay(15 * 60) + network_error_stop = stop_after_uninterrupted_delay(15 * 60) temporary_download_error_stop = stop_on_count(4) def wait(self, retry_state: RetryCallState) -> float: @@ -294,7 +294,7 @@ class AggressiveRetryFactory(RetryFactory): from zyte_api import ( AggressiveRetryFactory, - stop_after_uninterrumpted_delay, + stop_after_uninterrupted_delay, stop_on_download_error, stop_on_uninterrupted_status, ) @@ -302,7 +302,7 @@ class AggressiveRetryFactory(RetryFactory): class CustomRetryFactory(AggressiveRetryFactory): download_error_stop = stop_on_download_error(max_total=16, max_permanent=8) - network_error_stop = stop_after_uninterrumpted_delay(30 * 60) + network_error_stop = stop_after_uninterrupted_delay(30 * 60) undocumented_error_stop = stop_on_uninterrupted_status(8) From f51c561413bfe52c4be18ea1d23989e70f15d780 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 15:39:06 +0200 Subject: [PATCH 107/126] Avoid using id(self) --- zyte_api/_retry.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 8a87268..a44122a 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -2,6 +2,7 @@ import logging from collections import Counter from datetime import timedelta +from itertools import count from typing import Union from aiohttp import client_exceptions @@ -24,6 +25,8 @@ 
logger = logging.getLogger(__name__) +_IDS = count() + _NETWORK_ERRORS = ( asyncio.TimeoutError, # could happen while reading the response body @@ -65,13 +68,13 @@ class stop_on_count(stop_base): def __init__(self, max_count: int) -> None: self._max_count = max_count - self._counter_name = id(self) + self._counter_id = next(_IDS) def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "counter"): retry_state.counter = Counter() # type: ignore - retry_state.counter[self._counter_name] += 1 # type: ignore - if retry_state.counter[self._counter_name] >= self._max_count: # type: ignore + retry_state.counter[self._counter_id] += 1 # type: ignore + if retry_state.counter[self._counter_id] >= self._max_count: # type: ignore return True return False @@ -95,32 +98,32 @@ class stop_after_uninterrupted_delay(stop_base): def __init__(self, max_delay: time_unit_type) -> None: self._max_delay = to_seconds(max_delay) - self._timer_name = id(self) + self._timer_id = next(_IDS) def __call__(self, retry_state: "RetryCallState") -> bool: if not hasattr(retry_state, "uninterrupted_start_times"): retry_state.uninterrupted_start_times = {} # type: ignore - if self._timer_name not in retry_state.uninterrupted_start_times: # type: ignore + if self._timer_id not in retry_state.uninterrupted_start_times: # type: ignore # First time. - retry_state.uninterrupted_start_times[self._timer_name] = [ # type: ignore + retry_state.uninterrupted_start_times[self._timer_id] = [ # type: ignore retry_state.attempt_number, retry_state.outcome_timestamp, ] return False attempt_number, start_time = retry_state.uninterrupted_start_times[ # type: ignore - self._timer_name + self._timer_id ] if retry_state.attempt_number - attempt_number > 1: # There was a different stop reason since the last attempt, # resetting the timer. 
- retry_state.uninterrupted_start_times[self._timer_name] = [ # type: ignore + retry_state.uninterrupted_start_times[self._timer_id] = [ # type: ignore retry_state.attempt_number, retry_state.outcome_timestamp, ] return False if retry_state.outcome_timestamp - start_time < self._max_delay: # Within time, do not stop, only increase the attempt count. - retry_state.uninterrupted_start_times[self._timer_name][0] += 1 # type: ignore + retry_state.uninterrupted_start_times[self._timer_id][0] += 1 # type: ignore return False return True From 2f533a74ffe9147b50f16117bacd94f6c5be4ce7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 15:45:46 +0200 Subject: [PATCH 108/126] Add missing parameter --- tests/test_retry.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_retry.py b/tests/test_retry.py index 709d7a6..ac8132c 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -23,6 +23,7 @@ def mock_request_error(*, status=200): request_info=None, response_content=None, status=status, + query={}, ) From 99f8523cb278d91af83f8254aa3015a4ba627c32 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 17:18:46 +0200 Subject: [PATCH 109/126] Release notes for 0.5.2 (#74) --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 365f632..f155be5 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,12 @@ Changes ======= +0.5.2 (2024-05-DD) +------------------ + +* :class:`~zyte_api.RequestError` now has a :data:`~zyte_api.RequestError.query` + attribute with the Zyte API request parameters that caused the error. 
+ 0.5.1 (2024-04-16) ------------------ From 452f711dfd878c62e78843c1db69a73f6d316095 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 17:19:27 +0200 Subject: [PATCH 110/126] Set the release date for 0.5.2 --- CHANGES.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.rst b/CHANGES.rst index f155be5..e38dd9d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,7 +1,7 @@ Changes ======= -0.5.2 (2024-05-DD) +0.5.2 (2024-05-10) ------------------ * :class:`~zyte_api.RequestError` now has a :data:`~zyte_api.RequestError.query` From 51e59ea530e4cf04aa4017b83c117e1168a6066c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 17:19:33 +0200 Subject: [PATCH 111/126] =?UTF-8?q?Bump=20version:=200.5.1=20=E2=86=92=200?= =?UTF-8?q?.5.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8d0a0ab..30ebee3 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.1 +current_version = 0.5.2 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index 3385e1f..f52aa22 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ # The short X.Y version version = "" # The full version, including alpha/beta/rc tags -release = "0.5.1" +release = "0.5.2" # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index dd9b22c..7225152 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = "0.5.1" +__version__ = "0.5.2" From f9a8c2647d92c8e0e2eccf7c890bfc313f9f9434 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 May 2024 22:58:47 +0200 
Subject: [PATCH 112/126] Concentrate retry tests and complete coverage --- tests/mockserver.py | 3 + tests/test_async.py | 170 -------------------------------------------- tests/test_retry.py | 110 ++++++++++++++++++++++++++-- 3 files changed, 109 insertions(+), 174 deletions(-) diff --git a/tests/mockserver.py b/tests/mockserver.py index 40b1d6e..023b72f 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -86,6 +86,9 @@ def render_POST(self, request): request.setResponseCode(429) response_data = {"status": 429, "type": "/limits/over-user-limit"} return json.dumps(response_data).encode() + if domain == "e500.example": + request.setResponseCode(500) + return "" if domain == "e520.example": request.setResponseCode(520) response_data = {"status": 520, "type": "/download/temporary-error"} diff --git a/tests/test_async.py b/tests/test_async.py index 088177c..3f33ce7 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -2,17 +2,13 @@ from unittest.mock import AsyncMock import pytest -from tenacity import AsyncRetrying from zyte_api import AggressiveRetryFactory, AsyncZyteAPI, RequestError -from zyte_api._retry import RetryFactory from zyte_api.aio.client import AsyncClient from zyte_api.apikey import NoApiKey from zyte_api.errors import ParsedError from zyte_api.utils import USER_AGENT -from .mockserver import DropResource, MockServer - @pytest.mark.parametrize( ("client_cls",), @@ -72,46 +68,6 @@ async def test_get(client_cls, get_method, mockserver): assert actual_result == expected_result -UNSET = object() - - -class OutlierException(RuntimeError): - pass - - -@pytest.mark.parametrize( - ("client_cls", "get_method"), - ( - (AsyncZyteAPI, "get"), - (AsyncClient, "request_raw"), - ), -) -@pytest.mark.parametrize( - ("value", "exception"), - ( - (UNSET, OutlierException), - (True, OutlierException), - (False, RequestError), - ), -) -@pytest.mark.asyncio -async def test_get_handle_retries(client_cls, get_method, value, exception, mockserver): - kwargs = 
{} - if value is not UNSET: - kwargs["handle_retries"] = value - - def broken_stop(_): - raise OutlierException - - retrying = AsyncRetrying(stop=broken_stop) - client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) - with pytest.raises(exception): - await getattr(client, get_method)( - {"url": "https://exception.example", "browserHtml": True}, - **kwargs, - ) - - @pytest.mark.parametrize( ("client_cls", "get_method"), ( @@ -234,132 +190,6 @@ async def test_iter(client_cls, iter_method, mockserver): assert actual_result in expected_results -@pytest.mark.parametrize( - ("client_cls", "get_method"), - ( - (AsyncZyteAPI, "get"), - (AsyncClient, "request_raw"), - ), -) -@pytest.mark.parametrize( - ("subdomain", "waiter"), - ( - ("e429", "throttling"), - ("e520", "temporary_download_error"), - ), -) -@pytest.mark.asyncio -async def test_retry_wait(client_cls, get_method, subdomain, waiter, mockserver): - def broken_wait(self, retry_state): - raise OutlierException - - class CustomRetryFactory(RetryFactory): - pass - - setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) - - retrying = CustomRetryFactory().build() - client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) - with pytest.raises(OutlierException): - await getattr(client, get_method)( - {"url": f"https://{subdomain}.example", "browserHtml": True}, - ) - - -@pytest.mark.parametrize( - ("client_cls", "get_method"), - ( - (AsyncZyteAPI, "get"), - (AsyncClient, "request_raw"), - ), -) -@pytest.mark.asyncio -async def test_retry_wait_network_error(client_cls, get_method): - waiter = "network_error" - - def broken_wait(self, retry_state): - raise OutlierException - - class CustomRetryFactory(RetryFactory): - pass - - setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) - - retrying = CustomRetryFactory().build() - with MockServer(resource=DropResource) as mockserver: - client = client_cls( - api_key="a", api_url=mockserver.urljoin("/"), 
retrying=retrying - ) - with pytest.raises(OutlierException): - await getattr(client, get_method)( - {"url": "https://example.com", "browserHtml": True}, - ) - - -@pytest.mark.parametrize( - ("client_cls", "get_method"), - ( - (AsyncZyteAPI, "get"), - (AsyncClient, "request_raw"), - ), -) -@pytest.mark.parametrize( - ("subdomain", "stopper"), - ( - ("e429", "throttling"), - ("e520", "temporary_download_error"), - ), -) -@pytest.mark.asyncio -async def test_retry_stop(client_cls, get_method, subdomain, stopper, mockserver): - def broken_stop(self, retry_state): - raise OutlierException - - class CustomRetryFactory(RetryFactory): - def wait(self, retry_state): - return None - - setattr(CustomRetryFactory, f"{stopper}_stop", broken_stop) - - retrying = CustomRetryFactory().build() - client = client_cls(api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying) - with pytest.raises(OutlierException): - await getattr(client, get_method)( - {"url": f"https://{subdomain}.example", "browserHtml": True}, - ) - - -@pytest.mark.parametrize( - ("client_cls", "get_method"), - ( - (AsyncZyteAPI, "get"), - (AsyncClient, "request_raw"), - ), -) -@pytest.mark.asyncio -async def test_retry_stop_network_error(client_cls, get_method): - stopper = "network_error" - - def broken_stop(self, retry_state): - raise OutlierException - - class CustomRetryFactory(RetryFactory): - def wait(self, retry_state): - return None - - setattr(CustomRetryFactory, f"{stopper}_stop", broken_stop) - - retrying = CustomRetryFactory().build() - with MockServer(resource=DropResource) as mockserver: - client = client_cls( - api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying - ) - with pytest.raises(OutlierException): - await getattr(client, get_method)( - {"url": "https://example.com", "browserHtml": True}, - ) - - @pytest.mark.parametrize( ("client_cls", "get_method", "iter_method"), ( diff --git a/tests/test_retry.py b/tests/test_retry.py index ac8132c..d00e43e 100644 --- 
a/tests/test_retry.py +++ b/tests/test_retry.py @@ -4,8 +4,18 @@ import pytest from aiohttp.client_exceptions import ServerConnectionError +from tenacity import AsyncRetrying -from zyte_api import RequestError, aggressive_retrying, zyte_api_retrying +from zyte_api import ( + AggressiveRetryFactory, + AsyncZyteAPI, + RequestError, + RetryFactory, + aggressive_retrying, + zyte_api_retrying, +) + +from .mockserver import DropResource, MockServer def test_deprecated_imports(): @@ -17,6 +27,100 @@ def test_deprecated_imports(): assert zyte_api_retrying is deprecated_zyte_api_retrying +UNSET = object() + + +class OutlierException(RuntimeError): + pass + + +@pytest.mark.parametrize( + ("value", "exception"), + ( + (UNSET, OutlierException), + (True, OutlierException), + (False, RequestError), + ), +) +@pytest.mark.asyncio +async def test_get_handle_retries(value, exception, mockserver): + kwargs = {} + if value is not UNSET: + kwargs["handle_retries"] = value + + def broken_stop(_): + raise OutlierException + + retrying = AsyncRetrying(stop=broken_stop) + client = AsyncZyteAPI( + api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying + ) + with pytest.raises(exception): + await client.get( + {"url": "https://exception.example", "browserHtml": True}, + **kwargs, + ) + + +@pytest.mark.parametrize( + ("retry_factory", "status", "waiter"), + ( + (RetryFactory, 429, "throttling"), + (RetryFactory, 520, "temporary_download_error"), + (AggressiveRetryFactory, 429, "throttling"), + (AggressiveRetryFactory, 500, "undocumented_error"), + (AggressiveRetryFactory, 520, "download_error"), + ), +) +@pytest.mark.asyncio +async def test_retry_wait(retry_factory, status, waiter, mockserver): + def broken_wait(self, retry_state): + raise OutlierException + + class CustomRetryFactory(retry_factory): + pass + + setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) + retrying = CustomRetryFactory().build() + client = AsyncZyteAPI( + api_key="a", api_url=mockserver.urljoin("/"), 
retrying=retrying + ) + with pytest.raises(OutlierException): + await client.get( + {"url": f"https://e{status}.example", "browserHtml": True}, + ) + + +@pytest.mark.parametrize( + ("retry_factory",), + ( + (RetryFactory,), + (AggressiveRetryFactory,), + ), +) +@pytest.mark.asyncio +async def test_retry_wait_network_error(retry_factory): + waiter = "network_error" + + def broken_wait(self, retry_state): + raise OutlierException + + class CustomRetryFactory(retry_factory): + pass + + setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) + + retrying = CustomRetryFactory().build() + with MockServer(resource=DropResource) as mockserver: + client = AsyncZyteAPI( + api_key="a", api_url=mockserver.urljoin("/"), retrying=retrying + ) + with pytest.raises(OutlierException): + await client.get( + {"url": "https://example.com", "browserHtml": True}, + ) + + def mock_request_error(*, status=200): return RequestError( history=None, @@ -335,9 +439,7 @@ def __init__(self, time): ) @pytest.mark.asyncio @patch("time.monotonic") -async def test_retrying(monotonic_mock, retrying, outcomes, exhausted): - """Test retry stops based on a number of attempts (as opposed to those - based on time passed).""" +async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): monotonic_mock.return_value = 0 last_outcome = outcomes[-1] outcomes = deque(outcomes) From b5b55b1c921471823dd481d17db751d15ec01665 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 May 2024 13:10:50 +0200 Subject: [PATCH 113/126] =?UTF-8?q?undocumented=5Ferror=5Fstop:=20stop=5Fo?= =?UTF-8?q?n=5Funinterrupted=5Fstatus=20=E2=86=92=20stop=5Fon=5Fcount?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- zyte_api/__init__.py | 1 - zyte_api/_retry.py | 28 +++------------------------- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 95ce4a7..b70297f 100644 --- 
a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -10,7 +10,6 @@ stop_after_uninterrupted_delay, stop_on_count, stop_on_download_error, - stop_on_uninterrupted_status, ) from ._retry import zyte_api_retrying as _zyte_api_retrying from ._sync import ZyteAPI diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index a44122a..52c24f4 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -260,28 +260,6 @@ def __call__(self, retry_state: "RetryCallState") -> bool: return False -class stop_on_uninterrupted_status(stop_base): - """Stop after the specified max number of error responses with the same - status code in a row.""" - - def __init__(self, _max: int) -> None: - self._max = _max - - def __call__(self, retry_state: "RetryCallState") -> bool: - assert retry_state.outcome, "Unexpected empty outcome" - exc = retry_state.outcome.exception() - assert exc, "Unexpected empty exception" - count = 0 - for status in reversed(retry_state.status_history): # type: ignore - if status == exc.status: # type: ignore - count += 1 - if count >= self._max: - return True - elif status not in {-1, 429, 503}: - return False - return False - - class AggressiveRetryFactory(RetryFactory): """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`aggressive retry policy `. 
@@ -298,15 +276,15 @@ class AggressiveRetryFactory(RetryFactory): from zyte_api import ( AggressiveRetryFactory, stop_after_uninterrupted_delay, + stop_on_count, stop_on_download_error, - stop_on_uninterrupted_status, ) class CustomRetryFactory(AggressiveRetryFactory): download_error_stop = stop_on_download_error(max_total=16, max_permanent=8) network_error_stop = stop_after_uninterrupted_delay(30 * 60) - undocumented_error_stop = stop_on_uninterrupted_status(8) + undocumented_error_stop = stop_on_count(8) CUSTOM_RETRY_POLICY = CustomRetryFactory().build() @@ -321,7 +299,7 @@ class CustomRetryFactory(AggressiveRetryFactory): download_error_stop = stop_on_download_error(max_total=8, max_permanent=4) download_error_wait = RetryFactory.temporary_download_error_wait - undocumented_error_stop = stop_on_uninterrupted_status(4) + undocumented_error_stop = stop_on_count(4) undocumented_error_wait = RetryFactory.temporary_download_error_wait def stop(self, retry_state: RetryCallState) -> bool: From 1e791d5963fbf2b802b8b438f0cefb4d18ddcbc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 May 2024 13:13:25 +0200 Subject: [PATCH 114/126] Update the docs --- docs/use/api.rst | 4 +--- zyte_api/__init__.py | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index 435a2a4..7d05a47 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -173,9 +173,7 @@ policy as follows: - Retries permanent download errors up to 3 times. - Retries error responses with an HTTP status code in the 500-599 range (503, - 520 and 521 excluded) until they have happened 4 times in a row, not - counting rate-limiting responses or network errors that may happen in - between. + 520 and 521 excluded) up to 3 times. 
Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index b70297f..1f97fd2 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -20,8 +20,5 @@ #: :ref:`Default retry policy `. zyte_api_retrying = _zyte_api_retrying -#: Alternative :ref:`retry policy ` that builds on top of -#: :data:`zyte_api_retrying`, but increases the number of attempts for -#: temporary download errors from 4 to 16, and retries as temporary download -#: errors any 5xx HTTP status code other than 503 (retried as rate-limiting). +#: :ref:`Aggresive retry policy `. aggressive_retrying = _aggressive_retrying From 5638ce918d8a186d115add4343e1150c1a94826e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 15 May 2024 13:15:47 +0200 Subject: [PATCH 115/126] Update test expectations --- tests/test_retry.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index d00e43e..ef8f2d0 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -376,9 +376,7 @@ def __init__(self, time): (*(mock_request_error(status=521),) * 4,), True, ), - # Undocumented 5xx errors are retried until they have happened - # 4 times in a row, not counting rate-limiting responses or - # network errors. + # Undocumented 5xx errors are retried up to 3 times. 
*( scenario for status in ( @@ -419,7 +417,7 @@ def __init__(self, time): ( mock_request_error(status=status), mock_request_error(status=555), - *(mock_request_error(status=status),) * 3, + mock_request_error(status=status), ), False, ), @@ -427,7 +425,7 @@ def __init__(self, time): ( mock_request_error(status=status), mock_request_error(status=555), - *(mock_request_error(status=status),) * 4, + *(mock_request_error(status=status),) * 2, ), True, ), From d87c0e6ec902c784f09f3cf83d5ec1a95075730a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 28 May 2024 16:54:49 +0200 Subject: [PATCH 116/126] Remove leftovers --- zyte_api/_retry.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 52c24f4..04b16b1 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -306,9 +306,6 @@ def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" exc = retry_state.outcome.exception() assert exc, "Unexpected empty exception" - if not hasattr(retry_state, "status_history"): - retry_state.status_history = [] # type: ignore - retry_state.status_history.append(getattr(exc, "status", -1)) # type: ignore if _download_error(exc): return self.download_error_stop(retry_state) if _undocumented_error(exc): From 9ca2fc50b8b28b34689ca803eac45d7084f81be9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 29 May 2024 15:49:34 +0200 Subject: [PATCH 117/126] Release notes for 0.6.0 (#75) --- CHANGES.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index e38dd9d..55e4c49 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -1,6 +1,27 @@ Changes ======= +0.6.0 (2024-05-29) +------------------ + +* Improved how the :ref:`default retry policy ` handles + :ref:`temporary download errors `. + Before, 3 HTTP 429 responses followed by a single HTTP 520 response would + have prevented a retry. 
Now, unrelated responses and errors do not count + towards the HTTP 520 retry limit. + +* Improved how the :ref:`default retry policy ` handles + network errors. Before, after 15 minutes of unsuccessful responses (e.g. HTTP + 429), any network error would prevent a retry. Now, network errors must happen + 15 minutes in a row, without different errors in between, to stop retries. + +* Implemented an optional :ref:`aggressive retry policy + `, which retries more errors more often, and could + be useful for long crawls or websites with a low success rate. + +* Improved the exception that is raised when passing an invalid retrying policy + object to a :ref:`Python client `. + 0.5.2 (2024-05-10) ------------------ From b25fb2b8308bf5472c9ce21a040671df8a983eb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Wed, 29 May 2024 15:50:03 +0200 Subject: [PATCH 118/126] =?UTF-8?q?Bump=20version:=200.5.2=20=E2=86=92=200?= =?UTF-8?q?.6.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- docs/conf.py | 2 +- zyte_api/__version__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 30ebee3..cecb753 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.5.2 +current_version = 0.6.0 commit = True tag = True tag_name = {new_version} diff --git a/docs/conf.py b/docs/conf.py index f52aa22..8ed5574 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -28,7 +28,7 @@ # The short X.Y version version = "" # The full version, including alpha/beta/rc tags -release = "0.5.2" +release = "0.6.0" # -- General configuration --------------------------------------------------- diff --git a/zyte_api/__version__.py b/zyte_api/__version__.py index 7225152..906d362 100644 --- a/zyte_api/__version__.py +++ b/zyte_api/__version__.py @@ -1 +1 @@ -__version__ = "0.5.2" +__version__ = "0.6.0" From 
011215d793c0eedfe0b83bffa101cdfa03699b9a Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 14 Oct 2024 19:13:42 +0500 Subject: [PATCH 119/126] Update Python versions. --- .github/workflows/publish.yml | 2 +- .github/workflows/test.yml | 4 ++-- .readthedocs.yml | 2 +- README.rst | 2 +- setup.py | 4 ++-- tox.ini | 2 +- zyte_api/_async.py | 4 ++-- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 240ffca..9fab21b 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.13' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d0ce2d..2a28888 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.8', '3.9', '3.10', '3.11'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v2 @@ -42,7 +42,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.11'] + python-version: ['3.13'] tox-job: ["mypy", "docs"] steps: diff --git a/.readthedocs.yml b/.readthedocs.yml index de19d2d..5332597 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -7,7 +7,7 @@ build: tools: # For available versions, see: # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python - python: "3.11" # Keep in sync with .github/workflows/test.yml + python: "3.13" # Keep in sync with .github/workflows/test.yml python: install: - requirements: docs/requirements.txt diff --git a/README.rst b/README.rst index 4035e57..e52e951 100644 --- a/README.rst +++ b/README.rst @@ -35,7 +35,7 @@ Installation pip install zyte-api -.. note:: Python 3.8+ is required. +.. note:: Python 3.9+ is required. .. 
install-end diff --git a/setup.py b/setup.py index 310eb5e..612d6d4 100755 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python import os from setuptools import find_packages, setup @@ -41,9 +40,10 @@ def get_version(): "Natural Language :: English", "Operating System :: OS Independent", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", ], ) diff --git a/tox.ini b/tox.ini index 21023e8..c3320ef 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py38,py39,py310,py311,mypy,docs,twine +envlist = py39,py310,py311,py312,py313,mypy,docs,twine [testenv] deps = diff --git a/zyte_api/_async.py b/zyte_api/_async.py index 261277f..afa20ff 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import asyncio import time from asyncio import Future @@ -17,8 +19,6 @@ if TYPE_CHECKING: _ResponseFuture = Future[Dict[str, Any]] -else: - _ResponseFuture = Future # Python 3.8 support def _post_func(session): From c69fe9e4c1e8525152d8c285bdf695d9d2a302bd Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Mon, 14 Oct 2024 19:21:19 +0500 Subject: [PATCH 120/126] Bump tool versions. 
--- .github/workflows/publish.yml | 4 ++-- .github/workflows/test.yml | 8 ++++---- .pre-commit-config.yaml | 10 +++++----- docs/conf.py | 9 +-------- pyproject.toml | 2 +- tests/test_main.py | 7 ++++--- tox.ini | 13 ++++++++----- 7 files changed, 25 insertions(+), 28 deletions(-) diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 9fab21b..820c29d 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -13,9 +13,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: '3.13' - name: Install dependencies diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2a28888..a844809 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -19,9 +19,9 @@ jobs: python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies @@ -46,9 +46,9 @@ jobs: tox-job: ["mypy", "docs"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - name: Install dependencies diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c5420cd..c2700c5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,19 +1,19 @@ repos: - repo: https://github.com/PyCQA/isort - rev: 5.13.1 + rev: 5.13.2 hooks: - id: isort - repo: https://github.com/psf/black - rev: 24.3.0 + rev: 24.10.0 hooks: - id: black - repo: https://github.com/pycqa/flake8 - rev: 7.0.0 + rev: 7.1.1 hooks: - id: flake8 - repo: 
https://github.com/adamchainz/blacken-docs - rev: 1.16.0 + rev: 1.19.0 hooks: - id: blacken-docs additional_dependencies: - - black==24.3.0 + - black==24.10.0 diff --git a/docs/conf.py b/docs/conf.py index 8ed5574..a652286 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,8 +14,6 @@ import sys from pathlib import Path -import sphinx_rtd_theme - sys.path.insert(0, os.path.abspath("../")) @@ -58,7 +56,7 @@ # You can specify multiple suffix as a list of string: # # source_suffix = ['.rst', '.md'] -source_suffix = ".rst" +source_suffix = {".rst": "restructuredtext"} # The master toctree document. master_doc = "index" @@ -86,11 +84,6 @@ # html_theme = "sphinx_rtd_theme" -# Add any paths that contain custom themes here, relative to this directory. -# Add path to the RTD explicitly to robustify builds (otherwise might -# fail in a clean Debian build env) -html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] - # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. 
diff --git a/pyproject.toml b/pyproject.toml index 830e253..c6b28cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,4 +3,4 @@ profile = "black" multi_line_output = 3 [tool.black] -target-version = ["py38", "py39", "py310", "py311", "py312"] +target-version = ["py39", "py310", "py311", "py312", "py313"] diff --git a/tests/test_main.py b/tests/test_main.py index 9fbb3bd..7e3c8d8 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -109,9 +109,10 @@ async def test_run(queries, expected_response, store_errors, exception): async_client_mock.return_value.iter = request_parallel_mock # Patch the AsyncZyteAPI class in __main__ with the mock - with patch("zyte_api.__main__.AsyncZyteAPI", async_client_mock), patch( - "zyte_api.__main__.create_session" - ) as create_session_mock: + with ( + patch("zyte_api.__main__.AsyncZyteAPI", async_client_mock), + patch("zyte_api.__main__.create_session") as create_session_mock, + ): # Mock create_session to return an AsyncMock create_session_mock.return_value = AsyncMock() diff --git a/tox.ini b/tox.ini index c3320ef..33663b7 100644 --- a/tox.ini +++ b/tox.ini @@ -19,9 +19,12 @@ commands = [testenv:mypy] deps = - mypy==0.982 + mypy==1.12.0 + pytest==8.3.3 + Twisted==24.7.0 + types-tqdm==4.66.0.20240417 -commands = mypy --ignore-missing-imports --no-warn-no-return \ +commands = mypy --ignore-missing-imports \ zyte_api \ tests @@ -39,8 +42,8 @@ commands = pre-commit run --all-files --show-diff-on-failure [testenv:twine] deps = - twine==4.0.2 - build==1.0.3 + twine==5.1.1 + build==1.2.2 commands = - python setup.py sdist + python -m build --sdist twine check dist/* From 4132cedae1cb402fc89b1f25c15801c067b857b2 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Tue, 15 Oct 2024 16:10:59 +0500 Subject: [PATCH 121/126] Roll back RTD Python --- .github/workflows/test.yml | 2 +- .readthedocs.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 
a844809..b056a24 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -42,7 +42,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ['3.13'] + python-version: ['3.12'] # Keep in sync with .readthedocs.yml tox-job: ["mypy", "docs"] steps: diff --git a/.readthedocs.yml b/.readthedocs.yml index 5332597..f81f402 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -7,7 +7,7 @@ build: tools: # For available versions, see: # https://docs.readthedocs.io/en/stable/config-file/v2.html#build-tools-python - python: "3.13" # Keep in sync with .github/workflows/test.yml + python: "3.12" # Keep in sync with .github/workflows/test.yml python: install: - requirements: docs/requirements.txt From b95d703fd9c4f607d7a424797fc17b056ef22945 Mon Sep 17 00:00:00 2001 From: Andrey Rakhmatullin Date: Wed, 16 Oct 2024 14:13:18 +0500 Subject: [PATCH 122/126] Update docs referebcesto docs.zyte.com. --- CHANGES.rst | 2 +- docs/use/api.rst | 14 +++++++------- docs/use/cli.rst | 10 +++++----- zyte_api/_errors.py | 4 ++-- zyte_api/_retry.py | 2 +- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/CHANGES.rst b/CHANGES.rst index 55e4c49..fb5fff1 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -5,7 +5,7 @@ Changes ------------------ * Improved how the :ref:`default retry policy ` handles - :ref:`temporary download errors `. + :ref:`temporary download errors `. Before, 3 HTTP 429 responses followed by a single HTTP 520 response would have prevented a retry. Now, unrelated responses and errors do not count towards the HTTP 520 retry limit. diff --git a/docs/use/api.rst b/docs/use/api.rst index 7d05a47..8e3d34e 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -134,15 +134,15 @@ The number of concurrent connections if enforced across all method calls, including different sessions of the same client. For guidelines on how to choose the optimal value for you, and other -optimization tips, see :ref:`zyte-api-optimize`. 
+optimization tips, see :ref:`zapi-optimize`. Errors and retries ================== Methods of :class:`ZyteAPI` and :class:`AsyncZyteAPI` automatically handle -retries for :ref:`rate-limiting ` and :ref:`unsuccessful -` responses, as well as network errors. +retries for :ref:`rate-limiting ` and :ref:`unsuccessful +` responses, as well as network errors. .. _retry-policy: .. _default-retry-policy: @@ -150,10 +150,10 @@ retries for :ref:`rate-limiting ` and :ref:`unsuccessful The default retry policy, :data:`~zyte_api.zyte_api_retrying`, does the following: -- Retries :ref:`rate-limiting responses ` forever. +- Retries :ref:`rate-limiting responses ` forever. - Retries :ref:`temporary download errors - ` up to 3 times. + ` up to 3 times. - Retries network errors until they have happened for 15 minutes straight. @@ -161,13 +161,13 @@ All retries are done with an exponential backoff algorithm. .. _aggressive-retry-policy: -If some :ref:`unsuccessful responses ` exceed +If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using :data:`~zyte_api.aggressive_retrying` instead, which modifies the default retry policy as follows: - Temporary download error are retried 7 times. :ref:`Permanent download - errors ` also count towards this retry + errors ` also count towards this retry limit. - Retries permanent download errors up to 3 times. diff --git a/docs/use/cli.rst b/docs/use/cli.rst index 49f4a9a..abf2479 100644 --- a/docs/use/cli.rst +++ b/docs/use/cli.rst @@ -33,7 +33,7 @@ The input file can be either of the following: :http:`request:browserHtml` set to ``True``. - A `JSON Lines `_ file with a object of :ref:`Zyte - API request parameters ` per line. For example: + API request parameters ` per line. For example: .. 
code-block:: json @@ -84,19 +84,19 @@ order and hence distribute the load somewhat evenly: zyte-api urls.txt --shuffle … For guidelines on how to choose the optimal ``--n-conn`` value for you, and -other optimization tips, see :ref:`zyte-api-optimize`. +other optimization tips, see :ref:`zapi-optimize`. Errors and retries ================== ``zyte-api`` automatically handles retries for :ref:`rate-limiting -` and :ref:`unsuccessful -` responses, as well as network errors, +` and :ref:`unsuccessful +` responses, as well as network errors, following the :ref:`default retry policy `. Use ``--dont-retry-errors`` to disable the retrying of error responses, and -retrying only :ref:`rate-limiting responses `: +retrying only :ref:`rate-limiting responses `: .. code-block:: shell diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index ea589ae..6476c39 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -10,8 +10,8 @@ class RequestError(ClientResponseError): """Exception raised upon receiving a :ref:`rate-limiting - ` or :ref:`unsuccessful - ` response from Zyte API.""" + ` or :ref:`unsuccessful + ` response from Zyte API.""" def __init__(self, *args, **kwargs): #: Query sent to Zyte API. diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 04b16b1..3b01c0e 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -137,7 +137,7 @@ class RetryFactory: the corresponding :class:`tenacity.AsyncRetrying` object. For example, to double the number of attempts for :ref:`temporary - download errors ` and the time network + download errors ` and the time network errors are retried: .. 
code-block:: python From 9d60a9ac3045f6279166117eed154f9f38cc49b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Mon, 30 Dec 2024 17:25:07 +0100 Subject: [PATCH 123/126] Make the default retry policy the aggressive one with half the attempts --- docs/use/api.rst | 25 +++++----- tests/test_retry.py | 115 +++++++++++++++----------------------------- zyte_api/_retry.py | 97 ++++++++++++++++++------------------- 3 files changed, 98 insertions(+), 139 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index 8e3d34e..c6983bf 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -148,32 +148,29 @@ retries for :ref:`rate-limiting ` and :ref:`unsuccessful .. _default-retry-policy: The default retry policy, :data:`~zyte_api.zyte_api_retrying`, does the -following: +following for each request: - Retries :ref:`rate-limiting responses ` forever. -- Retries :ref:`temporary download errors - ` up to 3 times. +- Retries :ref:`temporary download errors ` + up to 3 times. :ref:`Permanent download errors + ` also count towards this retry limit. + +- Retries permanent download errors up to 3 times per request. - Retries network errors until they have happened for 15 minutes straight. +- Retries error responses with an HTTP status code in the 500-599 range (503, + 520 and 521 excluded) up to 3 times. + All retries are done with an exponential backoff algorithm. .. _aggressive-retry-policy: If some :ref:`unsuccessful responses ` exceed maximum retries with the default retry policy, try using -:data:`~zyte_api.aggressive_retrying` instead, which modifies the default retry -policy as follows: - -- Temporary download error are retried 7 times. :ref:`Permanent download - errors ` also count towards this retry - limit. - -- Retries permanent download errors up to 3 times. - -- Retries error responses with an HTTP status code in the 500-599 range (503, - 520 and 521 excluded) up to 3 times. 
+:data:`~zyte_api.aggressive_retrying` instead, which duplicates attempts for +all retry scenarios. Alternatively, the reference documentation of :class:`~zyte_api.RetryFactory` and :class:`~zyte_api.AggressiveRetryFactory` features some examples of custom diff --git a/tests/test_retry.py b/tests/test_retry.py index ef8f2d0..86e0293 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -140,6 +140,15 @@ def __init__(self, time): self.time = time +class scale: + + def __init__(self, factor): + self.factor = factor + + def __call__(self, number, add=0): + return int(number * self.factor) + add + + @pytest.mark.parametrize( ("retrying", "outcomes", "exhausted"), ( @@ -237,81 +246,36 @@ def __init__(self, time): ), ) ), - # Behaviors specific to the default retry policy + # Scaled behaviors, where the default retry policy uses half as many + # attempts as the aggressive retry policy. *( - (zyte_api_retrying, outcomes, exhausted) - for outcomes, exhausted in ( - # Temporary download errors are retried until they have - # happened 4 times in total. - ( - (mock_request_error(status=520),) * 3, - False, - ), - ( - (mock_request_error(status=520),) * 4, - True, - ), - ( - ( - *(mock_request_error(status=429),) * 2, - mock_request_error(status=520), - ), - False, - ), - ( - ( - *(mock_request_error(status=429),) * 3, - mock_request_error(status=520), - ), - False, - ), - ( - ( - *( - mock_request_error(status=429), - mock_request_error(status=520), - ) - * 3, - ), - False, - ), - ( - ( - *( - mock_request_error(status=429), - mock_request_error(status=520), - ) - * 4, - ), - True, - ), + (retrying, outcomes, exhausted) + for retrying, scaled in ( + (zyte_api_retrying, scale(0.5)), + (aggressive_retrying, scale(1)), ) - ), - # Behaviors specific to the aggressive retry policy - *( - (aggressive_retrying, outcomes, exhausted) for outcomes, exhausted in ( # Temporary download errors are retried until they have - # happened 8 times in total. 
Permanent download errors also - # count towards that limit. + # happened 8*factor times in total. Permanent download errors + # also count towards that limit. ( - (mock_request_error(status=520),) * 7, + (mock_request_error(status=520),) * scaled(8, -1), False, ), ( - (mock_request_error(status=520),) * 8, + (mock_request_error(status=520),) * scaled(8), True, ), ( ( - *(mock_request_error(status=429),) * 6, + *(mock_request_error(status=429),) * scaled(8, -2), mock_request_error(status=520), ), False, ), ( ( - *(mock_request_error(status=429),) * 7, + *(mock_request_error(status=429),) * scaled(8, -1), mock_request_error(status=520), ), False, @@ -322,7 +286,7 @@ def __init__(self, time): mock_request_error(status=429), mock_request_error(status=520), ) - * 7, + * scaled(8, -1), ), False, ), @@ -332,13 +296,13 @@ def __init__(self, time): mock_request_error(status=429), mock_request_error(status=520), ) - * 8, + * scaled(8), ), True, ), ( ( - *(mock_request_error(status=520),) * 5, + *(mock_request_error(status=520),) * scaled(8, -3), *(mock_request_error(status=521),) * 1, *(mock_request_error(status=520),) * 1, ), @@ -346,7 +310,7 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=520),) * scaled(8, -2), *(mock_request_error(status=521),) * 1, *(mock_request_error(status=520),) * 1, ), @@ -354,29 +318,30 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=520),) * 6, + *(mock_request_error(status=520),) * scaled(8, -2), *(mock_request_error(status=521),) * 1, ), False, ), ( ( - *(mock_request_error(status=520),) * 7, + *(mock_request_error(status=520),) * scaled(8, -1), *(mock_request_error(status=521),) * 1, ), True, ), # Permanent download errors are retried until they have - # happened 4 times in total. + # happened 4*factor times in total. 
( - (*(mock_request_error(status=521),) * 3,), + (*(mock_request_error(status=521),) * scaled(4, -1),), False, ), ( - (*(mock_request_error(status=521),) * 4,), + (*(mock_request_error(status=521),) * scaled(4),), True, ), - # Undocumented 5xx errors are retried up to 3 times. + # Undocumented 5xx errors are retried until they have happened + # 4*factor times. *( scenario for status in ( @@ -386,16 +351,16 @@ def __init__(self, time): ) for scenario in ( ( - (*(mock_request_error(status=status),) * 3,), + (*(mock_request_error(status=status),) * scaled(4, -1),), False, ), ( - (*(mock_request_error(status=status),) * 4,), + (*(mock_request_error(status=status),) * scaled(4),), True, ), ( ( - *(mock_request_error(status=status),) * 2, + *(mock_request_error(status=status),) * scaled(4, -2), mock_request_error(status=429), mock_request_error(status=503), ServerConnectionError(), @@ -405,7 +370,7 @@ def __init__(self, time): ), ( ( - *(mock_request_error(status=status),) * 3, + *(mock_request_error(status=status),) * scaled(4, -1), mock_request_error(status=429), mock_request_error(status=503), ServerConnectionError(), @@ -415,17 +380,15 @@ def __init__(self, time): ), ( ( - mock_request_error(status=status), mock_request_error(status=555), - mock_request_error(status=status), + *(mock_request_error(status=status),) * scaled(4, -2), ), False, ), ( ( - mock_request_error(status=status), mock_request_error(status=555), - *(mock_request_error(status=status),) * 2, + *(mock_request_error(status=status),) * scaled(4, -1), ), True, ), @@ -464,7 +427,7 @@ async def run(): try: await run() except Exception as outcome: - assert exhausted + assert exhausted, outcome assert outcome is last_outcome else: assert not exhausted diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 3b01c0e..90bdcad 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -54,10 +54,6 @@ def _is_throttling_error(exc: BaseException) -> bool: return isinstance(exc, RequestError) and exc.status in 
(429, 503) -def _is_temporary_download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status == 520 - - class stop_on_count(stop_base): """Keep a call count with the specified counter name, and stop after the specified number os calls. @@ -128,6 +124,42 @@ def __call__(self, retry_state: "RetryCallState") -> bool: return True +class stop_on_download_error(stop_base): + """Stop after the specified max numbers of total or permanent download + errors.""" + + def __init__(self, max_total: int, max_permanent: int) -> None: + self._max_total = max_total + self._max_permanent = max_permanent + + def __call__(self, retry_state: "RetryCallState") -> bool: + if not hasattr(retry_state, "counter"): + retry_state.counter = Counter() # type: ignore + assert retry_state.outcome, "Unexpected empty outcome" + exc = retry_state.outcome.exception() + assert exc, "Unexpected empty exception" + if exc.status == 521: # type: ignore + retry_state.counter["permanent_download_error"] += 1 # type: ignore + if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore + return True + retry_state.counter["download_error"] += 1 # type: ignore + if retry_state.counter["download_error"] >= self._max_total: # type: ignore + return True + return False + + +def _download_error(exc: BaseException) -> bool: + return isinstance(exc, RequestError) and exc.status in {520, 521} + + +def _undocumented_error(exc: BaseException) -> bool: + return ( + isinstance(exc, RequestError) + and exc.status >= 500 + and exc.status not in {503, 520, 521} + ) + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`default retry policy `. 
@@ -160,7 +192,8 @@ class CustomRetryFactory(RetryFactory): retry_condition: retry_base = ( retry_if_exception(_is_throttling_error) | retry_if_exception(_is_network_error) - | retry_if_exception(_is_temporary_download_error) + | retry_if_exception(_download_error) + | retry_if_exception(_undocumented_error) ) # throttling throttling_wait = wait_chain( @@ -182,7 +215,10 @@ class CustomRetryFactory(RetryFactory): temporary_download_error_wait = network_error_wait throttling_stop = stop_never network_error_stop = stop_after_uninterrupted_delay(15 * 60) - temporary_download_error_stop = stop_on_count(4) + temporary_download_error_stop = stop_on_download_error(max_total=4, max_permanent=2) + + undocumented_error_stop = stop_on_count(2) + undocumented_error_wait = network_error_wait def wait(self, retry_state: RetryCallState) -> float: assert retry_state.outcome, "Unexpected empty outcome" @@ -192,7 +228,9 @@ def wait(self, retry_state: RetryCallState) -> float: return self.throttling_wait(retry_state=retry_state) if _is_network_error(exc): return self.network_error_wait(retry_state=retry_state) - assert _is_temporary_download_error(exc) # See retry_condition + if _undocumented_error(exc): + return self.undocumented_error_wait(retry_state=retry_state) + assert _download_error(exc) # See retry_condition return self.temporary_download_error_wait(retry_state=retry_state) def stop(self, retry_state: RetryCallState) -> bool: @@ -203,7 +241,9 @@ def stop(self, retry_state: RetryCallState) -> bool: return self.throttling_stop(retry_state) if _is_network_error(exc): return self.network_error_stop(retry_state) - assert _is_temporary_download_error(exc) # See retry_condition + if _undocumented_error(exc): + return self.undocumented_error_stop(retry_state) + assert _download_error(exc) # See retry_condition return self.temporary_download_error_stop(retry_state) def reraise(self) -> bool: @@ -224,42 +264,6 @@ def build(self) -> AsyncRetrying: zyte_api_retrying: AsyncRetrying = 
RetryFactory().build() -def _download_error(exc: BaseException) -> bool: - return isinstance(exc, RequestError) and exc.status in {520, 521} - - -def _undocumented_error(exc: BaseException) -> bool: - return ( - isinstance(exc, RequestError) - and exc.status >= 500 - and exc.status not in {503, 520, 521} - ) - - -class stop_on_download_error(stop_base): - """Stop after the specified max numbers of total or permanent download - errors.""" - - def __init__(self, max_total: int, max_permanent: int) -> None: - self._max_total = max_total - self._max_permanent = max_permanent - - def __call__(self, retry_state: "RetryCallState") -> bool: - if not hasattr(retry_state, "counter"): - retry_state.counter = Counter() # type: ignore - assert retry_state.outcome, "Unexpected empty outcome" - exc = retry_state.outcome.exception() - assert exc, "Unexpected empty exception" - if exc.status == 521: # type: ignore - retry_state.counter["permanent_download_error"] += 1 # type: ignore - if retry_state.counter["permanent_download_error"] >= self._max_permanent: # type: ignore - return True - retry_state.counter["download_error"] += 1 # type: ignore - if retry_state.counter["download_error"] >= self._max_total: # type: ignore - return True - return False - - class AggressiveRetryFactory(RetryFactory): """Factory class that builds the :class:`tenacity.AsyncRetrying` object that defines the :ref:`aggressive retry policy `. 
@@ -300,7 +304,6 @@ class CustomRetryFactory(AggressiveRetryFactory): download_error_wait = RetryFactory.temporary_download_error_wait undocumented_error_stop = stop_on_count(4) - undocumented_error_wait = RetryFactory.temporary_download_error_wait def stop(self, retry_state: RetryCallState) -> bool: assert retry_state.outcome, "Unexpected empty outcome" @@ -308,8 +311,6 @@ def stop(self, retry_state: RetryCallState) -> bool: assert exc, "Unexpected empty exception" if _download_error(exc): return self.download_error_stop(retry_state) - if _undocumented_error(exc): - return self.undocumented_error_stop(retry_state) return super().stop(retry_state) def wait(self, retry_state: RetryCallState) -> float: @@ -318,8 +319,6 @@ def wait(self, retry_state: RetryCallState) -> float: assert exc, "Unexpected empty exception" if _download_error(exc): return self.download_error_wait(retry_state) - if _undocumented_error(exc): - return self.undocumented_error_wait(retry_state=retry_state) return super().wait(retry_state) From d052ed144eb630ad9e39826613a83982a22e1d84 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Jan 2025 15:51:31 +0100 Subject: [PATCH 124/126] Initial stab at circuit break for undocumented errors --- docs/use/api.rst | 3 ++ pyproject.toml | 3 ++ tests/mockserver.py | 9 +++-- tests/test_async.py | 24 ++++++++++- tests/test_main.py | 16 +++----- tests/test_retry.py | 95 +++++++++++++++++++++++++++++++++++++++++++- zyte_api/__init__.py | 2 +- zyte_api/__main__.py | 7 ++-- zyte_api/_async.py | 10 ++++- zyte_api/_errors.py | 4 +- zyte_api/_retry.py | 77 ++++++++++++++++++++++++++++++++--- zyte_api/stats.py | 6 +-- 12 files changed, 223 insertions(+), 33 deletions(-) diff --git a/docs/use/api.rst b/docs/use/api.rst index c6983bf..c9419bd 100644 --- a/docs/use/api.rst +++ b/docs/use/api.rst @@ -163,6 +163,9 @@ following for each request: - Retries error responses with an HTTP status code in the 500-599 range (503, 520 and 521 excluded) up to
3 times. +- Disallows new requests if undocumented error responses are at least 10 + *and* at least 1% of all responses. + All retries are done with an exponential backoff algorithm. .. _aggressive-retry-policy: diff --git a/pyproject.toml b/pyproject.toml index c6b28cc..e882810 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,3 +4,6 @@ multi_line_output = 3 [tool.black] target-version = ["py39", "py310", "py311", "py312", "py313"] + +[tool.mypy] +check_untyped_defs = true diff --git a/tests/mockserver.py b/tests/mockserver.py index 023b72f..ce84138 100644 --- a/tests/mockserver.py +++ b/tests/mockserver.py @@ -6,10 +6,12 @@ from base64 import b64encode from importlib import import_module from subprocess import PIPE, Popen -from typing import Any, Dict +from typing import Any, Dict, cast from urllib.parse import urlparse from twisted.internet import reactor +from twisted.internet.defer import Deferred +from twisted.internet.interfaces import IReactorTime from twisted.internet.task import deferLater from twisted.web.resource import Resource from twisted.web.server import NOT_DONE_YET, Site @@ -40,7 +42,7 @@ def _cancelrequest(_): d.addErrback(lambda _: None) d.cancel() - d = deferLater(reactor, delay, f, *a, **kw) + d: Deferred = deferLater(cast(IReactorTime, reactor), delay, f, *a, **kw) request.notifyFinish().addErrback(_cancelrequest) return d @@ -82,6 +84,7 @@ def render_POST(self, request): url = request_data["url"] domain = urlparse(url).netloc + response_data: Dict[str, Any] if domain == "e429.example": request.setResponseCode(429) response_data = {"status": 429, "type": "/limits/over-user-limit"} @@ -119,7 +122,7 @@ def render_POST(self, request): request.setResponseCode(500) return b'["foo"]' - response_data: Dict[str, Any] = { + response_data = { "url": url, } diff --git a/tests/test_async.py b/tests/test_async.py index 3f33ce7..771c0f1 100644 --- a/tests/test_async.py +++ b/tests/test_async.py @@ -3,7 +3,13 @@ import pytest -from zyte_api import
AggressiveRetryFactory, AsyncZyteAPI, RequestError +from zyte_api import ( + AggressiveRetryFactory, + AsyncZyteAPI, + RequestError, + TooManyUndocumentedErrors, +) +from zyte_api._retry import ZyteAsyncRetrying from zyte_api.aio.client import AsyncClient from zyte_api.apikey import NoApiKey from zyte_api.errors import ParsedError @@ -318,4 +324,18 @@ def test_retrying_class(): """A descriptive exception is raised when creating a client with an AsyncRetrying subclass or similar instead of an instance of it.""" with pytest.raises(ValueError): - AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) + AsyncZyteAPI(api_key="foo", retrying=AggressiveRetryFactory) # type: ignore[arg-type] + + +@pytest.mark.asyncio +async def test_too_many_undocumented_errors(mockserver): + ZyteAsyncRetrying._total_outcomes = 9 + ZyteAsyncRetrying._total_undocumented_errors = 9 + + client = AsyncZyteAPI(api_key="a", api_url=mockserver.urljoin("/")) + + await client.get({"url": "https://a.example", "httpResponseBody": True}) + with pytest.raises(TooManyUndocumentedErrors): + await client.get({"url": "https://e500.example", "httpResponseBody": True}) + with pytest.raises(TooManyUndocumentedErrors): + await client.get({"url": "https://a.example", "httpResponseBody": True}) diff --git a/tests/test_main.py b/tests/test_main.py index 7e3c8d8..d8c4df2 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -11,15 +11,6 @@ from zyte_api.aio.errors import RequestError -class MockRequestError(Exception): - @property - def parsed(self): - mock = Mock( - response_body=Mock(decode=Mock(return_value=forbidden_domain_response())) - ) - return mock - - def get_json_content(file_object): if not file_object: return @@ -53,7 +44,12 @@ def forbidden_domain_response(): async def fake_exception(value=True): # Simulating an error condition if value: - raise MockRequestError() + raise RequestError( + query={"url": "https://example.com", "httpResponseBody": True}, + 
response_content=json.dumps(forbidden_domain_response()).encode(), + request_info=None, + history=None, + ) create_session_mock = AsyncMock() return await create_session_mock.coroutine() diff --git a/tests/test_retry.py b/tests/test_retry.py index 86e0293..a86fc9b 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -11,13 +11,20 @@ AsyncZyteAPI, RequestError, RetryFactory, + TooManyUndocumentedErrors, aggressive_retrying, zyte_api_retrying, ) +from zyte_api._retry import ZyteAsyncRetrying from .mockserver import DropResource, MockServer +def reset_totals(): + ZyteAsyncRetrying._total_outcomes = 0 + ZyteAsyncRetrying._total_undocumented_errors = 0 + + def test_deprecated_imports(): from zyte_api import RetryFactory, zyte_api_retrying from zyte_api.aio.retry import RetryFactory as DeprecatedRetryFactory @@ -74,10 +81,11 @@ def broken_stop(_): ) @pytest.mark.asyncio async def test_retry_wait(retry_factory, status, waiter, mockserver): + def broken_wait(self, retry_state): raise OutlierException - class CustomRetryFactory(retry_factory): + class CustomRetryFactory(retry_factory): # type: ignore[valid-type, misc] pass setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) @@ -105,7 +113,7 @@ async def test_retry_wait_network_error(retry_factory): def broken_wait(self, retry_state): raise OutlierException - class CustomRetryFactory(retry_factory): + class CustomRetryFactory(retry_factory): # type: ignore[valid-type, misc] pass setattr(CustomRetryFactory, f"{waiter}_wait", broken_wait) @@ -401,6 +409,7 @@ def __call__(self, number, add=0): @pytest.mark.asyncio @patch("time.monotonic") async def test_retry_stop(monotonic_mock, retrying, outcomes, exhausted): + reset_totals() monotonic_mock.return_value = 0 last_outcome = outcomes[-1] outcomes = deque(outcomes) @@ -431,3 +440,85 @@ async def run(): assert outcome is last_outcome else: assert not exhausted + + +mock_good_response = object() + + +@pytest.mark.parametrize( + ("retrying", "outcome_sequences",
"exhausted"), + ( + # A TooManyUndocumentedErrors exception is raised when, of all responses, + # undocumented 5xx responses are at least 10 and at least 1%. + # + # 9, 100%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9, + False, + ), + # 10, 100%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 10, + True, + ), + # 10, <1%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9 # 9 / 18 (50%) + + ((mock_good_response,),) * (982) # + 0 / 982 = 9 / 1000 (0.9%) + + ((mock_request_error(status=500),),) * 1, # + 1 / 1 = 10 / 1001 (0.999…%) + False, + ), + # 10, ≥1%: + ( + zyte_api_retrying, + ((mock_request_error(status=500),),) * 9 # 9 / 18 (50%) + + ((mock_good_response,),) * (981) # + 0 / 981 = 9 / 999 (0.9%) + + ((mock_request_error(status=500),),) * 1, # + 1 / 1 = 10 / 1000 (1%) + True, + ), + ), +) +@pytest.mark.asyncio +@patch("time.monotonic") +async def test_retry_stop_global_parallel( + monotonic_mock, retrying, outcome_sequences, exhausted +): + reset_totals() + monotonic_mock.return_value = 0 + last_outcome = outcome_sequences[-1][-1] + outcome_sequences = tuple(deque(outcomes) for outcomes in outcome_sequences) + + def wait(retry_state): + return 0.0 + + retrying = copy(retrying) + retrying.wait = wait + + async def run(outcomes): + while True: + try: + outcome = outcomes.popleft() + except IndexError: + return + else: + if isinstance(outcome, fast_forward): + monotonic_mock.return_value += outcome.time + continue + if outcome is mock_good_response: + continue + raise outcome + + run = retrying.wraps(run) + + try: + for outcomes in outcome_sequences: + await run(outcomes) + except Exception as exc: + assert exhausted, exc + assert isinstance(exc, TooManyUndocumentedErrors) + assert exc.outcome is last_outcome + else: + assert not exhausted diff --git a/zyte_api/__init__.py b/zyte_api/__init__.py index 1f97fd2..0e9b55b 100644 --- a/zyte_api/__init__.py +++ b/zyte_api/__init__.py @@ -4,7 +4,7 @@ from ._async import
AsyncZyteAPI from ._errors import RequestError -from ._retry import AggressiveRetryFactory, RetryFactory +from ._retry import AggressiveRetryFactory, RetryFactory, TooManyUndocumentedErrors from ._retry import aggressive_retrying as _aggressive_retrying from ._retry import ( stop_after_uninterrupted_delay, diff --git a/zyte_api/__main__.py b/zyte_api/__main__.py index 776f9c1..94bab28 100644 --- a/zyte_api/__main__.py +++ b/zyte_api/__main__.py @@ -11,6 +11,7 @@ import tqdm from tenacity import retry_if_exception +from zyte_api import RequestError from zyte_api._async import AsyncZyteAPI from zyte_api._retry import RetryFactory, _is_throttling_error from zyte_api._utils import create_session @@ -71,13 +72,13 @@ def write_output(content): try: result = await fut except Exception as e: - if store_errors: - write_output(e.parsed.response_body.decode()) + if store_errors and isinstance(e, RequestError): + write_output(e.parsed.data) if stop_on_errors: raise - logger.error(str(e)) + logger.exception("Exception raised during response handling") else: write_output(result) finally: diff --git a/zyte_api/_async.py b/zyte_api/_async.py index afa20ff..f6f30e6 100644 --- a/zyte_api/_async.py +++ b/zyte_api/_async.py @@ -10,7 +10,7 @@ from tenacity import AsyncRetrying from ._errors import RequestError -from ._retry import zyte_api_retrying +from ._retry import TooManyUndocumentedErrors, zyte_api_retrying from ._utils import _AIO_API_TIMEOUT, create_session from .apikey import get_apikey from .constants import API_URL @@ -103,6 +103,7 @@ def __init__( self.retrying = retrying or zyte_api_retrying self.user_agent = user_agent or USER_AGENT self._semaphore = asyncio.Semaphore(n_conn) + self._disabling_exception: TooManyUndocumentedErrors | None = None async def get( self, @@ -114,6 +115,9 @@ async def get( retrying: Optional[AsyncRetrying] = None, ) -> _ResponseFuture: """Asynchronous equivalent to :meth:`ZyteAPI.get`.""" + if self._disabling_exception is not None: + raise 
self._disabling_exception + retrying = retrying or self.retrying post = _post_func(session) auth = aiohttp.BasicAuth(self.api_key) @@ -172,7 +176,9 @@ async def request(): # Try to make a request result = await request() self.agg_stats.n_success += 1 - except Exception: + except Exception as exc: + if isinstance(exc, TooManyUndocumentedErrors): + self._disabling_exception = exc self.agg_stats.n_fatal_errors += 1 raise diff --git a/zyte_api/_errors.py b/zyte_api/_errors.py index 6476c39..13e33be 100644 --- a/zyte_api/_errors.py +++ b/zyte_api/_errors.py @@ -31,11 +31,11 @@ def __init__(self, *args, **kwargs): @property def parsed(self): """Response as a :class:`ParsedError` object.""" - return ParsedError.from_body(self.response_content) + return ParsedError.from_body(self.response_content or b"") def __str__(self): return ( f"RequestError: {self.status}, message={self.message}, " - f"headers={self.headers}, body={self.response_content}, " + f"headers={self.headers}, body={self.response_content!r}, " f"request_id={self.request_id}" ) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 90bdcad..ad8156e 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -1,13 +1,17 @@ +from __future__ import annotations + import asyncio import logging from collections import Counter from datetime import timedelta from itertools import count -from typing import Union +from typing import TYPE_CHECKING, Any, Union from aiohttp import client_exceptions from tenacity import ( AsyncRetrying, + DoAttempt, + DoSleep, RetryCallState, after_log, before_log, @@ -23,6 +27,12 @@ from ._errors import RequestError +if TYPE_CHECKING: + from tenacity import RetryBaseT as SyncRetryBaseT + from tenacity.asyncio import RetryBaseT + from tenacity.stop import StopBaseT + from tenacity.wait import WaitBaseT + logger = logging.getLogger(__name__) _IDS = count() @@ -160,6 +170,66 @@ def _undocumented_error(exc: BaseException) -> bool: ) +class TooManyUndocumentedErrors(RuntimeError): + def 
__init__(self, outcome, errors, total): + msg = ( + f"Too many undocumented error responses received from Zyte API " + f"({errors} out of {total}, {errors / total:.2%}). This process " + f"will no longer be able to send Zyte API requests. Please, " + f"monitor https://status.zyte.com/ or contact support " + f"(https://support.zyte.com/support/tickets/new) before sending " + f"more requests like the ones causing these error responses.\n" + f"Last offending query: {outcome.query}\n" + f"Last offending response: {outcome}" + ) + self.outcome = outcome + super().__init__(msg) + + +class ZyteAsyncRetrying(AsyncRetrying): + _total_outcomes = 0 + _total_undocumented_errors = 0 + + def __init__( + self, + stop: "StopBaseT", + wait: "WaitBaseT", + retry: "SyncRetryBaseT | RetryBaseT", + reraise: bool, + **kwargs, + ): + kwargs.setdefault("before", before_log(logger, logging.DEBUG)) + kwargs.setdefault("after", after_log(logger, logging.DEBUG)) + kwargs.setdefault("before_sleep", before_sleep_log(logger, logging.DEBUG)) + super().__init__( + stop=stop, + wait=wait, + retry=retry, + reraise=reraise, + **kwargs, + ) + + async def iter(self, retry_state: RetryCallState) -> DoAttempt | DoSleep | Any: + do = await super().iter(retry_state) + retry_cls = retry_state.retry_object.__class__ + if retry_state.outcome is not None: + retry_cls._total_outcomes += 1 # type: ignore[attr-defined] + try: + retry_state.outcome.result() + except Exception as exc: + if _undocumented_error(exc): + retry_cls._total_undocumented_errors += 1 # type: ignore[attr-defined] + errors = retry_cls._total_undocumented_errors # type: ignore[attr-defined] + total = retry_cls._total_outcomes # type: ignore[attr-defined] + if errors >= 10 and errors / total >= 0.01: + raise TooManyUndocumentedErrors( + outcome=exc, + errors=errors, # type: ignore[attr-defined] + total=total, # type: ignore[attr-defined] + ) + return do + + class RetryFactory: """Factory class that builds the :class:`tenacity.AsyncRetrying` 
object that defines the :ref:`default retry policy `. @@ -250,14 +320,11 @@ def reraise(self) -> bool: return True def build(self) -> AsyncRetrying: - return AsyncRetrying( + return ZyteAsyncRetrying( wait=self.wait, retry=self.retry_condition, stop=self.stop, reraise=self.reraise(), - before=before_log(logger, logging.DEBUG), - after=after_log(logger, logging.DEBUG), - before_sleep=before_sleep_log(logger, logging.DEBUG), ) diff --git a/zyte_api/stats.py b/zyte_api/stats.py index 42c7b6a..6b23107 100644 --- a/zyte_api/stats.py +++ b/zyte_api/stats.py @@ -37,9 +37,9 @@ def __init__(self): self.n_429 = 0 # number of 429 (throttling) responses self.n_errors = 0 # number of errors, including errors which were retried - self.status_codes = Counter() - self.exception_types = Counter() - self.api_error_types = Counter() + self.status_codes: Counter = Counter() + self.exception_types: Counter = Counter() + self.api_error_types: Counter = Counter() def __str__(self): return "conn:{:0.2f}s, resp:{:0.2f}s, throttle:{:.1%}, err:{}+{}({:.1%}) | success:{}/{}({:.1%})".format( From 2d2c56f3d6e780c808f454ed41f11861b7b11a39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 3 Jan 2025 16:16:39 +0100 Subject: [PATCH 125/126] Remove unnecessary comments --- zyte_api/_retry.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index ad8156e..5ccee03 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -224,8 +224,8 @@ async def iter(self, retry_state: RetryCallState) -> DoAttempt | DoSleep | Any: if errors >= 10 and errors / total >= 0.01: raise TooManyUndocumentedErrors( outcome=exc, - errors=errors, # type: ignore[attr-defined] - total=total, # type: ignore[attr-defined] + errors=errors, + total=total, ) return do From 6394d94db481146637e8220f5d4595b926ae54b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 10 Jan 2025 15:01:02 +0100 Subject: [PATCH 126/126] unquote 
type hints --- zyte_api/_retry.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/zyte_api/_retry.py b/zyte_api/_retry.py index 5ccee03..d7cd5ce 100644 --- a/zyte_api/_retry.py +++ b/zyte_api/_retry.py @@ -192,9 +192,9 @@ class ZyteAsyncRetrying(AsyncRetrying): def __init__( self, - stop: "StopBaseT", - wait: "WaitBaseT", - retry: "SyncRetryBaseT | RetryBaseT", + stop: StopBaseT, + wait: WaitBaseT, + retry: SyncRetryBaseT | RetryBaseT, reraise: bool, **kwargs, ):