From af9d34d148e6be5e82355614dad6a0c539d21f5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 May 2022 14:43:28 +0200 Subject: [PATCH 1/2] Propose a minimal specialization for extract --- zyte_api/aio/client.py | 79 ++++++++++++++++++++++++++++++++++++++++-- zyte_api/utils.py | 8 ++++- 2 files changed, 84 insertions(+), 3 deletions(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index e82c832..830ff56 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -4,8 +4,10 @@ import asyncio import time +from base64 import b64decode +from collections.abc import Mapping from functools import partial -from typing import Optional, Iterator, List +from typing import Awaitable, Iterator, List, Optional import aiohttp from aiohttp import TCPConnector @@ -16,7 +18,7 @@ from ..apikey import get_apikey from ..constants import API_URL, API_TIMEOUT from ..stats import AggStats, ResponseStats -from ..utils import user_agent +from ..utils import _to_lower_camel_case, user_agent # 120 seconds is probably too long, but we are concerned about the case with @@ -43,6 +45,38 @@ def _post_func(session): return session.post +class ExtractResult(Mapping): + """Result of a call to AsyncClient.extract. + + It can be used as a dictionary to access the raw API response. + + It also provides some helper properties for easier access to some of its + underlying data. 
+ """ + + def __init__(self, api_response: dict): + self._api_response = api_response + + def __getitem__(self, key): + return self._api_response[key] + + def __iter__(self): + yield from self._api_response + + def __len__(self): + return len(self._api_response) + + @property + def http_response_body(self): -> bytes: + if hasattr(self, "_http_response_body"): + return self._http_response_body + base64_body = self._api_response.get("httpResponseBody", None) + if base64_body is None: + raise ValueError("API response has no httpResponseBody key.") + self._http_response_body = b64decode(base64_body) + return self._http_response_body + + class AsyncClient: def __init__(self, *, api_key=None, @@ -148,3 +182,44 @@ async def _request(query): session=session) return asyncio.as_completed([_request(query) for query in queries]) + + @staticmethod + def _build_extract_query(raw_query): + return { + _to_lower_camel_case(k): v + for k, v in raw_query.items() + } + + async def extract( + self, + url: str, + *, + session: Optional[aiohttp.ClientSession] = None, + handle_retries: bool = True, + retrying: Optional[AsyncRetrying] = None, + **kwargs, + ) -> Awaitable[ExtractResult]: + """…""" + query = self._build_extract_query({**kwargs, 'url'=url}) + response = await self.request_raw( + query=query, + endpoint='extract', + session=session, + handle_retries=handle_retries, + retrying=retrying, + ) + return ExtractResult(response) + + def extract_in_parallel( + self, + queries: List[dict], + *, + session: Optional[aiohttp.ClientSession] = None, + ) -> Iterator[asyncio.Future]: + """…""" + queries = [self._build_extract_query(query) for query in queries] + return self.request_parallel_as_completed( + queries, + endpoint='extract', + session=session, + ) diff --git a/zyte_api/utils.py b/zyte_api/utils.py index 707d9b4..1467dff 100644 --- a/zyte_api/utils.py +++ b/zyte_api/utils.py @@ -1,7 +1,13 @@ -# -*- coding: utf-8 -*- from .__version__ import __version__ +def 
_to_lower_camel_case(snake_case_string): + """Convert from snake case (foo_bar) to lower-case-initial camel case + (fooBar).""" + prefix, *rest = snake_case_string.split('_') + return prefix + ''.join(part.title() for part in rest) + + def user_agent(library): return 'python-zyte-api/{} {}/{}'.format( __version__, From 9270f500b6b08483a5e97ba8765d289d55839534 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Tue, 17 May 2022 15:12:43 +0200 Subject: [PATCH 2/2] Fix syntax and typing issues --- zyte_api/aio/client.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/zyte_api/aio/client.py b/zyte_api/aio/client.py index 830ff56..1463c59 100644 --- a/zyte_api/aio/client.py +++ b/zyte_api/aio/client.py @@ -7,7 +7,7 @@ from base64 import b64decode from collections.abc import Mapping from functools import partial -from typing import Awaitable, Iterator, List, Optional +from typing import Awaitable, Iterator, List, Optional, Union import aiohttp from aiohttp import TCPConnector @@ -21,6 +21,12 @@ from ..utils import _to_lower_camel_case, user_agent +class _NotLoaded: + pass + + +_NOT_LOADED = _NotLoaded() + # 120 seconds is probably too long, but we are concerned about the case with # many concurrent requests and some processing logic running in the same reactor, # thus, saturating the CPU. This will make timeouts more likely. 
@@ -56,6 +62,7 @@ class ExtractResult(Mapping): def __init__(self, api_response: dict): self._api_response = api_response + self._http_response_body: Union[bytes, _NotLoaded] = _NOT_LOADED def __getitem__(self, key): return self._api_response[key] @@ -67,13 +74,12 @@ def __len__(self): return len(self._api_response) @property - def http_response_body(self): -> bytes: - if hasattr(self, "_http_response_body"): - return self._http_response_body - base64_body = self._api_response.get("httpResponseBody", None) - if base64_body is None: - raise ValueError("API response has no httpResponseBody key.") - self._http_response_body = b64decode(base64_body) + def http_response_body(self) -> Union[bytes, _NotLoaded]: + if self._http_response_body is _NOT_LOADED: + base64_body = self._api_response.get("httpResponseBody", None) + if base64_body is None: + raise ValueError("API response has no httpResponseBody key.") + self._http_response_body = b64decode(base64_body) return self._http_response_body @@ -198,9 +204,9 @@ async def extract( handle_retries: bool = True, retrying: Optional[AsyncRetrying] = None, **kwargs, - ) -> Awaitable[ExtractResult]: + ) -> ExtractResult: """…""" - query = self._build_extract_query({**kwargs, 'url'=url}) + query = self._build_extract_query({**kwargs, 'url': url}) response = await self.request_raw( query=query, endpoint='extract',