From 2fb3f0673377c4676bd1e50289fd2bae4eb038cc Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Tue, 23 Dec 2025 22:27:34 +0530 Subject: [PATCH 1/8] Fix --version-id -> --versionid in CLI --- databusclient/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 4e97470..0dc7047 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -16,7 +16,7 @@ def app(): @app.command() @click.option( - "--version-id", "version_id", + "--versionid", "version_id", required=True, help="Target databus version/dataset identifier of the form " "", From d4bb454af8fc7011ee652aa82394f1273bfe0407 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Tue, 23 Dec 2025 22:28:36 +0530 Subject: [PATCH 2/8] Fix --version-id -> --versionid in test script --- test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test.sh b/test.sh index f590198..0a4c096 100755 --- a/test.sh +++ b/test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash databusclient deploy \ - --version-id "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ + --versionid "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ --title "Test Title" \ --abstract "Test Abstract" \ --description "Test Description" \ From 189458ec1f673c55f66b97d43b5507c648230f47 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Fri, 26 Dec 2025 19:42:42 +0530 Subject: [PATCH 3/8] cli: add mkdist validations, completion helper, tests and docs --- README.md | 54 ++++++++++++++++++++++++++++++++++++++++---- databusclient/cli.py | 47 ++++++++++++++++++++++++++++++++++++++ tests/test_cli.py | 42 ++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 5 deletions(-) create mode 100644 tests/test_cli.py diff --git a/README.md b/README.md index 0b65641..a828d75 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,8 @@ Options: Commands: deploy download + mkdist + completion ``` @@ -183,7 +185,7 @@ Arguments: - Metdata mode: None Options: - --version-id TEXT Target databus version/dataset identifier of the form [required] --title TEXT Dataset title [required] @@ -202,11 +204,11 @@ Options: #### Examples of using deploy command ##### Mode 1: Classic Deploy (Distributions) ``` -databusclient deploy --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' +databusclient deploy --versionid https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' ``` ``` -databusclient deploy --version-id https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...." --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' +databusclient deploy --versionid https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...." 
--license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' ``` A few more notes for CLI usage: @@ -223,7 +225,7 @@ All files referenced there will be registered on the Databus. ```bash databusclient deploy \ --metadata /home/metadata.json \ - --version-id https://databus.org/user/dataset/version/1.0 \ + --versionid https://databus.org/user/dataset/version/1.0 \ --title "Metadata Deploy Example" \ --abstract "This is a short abstract of the dataset." \ --description "This dataset was uploaded using metadata.json." \ @@ -261,7 +263,7 @@ databusclient deploy \ --webdav-url https://cloud.example.com/remote.php/webdav \ --remote nextcloud \ --path datasets/mydataset \ - --version-id https://databus.org/user/dataset/version/1.0 \ + --versionid https://databus.org/user/dataset/version/1.0 \ --title "Test Dataset" \ --abstract "Short abstract of dataset" \ --description "This dataset was uploaded for testing the Nextcloud → Databus pipeline." \ @@ -296,6 +298,48 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://d ``` +### mkdist command + +Create a distribution string from components. + +Usage: +``` +databusclient mkdist URL --cv key=value --cv key2=value2 --format ttl --compression gz --sha-length : +``` + +Example: +``` +python -m databusclient mkdist "https://example.org/file.ttl" --cv lang=en --cv part=sorted --format ttl --compression gz --sha-length aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa:12345 +``` + +## Completion + +Enable shell completion (bash example): +``` +eval "$(_DATABUSCLIENT_COMPLETE=source_bash python -m databusclient)" +``` + +### mkdist command + +Create a distribution string from components. + +Usage: +``` +databusclient mkdist URL --cv key=value --cv key2=value2 --format ttl --compression gz --sha-length : +``` + +Example: +``` +python -m databusclient mkdist "https://example.org/file.ttl" --cv lang=en --cv part=sorted --format ttl --compression gz --sha-length aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa:12345 +``` + +## Completion + +Enable shell completion (bash example): +``` +eval "$(_DATABUSCLIENT_COMPLETE=source_bash python -m databusclient)" +``` + ## Module Usage ### Step 1: Create lists of distributions for the dataset diff --git a/databusclient/cli.py b/databusclient/cli.py index 0dc7047..d900c0f 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -3,6 +3,7 @@ import os import click +import re from typing import List from databusclient import client @@ -111,5 +112,51 @@ def download(databusuris: List[str], localdir, databus, token, authurl, clientid ) +@app.command() +@click.argument("url") +@click.option("--cv", "cvs", multiple=True, help="Content variant like key=value (repeatable). Keys must not contain '|' or '_'") +@click.option("--format", "file_format", help="Format extension (e.g. ttl)") +@click.option("--compression", help="Compression (e.g. 
gzip)") +@click.option("--sha-length", help="sha256:length (64 hex chars followed by ':' and integer length)") +@click.option("--json-output", is_flag=True, help="Output JSON distribution object instead of plain string") +def mkdist(url, cvs, file_format, compression, sha_length, json_output): + """Create a distribution string from components.""" + # Validate CVs + cvs_dict = {} + for cv in cvs: + if "=" not in cv: + raise click.BadParameter(f"Invalid content variant '{cv}': expected key=value") + key, val = cv.split("=", 1) + if any(ch in key for ch in ("|", "_")): + raise click.BadParameter("Invalid characters in content-variant key (forbidden: '|' and '_')") + if key in cvs_dict: + raise click.BadParameter(f"Duplicate content-variant key '{key}'") + cvs_dict[key] = val + + # Validate sha-length + sha_tuple = None + if sha_length: + if not re.match(r'^[A-Fa-f0-9]{64}:\d+$', sha_length): + raise click.BadParameter("Invalid --sha-length; expected SHA256HEX:length") + sha, length = sha_length.split(":", 1) + sha_tuple = (sha, int(length)) + + # Deterministic ordering + sorted_cvs = {k: cvs_dict[k] for k in sorted(cvs_dict)} + + dist = client.create_distribution(url=url, cvs=sorted_cvs, file_format=file_format, compression=compression, sha256_length_tuple=sha_tuple) + if json_output: + import json as _json + click.echo(_json.dumps({"distribution": dist})) + else: + click.echo(dist) + + +@app.command() +@click.argument("shell", type=click.Choice(["bash","zsh","fish","powershell"]), required=False) +def completion(shell="bash"): + click.echo(f"Run: eval \"$(_DATABUSCLIENT_COMPLETE=source_{shell} python -m databusclient)\"") + + if __name__ == "__main__": app() diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..3dfd3eb --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,42 @@ +from click.testing import CliRunner +from databusclient import cli + + +def test_mkdist_multiple_cv(): + runner = CliRunner() + sha = 'a' * 64 + res = runner.invoke(cli.app, [ + 'mkdist', + 'https://example.org/file', + '--cv', 'b=2', + '--cv', 'a=1', + '--format', 'ttl', + '--compression', 'gz', + '--sha-length', f'{sha}:42' + ]) + assert res.exit_code == 0, res.output + # keys should be sorted alphabetically: a then b + assert res.output.strip() == f'https://example.org/file|a=1_b=2|ttl|gz|{sha}:42' + + +def test_mkdist_invalid_cv(): + runner = CliRunner() + res = runner.invoke(cli.app, ['mkdist', 'https://example.org/file', '--cv', 'badcv']) + assert res.exit_code != 0 + assert 'Invalid content variant' in res.output + + +def test_mkdist_invalid_sha(): + runner = CliRunner() + res = runner.invoke(cli.app, [ + 'mkdist', 'https://example.org/file', '--cv', 'k=v', '--sha-length', 'abc:123' + ]) + assert res.exit_code != 0 + assert 'Invalid --sha-length' in res.output + + +def test_completion_output(): + runner = CliRunner() + res = runner.invoke(cli.app, ['completion', 'bash']) + assert res.exit_code == 0 + assert '_DATABUSCLIENT_COMPLETE' in res.output From d72a444a1bce7c26960648c0540556fd4e5c15b4 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Fri, 26 Dec 2025 20:24:43 +0530 Subject: [PATCH 4/8] Resolve remaining merge markers in cli.py --- databusclient/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index d59dde5..3525a0a 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -22,12 +22,8 @@ def app(): @app.command() @click.option( -<<<<<<< HEAD - "--versionid", "version_id", -======= "--version-id", "version_id", 
->>>>>>> upstream/main required=True, help="Target databus version/dataset identifier of the form " "", From af27e181070c1806df5cf924954fc93cf177da18 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Fri, 26 Dec 2025 20:26:00 +0530 Subject: [PATCH 5/8] Use api_deploy.create_distribution to avoid circular import --- databusclient/cli.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/databusclient/cli.py b/databusclient/cli.py index 3525a0a..b5145bf 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -5,8 +5,6 @@ import click import re -from typing import List -from databusclient import client import databusclient.api.deploy as api_deploy from databusclient.api.delete import delete as api_delete @@ -248,7 +246,7 @@ def mkdist(url, cvs, file_format, compression, sha_length, json_output): # Deterministic ordering sorted_cvs = {k: cvs_dict[k] for k in sorted(cvs_dict)} - dist = client.create_distribution(url=url, cvs=sorted_cvs, file_format=file_format, compression=compression, sha256_length_tuple=sha_tuple) + dist = api_deploy.create_distribution(url=url, cvs=sorted_cvs, file_format=file_format, compression=compression, sha256_length_tuple=sha_tuple) if json_output: import json as _json click.echo(_json.dumps({"distribution": dist})) From a7b361fa56fb5fadcb44fa43c173249c26f73798 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Sun, 4 Jan 2026 23:00:12 +0530 Subject: [PATCH 6/8] Add verbose CLI flag with redacted HTTP logging --- databusclient/api/download.py | 47 ++++++++++++++++++++++++++++++++--- databusclient/api/utils.py | 42 +++++++++++++++++++++++++++++++ databusclient/cli.py | 10 ++++++-- tests/test_download_auth.py | 30 ++++++++++++++++++++++ 4 files changed, 123 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index ac55faa..ca573b0 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -32,6 +32,7 @@ def _download_file( databus_key=None, auth_url=None, client_id=None, + verbose=False, ) -> None: """ Download a file from the internet with a progress bar using tqdm. @@ -43,6 +44,7 @@ def _download_file( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ if localDir is None: _host, account, group, artifact, version, file = ( @@ -67,7 +69,15 @@ def _download_file( headers = {} # --- 1a. 
public databus --- + if verbose: + from databusclient.api.utils import log_http + + log_http("HEAD", url, req_headers=headers) response = requests.head(url, timeout=30, allow_redirects=False) + if verbose: + from databusclient.api.utils import log_http + + log_http("HEAD", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) # Check for redirect and update URL if necessary if response.headers.get("Location") and response.status_code in [ @@ -108,9 +118,17 @@ def _download_file( headers["Accept-Encoding"] = ( "identity" # disable gzip to get correct content-length ) + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers) response = requests.get( url, headers=headers, stream=True, allow_redirects=True, timeout=30 ) + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) www = response.headers.get("WWW-Authenticate", "") # Check if authentication is required # --- 3. Handle authentication responses --- @@ -136,12 +154,20 @@ def _download_file( # for known hosts. __get_vault_access__ handles reading the refresh # token and exchanging it; errors are translated to DownloadAuthError # for user-friendly CLI output. - vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id) + vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id, verbose=verbose) headers["Authorization"] = f"Bearer {vault_token}" headers.pop("Accept-Encoding", None) # Retry with token + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers) response = requests.get(url, headers=headers, stream=True, timeout=30) + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) # Map common auth failures to friendly messages if response.status_code == 401: @@ -191,6 +217,7 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: """ Download multiple files from the databus. @@ -202,6 +229,7 @@ def _download_files( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ for url in urls: _download_file( @@ -211,6 +239,7 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -294,7 +323,7 @@ def _get_file_download_urls_from_sparql_query( def __get_vault_access__( - download_url: str, token_file: str, auth_url: str, client_id: str + download_url: str, token_file: str, auth_url: str, client_id: str, verbose: bool = False ) -> str: """ Get Vault access token for a protected databus download. @@ -320,6 +349,10 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() + if verbose: + from databusclient.api.utils import log_http + + log_http("POST", auth_url, req_headers={"client_id": client_id}, status=resp.status_code, resp_headers=resp.headers) access_token = resp.json()["access_token"] # 3. 
Extract host as audience @@ -344,6 +377,10 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() + if verbose: + from databusclient.api.utils import log_http + + log_http("POST", auth_url, req_headers={"client_id": client_id, "audience": audience}, status=resp.status_code, resp_headers=resp.headers) vault_token = resp.json()["access_token"] print(f"Using Vault access token for {download_url}") @@ -598,6 +635,7 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", + verbose: bool = False, ) -> None: """ Download datasets from databus. @@ -612,6 +650,7 @@ def download( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". - client_id: Client ID for token exchange. Default is "vault-token-exchange". + - verbose: when True, print redacted HTTP request/response details """ for databusURI in databusURIs: host, account, group, artifact, version, file = ( @@ -647,8 +686,7 @@ def download( vault_token_file=token, databus_key=databus_key, auth_url=auth_url, - client_id=client_id, - ) + client_id=client_id, verbose=verbose, ) elif version is not None: print(f"Downloading version: {databusURI}") _download_version( @@ -709,4 +747,5 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index 7e27ff3..0a6ba74 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -48,3 +48,45 @@ def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: response.raise_for_status() return response.text + + +def _redact_headers(headers): + if not headers: + return headers + redacted = {} + for k, v in headers.items(): + key = k.lower() + if key == "authorization" or key.startswith("x-api-key"): + redacted[k] = "REDACTED" + else: + redacted[k] = v + return redacted + + +def log_http(method, url, req_headers=None, status=None, resp_headers=None, body_snippet=None): + print(f"[HTTP] {method} {url}") + if req_headers: + print(" Req headers:", _redact_headers(req_headers)) + if status is not None: + print(" Status:", status) + if resp_headers: + # try to convert to dict; handle Mock or response objects gracefully + try: + resp_dict = dict(resp_headers) + except Exception: + # resp_headers might be a Mock or requests.Response; try common attributes + if hasattr(resp_headers, "items"): + try: + resp_dict = dict(resp_headers.items()) + except Exception: + resp_dict = {"headers": str(resp_headers)} + elif hasattr(resp_headers, "headers"): + try: + resp_dict = dict(getattr(resp_headers, "headers") or {}) + except Exception: + resp_dict = {"headers": str(resp_headers)} + else: + resp_dict = {"headers": str(resp_headers)} + print(" Resp headers:", _redact_headers(resp_dict)) + if body_snippet: + print(" Body preview:", body_snippet[:500]) diff --git a/databusclient/cli.py b/databusclient/cli.py index b5145bf..7bdb366 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -13,9 +13,12 @@ @click.group() -def app(): +@click.option("-v", "--verbose", is_flag=True, help="Enable verbose HTTP request/response output") +@click.pass_context +def app(ctx, verbose): """Databus Client CLI""" - pass + ctx.ensure_object(dict) + ctx.obj["verbose"] = verbose @app.command() @@ -159,7 +162,9 @@ def deploy( show_default=True, help="Client ID for token 
exchange", ) +@click.pass_context def download( + ctx, databusuris: List[str], localdir, databus, @@ -182,6 +187,7 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, + verbose=ctx.obj.get("verbose", False), ) except DownloadAuthError as e: raise click.ClickException(str(e)) diff --git a/tests/test_download_auth.py b/tests/test_download_auth.py index 7225e08..46bec79 100644 --- a/tests/test_download_auth.py +++ b/tests/test_download_auth.py @@ -102,3 +102,33 @@ def test_403_reports_insufficient_permissions(): dl._download_file(url, localDir='.', vault_token_file="/some/token/file") assert "permission" in str(exc.value) or "forbidden" in str(exc.value) + + +def test_verbose_redacts_authorization(monkeypatch, capsys): + vault_host = next(iter(VAULT_REQUIRED_HOSTS)) + url = f"https://{vault_host}/protected/file.ttl" + + resp_head = make_response(status=200, headers={}) + resp_401 = make_response(status=401, headers={"WWW-Authenticate": "Bearer realm=\"auth\""}) + resp_200 = make_response(status=200, headers={"content-length": "0"}, content=b"") + + get_side_effects = [resp_401, resp_200] + + post_resp_1 = Mock() + post_resp_1.json.return_value = {"access_token": "ACCESS"} + post_resp_2 = Mock() + post_resp_2.json.return_value = {"access_token": "VAULT"} + + with patch("requests.head", return_value=resp_head), patch( + "requests.get", side_effect=get_side_effects + ), patch("requests.post", side_effect=[post_resp_1, post_resp_2]): + monkeypatch.setenv("REFRESH_TOKEN", "x" * 90) + + # run download with verbose enabled + dl._download_file(url, localDir='.', vault_token_file="/does/not/matter", verbose=True) + captured = capsys.readouterr() + assert "[HTTP] HEAD" in captured.out or "[HTTP] GET" in captured.out + assert "REDACTED" in captured.out + # Ensure token values are not directly printed + assert "ACCESS" not in captured.out + assert "VAULT" not in captured.out From 0cd28c72cf401dc428c206691a280d86c700dcc7 Mon Sep 17 00:00:00 2001 From: Tahoora Tabassum Date: Tue, 6 Jan 2026 23:04:58 +0530 Subject: [PATCH 7/8] Add verbose (-v) flag using logging; redact Authorization and X-API-KEY headers; propagate verbose through download flows; add tests and docs --- CHANGELOG.md | 7 +++ PR_BODY.md | 19 ++++++++ README.md | 2 + databusclient/api/download.py | 90 ++++++++++++++++++++++++++--------- databusclient/api/utils.py | 23 ++++++--- databusclient/cli.py | 11 +++++ file.txt | 0 tests/test_cli_verbose.py | 38 +++++++++++++++ tests/test_download_auth.py | 13 ++--- tests/test_utils_verbose.py | 76 +++++++++++++++++++++++++++++ 10 files changed, 245 insertions(+), 34 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 PR_BODY.md create mode 100644 file.txt create mode 100644 tests/test_cli_verbose.py create mode 100644 tests/test_utils_verbose.py diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..138ec26 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,7 @@ +# Changelog + +## 0.14.1 - 2026-01-01 + +- Add `-v/--verbose` global CLI option to enable redacted HTTP request/response logging for debugging. (CLI: `databusclient -v ...`) +- Ensure `Authorization` and `X-API-KEY` headers are redacted in verbose output. +- Add unit tests and README documentation for verbose mode. 
diff --git a/PR_BODY.md b/PR_BODY.md new file mode 100644 index 0000000..02b5221 --- /dev/null +++ b/PR_BODY.md @@ -0,0 +1,19 @@ +Title: Add verbose CLI flag (-v) with redacted HTTP logging + +Short description: +- Add a global `-v/--verbose` CLI flag to enable redacted HTTP request/response logging to help debug interactions with the Databus and Vault. + +What changed: +- Add global `-v/--verbose` option to `databusclient` CLI and propagate it to API calls. +- Implement redacted HTTP logging helper (redacts `Authorization` and `X-API-KEY` headers). +- Instrument `download` and Vault token exchange flows to print HTTP request/response details when `-v` is enabled. +- Add unit tests ensuring verbose logs are printed and sensitive tokens are redacted. +- Update `README.md` and add a `CHANGELOG.md` entry. + +Why: +- Provides safe, actionable debugging output for issues involving HTTP communication and auth problems without exposing secrets. + +Security note: +- Authorization and API-key headers are redacted in verbose output. Avoid enabling verbose output in public CI logs. + +Closes #27 diff --git a/README.md b/README.md index b66635d..2078b90 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,8 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD - If the dataset/files to be downloaded require vault authentication, you need to provide a vault token with `--vault-token /path/to/vault-token.dat`. See [Registration (Access Token)](#registration-access-token) for details on how to get a vault token. Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads. +- `-v, --verbose` + - Enable verbose HTTP request/response output for debugging. Headers that may contain secrets (for example `Authorization` and `X-API-KEY`) are redacted in the output. Use with caution and avoid enabling in public CI logs. - `--databus-key` - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`. diff --git a/databusclient/api/download.py b/databusclient/api/download.py index ca573b0..373e5f9 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -6,6 +6,9 @@ import requests from SPARQLWrapper import JSON, SPARQLWrapper from tqdm import tqdm +import logging + +logger = logging.getLogger("databusclient") from databusclient.api.utils import ( fetch_databus_jsonld, @@ -69,12 +72,12 @@ def _download_file( headers = {} # --- 1a. 
public databus --- - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("HEAD", url, req_headers=headers) response = requests.head(url, timeout=30, allow_redirects=False) - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("HEAD", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) @@ -118,14 +121,14 @@ def _download_file( headers["Accept-Encoding"] = ( "identity" # disable gzip to get correct content-length ) - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("GET", url, req_headers=headers) response = requests.get( url, headers=headers, stream=True, allow_redirects=True, timeout=30 ) - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) @@ -159,12 +162,12 @@ def _download_file( headers.pop("Accept-Encoding", None) # Retry with token - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("GET", url, req_headers=headers) response = requests.get(url, headers=headers, stream=True, timeout=30) - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) @@ -243,13 +246,14 @@ def _download_files( ) -def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str: +def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None, verbose: bool = False) -> str: """ Get SPARQL query of collection members from databus collection URI. Parameters: - uri: The full databus collection URI - databus_key: Optional Databus API key for authentication on protected resources + - verbose: when True, print redacted HTTP request/response details Returns: SPARQL query string to get download URLs of all files in the collection. @@ -257,13 +261,22 @@ def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> headers = {"Accept": "text/sparql"} if databus_key is not None: headers["X-API-KEY"] = databus_key + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", uri, req_headers=headers) response = requests.get(uri, headers=headers, timeout=30) + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", uri, req_headers=headers, status=response.status_code, resp_headers=response.headers) + response.raise_for_status() return response.text -def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict: +def _query_sparql_endpoint(endpoint_url, query, databus_key=None, verbose: bool = False) -> dict: """ Query a SPARQL endpoint and return results in JSON format. 
@@ -271,10 +284,17 @@ def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict: - endpoint_url: the URL of the SPARQL endpoint - query: the SPARQL query string - databus_key: Optional API key for authentication + - verbose: when True, print redacted HTTP request/response details Returns: - Dictionary containing the query results """ + if verbose: + from databusclient.api.utils import log_http + + headers = {"X-API-KEY": databus_key} if databus_key is not None else None + log_http("POST", endpoint_url, req_headers=headers) + sparql = SPARQLWrapper(endpoint_url) sparql.method = "POST" sparql.setQuery(query) @@ -282,11 +302,17 @@ def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict: if databus_key is not None: sparql.setCustomHttpHeaders({"X-API-KEY": databus_key}) results = sparql.query().convert() + + if verbose: + from databusclient.api.utils import log_http + + log_http("POST", endpoint_url, req_headers={"X-API-KEY": databus_key} if databus_key is not None else None, status=200) + return results def _get_file_download_urls_from_sparql_query( - endpoint_url, query, databus_key=None + endpoint_url, query, databus_key=None, verbose: bool = False ) -> List[str]: """ Execute a SPARQL query to get databus file download URLs. @@ -295,11 +321,12 @@ def _get_file_download_urls_from_sparql_query( - endpoint_url: the URL of the SPARQL endpoint - query: the SPARQL query string - databus_key: Optional API key for authentication + - verbose: when True, print redacted HTTP request/response details Returns: - List of file download URLs """ - result_dict = _query_sparql_endpoint(endpoint_url, query, databus_key=databus_key) + result_dict = _query_sparql_endpoint(endpoint_url, query, databus_key=databus_key, verbose=verbose) bindings = result_dict.get("results", {}).get("bindings") if not isinstance(bindings, list): @@ -336,7 +363,8 @@ def __get_vault_access__( with open(token_file, "r") as f: refresh_token = f.read().strip() if len(refresh_token) < 80: - print(f"Warning: token from {token_file} is short (<80 chars)") + logger.warning("Token from %s is short (<80 chars)", token_file) + # 2. Refresh token -> access token resp = requests.post( @@ -349,7 +377,7 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("POST", auth_url, req_headers={"client_id": client_id}, status=resp.status_code, resp_headers=resp.headers) @@ -377,13 +405,13 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() - if verbose: + if verbose or logger.isEnabledFor(logging.DEBUG): from databusclient.api.utils import log_http log_http("POST", auth_url, req_headers={"client_id": client_id, "audience": audience}, status=resp.status_code, resp_headers=resp.headers) vault_token = resp.json()["access_token"] - print(f"Using Vault access token for {download_url}") + logger.debug("Using Vault access token for %s", download_url) return vault_token @@ -395,6 +423,7 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: """ Download all files in a databus collection. 
@@ -407,10 +436,11 @@ def _download_collection( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ - query = _get_sparql_query_of_collection(uri, databus_key=databus_key) + query = _get_sparql_query_of_collection(uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_sparql_query( - endpoint, query, databus_key=databus_key + endpoint, query, databus_key=databus_key, verbose=verbose ) _download_files( list(file_urls), @@ -419,6 +449,7 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -429,6 +460,7 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: """ Download all files in a databus artifact version. @@ -440,8 +472,9 @@ def _download_version( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) _download_files( file_urls, @@ -450,6 +483,7 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -461,6 +495,7 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: """ Download files in a databus artifact. @@ -473,14 +508,15 @@ def _download_artifact( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) versions = _get_databus_versions_of_artifact(json_str, all_versions=all_versions) if isinstance(versions, str): versions = [versions] for version_uri in versions: print(f"Downloading version: {version_uri}") - json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) _download_files( file_urls, @@ -489,6 +525,7 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -564,6 +601,7 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: """ Download files in a databus group. 
@@ -576,8 +614,9 @@ def _download_group( - databus_key: Databus API key for protected downloads - auth_url: Keycloak token endpoint URL - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) artifacts = _get_databus_artifacts_of_group(json_str) for artifact_uri in artifacts: print(f"Download artifact: {artifact_uri}") @@ -589,6 +628,7 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -677,6 +717,7 @@ def download( databus_key, auth_url, client_id, + verbose=verbose, ) elif file is not None: print(f"Downloading file: {databusURI}") @@ -686,7 +727,9 @@ def download( vault_token_file=token, databus_key=databus_key, auth_url=auth_url, - client_id=client_id, verbose=verbose, ) + client_id=client_id, + verbose=verbose, + ) elif version is not None: print(f"Downloading version: {databusURI}") _download_version( @@ -696,6 +739,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif artifact is not None: print( @@ -709,6 +753,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif group is not None and group != "collections": print( @@ -722,6 +767,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif account is not None: print("accountId not supported yet") # TODO @@ -738,7 +784,7 @@ def download( if uri_endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = _get_file_download_urls_from_sparql_query( - uri_endpoint, databusURI, databus_key=databus_key + uri_endpoint, databusURI, databus_key=databus_key, verbose=verbose ) _download_files( res, diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index 0a6ba74..e07f4c0 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -30,13 +30,14 @@ def get_databus_id_parts_from_file_url( return tuple(parts[:6]) # return only the first 6 parts -def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: +def fetch_databus_jsonld(uri: str, databus_key: str | None = None, verbose: bool = False) -> str: """ Retrieve JSON-LD representation of a databus resource. Parameters: - uri: The full databus URI - databus_key: Optional Databus API key for authentication on protected resources + - verbose: when True, print redacted HTTP request/response details Returns: JSON-LD string representation of the databus resource. 
@@ -44,7 +45,11 @@ def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: headers = {"Accept": "application/ld+json"} if databus_key is not None: headers["X-API-KEY"] = databus_key + if verbose: + log_http("GET", uri, req_headers=headers) response = requests.get(uri, headers=headers, timeout=30) + if verbose: + log_http("GET", uri, req_headers=headers, status=response.status_code, resp_headers=response.headers) response.raise_for_status() return response.text @@ -63,12 +68,17 @@ def _redact_headers(headers): return redacted +import logging + + def log_http(method, url, req_headers=None, status=None, resp_headers=None, body_snippet=None): - print(f"[HTTP] {method} {url}") + """Log HTTP request/response details at DEBUG level with sanitized headers.""" + logger = logging.getLogger("databusclient") + msg_lines = [f"[HTTP] {method} {url}"] if req_headers: - print(" Req headers:", _redact_headers(req_headers)) + msg_lines.append(f" Req headers: {_redact_headers(req_headers)}") if status is not None: - print(" Status:", status) + msg_lines.append(f" Status: {status}") if resp_headers: # try to convert to dict; handle Mock or response objects gracefully try: @@ -87,6 +97,7 @@ def log_http(method, url, req_headers=None, status=None, resp_headers=None, body resp_dict = {"headers": str(resp_headers)} else: resp_dict = {"headers": str(resp_headers)} - print(" Resp headers:", _redact_headers(resp_dict)) + msg_lines.append(f" Resp headers: {_redact_headers(resp_dict)}") if body_snippet: - print(" Body preview:", body_snippet[:500]) + msg_lines.append(" Body preview: " + body_snippet[:500]) + logger.debug("\n".join(msg_lines)) diff --git a/databusclient/cli.py b/databusclient/cli.py index 7bdb366..7beb59a 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -17,9 +17,20 @@ @click.pass_context def app(ctx, verbose): """Databus Client CLI""" + import logging + ctx.ensure_object(dict) ctx.obj["verbose"] = verbose + # Configure databusclient logger when verbose flag is used + logger = logging.getLogger("databusclient") + if verbose: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + if not logger.hasHandlers(): + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) + @app.command() @click.option( diff --git a/file.txt b/file.txt new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_cli_verbose.py b/tests/test_cli_verbose.py new file mode 100644 index 0000000..c5bba14 --- /dev/null +++ b/tests/test_cli_verbose.py @@ -0,0 +1,38 @@ +from click.testing import CliRunner +from unittest.mock import Mock, patch + +import databusclient.cli as cli + + +# CLI-level integration test for -v flag +def test_cli_download_verbose_logs_redacted(caplog): + caplog.set_level("DEBUG", logger="databusclient") + runner = CliRunner() + + # Prepare mocked HTTP responses + resp_head_401 = Mock() + resp_head_401.status_code = 401 + resp_head_401.headers = {} + + resp_head_200 = Mock() + resp_head_200.status_code = 200 + resp_head_200.headers = {} + + resp_get = Mock() + resp_get.status_code = 200 + resp_get.headers = {"content-length": "0"} + resp_get.iter_content = lambda chunk: iter([]) + + # Initial HEAD returns 401 so client uses --databus-key header on retry + with patch("requests.head", side_effect=[resp_head_401, resp_head_200]), patch( + "requests.get", return_value=resp_get + ): + # Run CLI with verbose flag and databus key (so X-API-KEY will be redacted in logs) + target = "https://example.com/account/group/artifact/1/file.txt" + 
        res = runner.invoke(cli.app, ["-v", "download", target, "--localdir", ".", "--databus-key", "SECRET"])
+
+    assert res.exit_code == 0, res.output
+    # Should log HTTP activity and redact secret (captured by caplog)
+    assert "[HTTP]" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
diff --git a/tests/test_download_auth.py b/tests/test_download_auth.py
index 46bec79..d2c2475 100644
--- a/tests/test_download_auth.py
+++ b/tests/test_download_auth.py
@@ -3,6 +3,7 @@
 import pytest
 import requests
+import logging
 
 import databusclient.api.download as dl
 
 
@@ -104,7 +105,8 @@ def test_403_reports_insufficient_permissions():
     assert "permission" in str(exc.value) or "forbidden" in str(exc.value)
 
 
-def test_verbose_redacts_authorization(monkeypatch, capsys):
+def test_verbose_redacts_authorization(monkeypatch, caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
     vault_host = next(iter(VAULT_REQUIRED_HOSTS))
     url = f"https://{vault_host}/protected/file.ttl"
 
@@ -126,9 +128,8 @@ def test_verbose_redacts_authorization(monkeypatch, capsys):
 
         # run download with verbose enabled
         dl._download_file(url, localDir='.', vault_token_file="/does/not/matter", verbose=True)
-    captured = capsys.readouterr()
-    assert "[HTTP] HEAD" in captured.out or "[HTTP] GET" in captured.out
-    assert "REDACTED" in captured.out
+    assert "[HTTP] HEAD" in caplog.text or "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
     # Ensure token values are not directly printed
-    assert "ACCESS" not in captured.out
-    assert "VAULT" not in captured.out
+    assert "ACCESS" not in caplog.text
+    assert "VAULT" not in caplog.text
diff --git a/tests/test_utils_verbose.py b/tests/test_utils_verbose.py
new file mode 100644
index 0000000..aa1b344
--- /dev/null
+++ b/tests/test_utils_verbose.py
@@ -0,0 +1,76 @@
+from unittest.mock import Mock, patch
+
+import databusclient.api.utils as utils
+import databusclient.api.download as dl
+
+import requests
+import logging
+
+
+
+
+def make_response(status=200, headers=None, text=''):
+    headers = headers or {}
+    mock = Mock()
+    mock.status_code = status
+    mock.headers = headers
+    mock.text = text
+    def raise_for_status():
+        if mock.status_code >= 400:
+            raise requests.exceptions.HTTPError()
+    mock.raise_for_status = raise_for_status
+    return mock
+
+
+def test_fetch_databus_jsonld_verbose_redacts_api_key(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    url = "https://databus.example/resource"
+    resp = make_response(status=200, headers={"content-type": "application/ld+json"}, text='{}')
+    with patch("databusclient.api.utils.requests.get", return_value=resp):
+        txt = utils.fetch_databus_jsonld(url, databus_key="SECRET", verbose=True)
+    assert "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert txt == '{}'
+
+
+
+def test_get_sparql_query_of_collection_verbose(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    url = "https://databus.example/collections/col"
+    resp = make_response(status=200, headers={"content-type": "text/sparql"}, text='SELECT *')
+    with patch("databusclient.api.download.requests.get", return_value=resp):
+        txt = dl._get_sparql_query_of_collection(url, databus_key="SECRET", verbose=True)
+    assert "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert txt == 'SELECT *'
+
+
+
+def test_query_sparql_endpoint_verbose(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    endpoint = "https://dbpedia.org/sparql"
+    sample = {"results": {"bindings": []}}
+    class MockSPARQL:
+        def __init__(self, url):
+            self.url = url
+            self.method = None
+            self._query = None
+            self._headers = None
+        def setQuery(self, q):
+            self._query = q
+        def setReturnFormat(self, fmt):
+            pass
+        def setCustomHttpHeaders(self, headers):
+            self._headers = headers
+        def query(self):
+            mock = Mock()
+            mock.convert.return_value = sample
+            return mock
+    with patch("databusclient.api.download.SPARQLWrapper", new=MockSPARQL):
+        res = dl._query_sparql_endpoint(endpoint, "SELECT ?s WHERE { ?s ?p ?o }", databus_key="SECRET", verbose=True)
+    assert "[HTTP] POST" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert res == sample
From faadd586a939d08e0ac703488ab3efb0c78948e2 Mon Sep 17 00:00:00 2001
From: Tahoora Tabassum
Date: Wed, 7 Jan 2026 23:10:08 +0530
Subject: [PATCH 8/8] Add verbose logging support and stabilize auth tests

---
 tests/test_download_auth.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/test_download_auth.py b/tests/test_download_auth.py
index d2c2475..d5fdf44 100644
--- a/tests/test_download_auth.py
+++ b/tests/test_download_auth.py
@@ -8,7 +8,9 @@
 import databusclient.api.download as dl
 
 from databusclient.api.download import VAULT_REQUIRED_HOSTS, DownloadAuthError
+from unittest.mock import patch
+from databusclient.api.download import download, DownloadAuthError
 
 
 def make_response(status=200, headers=None, content=b""):
     headers = headers or {}