From 350617cb0756a49e7858fe722c73a5a5e6028675 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 16:44:44 +0530 Subject: [PATCH 1/9] feature added: --validate checksum flag --- databusclient/api/download.py | 144 ++++++++++++++++++++++++++++++++++ databusclient/cli.py | 12 ++- 2 files changed, 154 insertions(+), 2 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index ac55faa..7fe69a9 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -13,6 +13,53 @@ ) +def _extract_checksum_from_node(node) -> str | None: + """ + Try to extract a 64-char hex checksum from a JSON-LD file node. + Handles these common shapes: + - checksum or sha256sum fields as plain string + - checksum fields as dict with '@value' + - nested values (recursively search strings for a 64-char hex) + """ + def find_in_value(v): + if isinstance(v, str): + s = v.strip() + if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s): + return s + if isinstance(v, dict): + # common JSON-LD value object + if "@value" in v and isinstance(v["@value"], str): + res = find_in_value(v["@value"]) + if res: + return res + # try all nested dict values + for vv in v.values(): + res = find_in_value(vv) + if res: + return res + if isinstance(v, list): + for item in v: + res = find_in_value(item) + if res: + return res + return None + + # direct keys to try first + for key in ("checksum", "sha256sum", "sha256", "databus:checksum"): + if key in node: + res = find_in_value(node[key]) + if res: + return res + + # fallback: search all values recursively for a 64-char hex string + for v in node.values(): + res = find_in_value(v) + if res: + return res + return None + + + # Hosts that require Vault token based authentication. Central source of truth. VAULT_REQUIRED_HOSTS = { "data.dbpedia.io", @@ -32,6 +79,8 @@ def _download_file( databus_key=None, auth_url=None, client_id=None, + validate_checksum: bool = False, + expected_checksum: str | None = None, ) -> None: """ Download a file from the internet with a progress bar using tqdm. @@ -183,6 +232,26 @@ def _download_file( if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: raise IOError("Downloaded size does not match Content-Length header") + # --- 6. Optional checksum validation --- + if validate_checksum: + # reuse compute_sha256_and_length from webdav extension + try: + from databusclient.extensions.webdav import compute_sha256_and_length + + actual, _ = compute_sha256_and_length(filename) + except Exception: + actual = None + + if expected_checksum is None: + print(f"WARNING: no expected checksum available for {filename}; skipping validation") + elif actual is None: + print(f"WARNING: could not compute checksum for {filename}; skipping validation") + else: + if actual.lower() != expected_checksum.lower(): + raise IOError( + f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" + ) + def _download_files( urls: List[str], @@ -191,6 +260,8 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, + validate_checksum: bool = False, + checksums: dict | None = None, ) -> None: """ Download multiple files from the databus. 
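Note: the extraction helper above probes a fixed set of keys ("checksum",
"sha256sum", "sha256", "databus:checksum") and, in this first version, falls
back to a recursive scan of all node values (PATCH 6/9 later drops the
fallback to avoid false positives). A minimal sketch of the JSON-LD shapes it
is meant to cover; the sample nodes are invented for illustration, assume
_extract_checksum_from_node is importable from databusclient.api.download,
and all three resolve through the allowed keys, so they stay valid after the
later tightening:

    plain     = {"checksum": "ab" * 32}                        # plain string field
    value_obj = {"sha256sum": {"@value": "cd" * 32}}           # JSON-LD value object
    nested    = {"databus:checksum": [{"@value": "ef" * 32}]}  # list of value objects

    for node in (plain, value_obj, nested):
        print(_extract_checksum_from_node(node))  # each prints a 64-char hex digest
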
@@ -204,6 +275,9 @@ def _download_files( - client_id: Client ID for token exchange """ for url in urls: + expected = None + if checksums and isinstance(checksums, dict): + expected = checksums.get(url) _download_file( url=url, localDir=localDir, @@ -211,6 +285,8 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, + expected_checksum=expected, ) @@ -358,6 +434,7 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, + validate_checksum: bool = False ) -> None: """ Download all files in a databus collection. @@ -382,6 +459,7 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, ) @@ -392,6 +470,7 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, + validate_checksum: bool = False, ) -> None: """ Download all files in a databus artifact version. @@ -406,6 +485,22 @@ def _download_version( """ json_str = fetch_databus_jsonld(uri, databus_key=databus_key) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) + # build url -> checksum mapping from JSON-LD when available + checksums: dict = {} + try: + json_dict = json.loads(json_str) + graph = json_dict.get("@graph", []) + for node in graph: + if node.get("@type") == "Part": + file_uri = node.get("file") + if not isinstance(file_uri, str): + continue + expected = _extract_checksum_from_node(node) + if expected: + checksums[file_uri] = expected + except Exception: + checksums = {} + _download_files( file_urls, localDir, @@ -413,6 +508,8 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, + checksums=checksums, ) @@ -424,6 +521,7 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, + validate_checksum: bool = False, ) -> None: """ Download files in a databus artifact. @@ -445,6 +543,22 @@ def _download_artifact( print(f"Downloading version: {version_uri}") json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) + # extract checksums for this version + checksums: dict = {} + try: + jd = json.loads(json_str) + graph = jd.get("@graph", []) + for node in graph: + if node.get("@type") == "Part": + file_uri = node.get("file") + if not isinstance(file_uri, str): + continue + expected = _extract_checksum_from_node(node) + if expected: + checksums[file_uri] = expected + except Exception: + checksums = {} + _download_files( file_urls, localDir, @@ -452,6 +566,8 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, + checksums=checksums, ) @@ -527,6 +643,7 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, + validate_checksum: bool = False, ) -> None: """ Download files in a databus group. @@ -552,6 +669,7 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, ) @@ -598,6 +716,7 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", + validate_checksum: bool = False ) -> None: """ Download datasets from databus. 
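Note: the validation step in _download_file reuses compute_sha256_and_length
from the webdav extension rather than re-reading the file into memory. For
reviewers, a streaming SHA-256 helper of this kind is essentially the minimal
loop sketched below (the chunk size is an illustrative choice, not
necessarily the extension's actual value):

    import hashlib

    def sha256_and_length(path: str, chunk_size: int = 65536) -> tuple[str, int]:
        # Hash the file incrementally so large downloads never sit fully in memory.
        digest, total = hashlib.sha256(), 0
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                digest.update(chunk)
                total += len(chunk)
        return digest.hexdigest(), total
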
@@ -638,9 +757,27 @@ def download( databus_key, auth_url, client_id, + validate_checksum=validate_checksum, ) elif file is not None: print(f"Downloading file: {databusURI}") + # Try to fetch expected checksum from the parent Version metadata + expected = None + if validate_checksum: + try: + version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + json_dict = json.loads(json_str) + graph = json_dict.get("@graph", []) + for node in graph: + if node.get("file") == databusURI or node.get("@id") == databusURI: + expected = _extract_checksum_from_node(node) + if expected: + break + except Exception as e: + print(f"WARNING: Could not fetch checksum for single file: {e}") + + # Call the worker to download the single file (passes expected checksum) _download_file( databusURI, localDir, @@ -648,6 +785,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, + expected_checksum=expected, ) elif version is not None: print(f"Downloading version: {databusURI}") @@ -658,6 +797,8 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, + expected_checksum=expected, ) elif artifact is not None: print( @@ -671,6 +812,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, ) elif group is not None and group != "collections": print( @@ -684,6 +826,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, ) elif account is not None: print("accountId not supported yet") # TODO @@ -709,4 +852,5 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + validate_checksum=validate_checksum, ) diff --git a/databusclient/cli.py b/databusclient/cli.py index 069408e..8c70c4e 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -158,6 +158,11 @@ def deploy( show_default=True, help="Client ID for token exchange", ) +@click.option( + "--validate-checksum", + is_flag=True, + help="Validate checksums of downloaded files" +) def download( databusuris: List[str], localdir, @@ -167,7 +172,9 @@ def download( all_versions, authurl, clientid, + validate_checksum, ): + """ Download datasets from databus, optionally using vault access if vault options are provided. 
""" @@ -181,7 +188,8 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, - ) + validate_checksum=validate_checksum + ) except DownloadAuthError as e: raise click.ClickException(str(e)) @@ -214,4 +222,4 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) if __name__ == "__main__": - app() + download() From 743c6237a761a4d0625c404bf8955d49e76692e7 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:13:04 +0530 Subject: [PATCH 2/9] refactor: address CodeRabbit review comments --- databusclient/api/download.py | 4 +--- databusclient/cli.py | 5 ++--- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 7fe69a9..76ff3c0 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -12,6 +12,7 @@ get_databus_id_parts_from_file_url, ) +from databusclient.extensions.webdav import compute_sha256_and_length def _extract_checksum_from_node(node) -> str | None: """ @@ -236,8 +237,6 @@ def _download_file( if validate_checksum: # reuse compute_sha256_and_length from webdav extension try: - from databusclient.extensions.webdav import compute_sha256_and_length - actual, _ = compute_sha256_and_length(filename) except Exception: actual = None @@ -798,7 +797,6 @@ def download( auth_url=auth_url, client_id=client_id, validate_checksum=validate_checksum, - expected_checksum=expected, ) elif artifact is not None: print( diff --git a/databusclient/cli.py b/databusclient/cli.py index 8c70c4e..420530d 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -173,8 +173,7 @@ def download( authurl, clientid, validate_checksum, -): - +): """ Download datasets from databus, optionally using vault access if vault options are provided. """ @@ -222,4 +221,4 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) if __name__ == "__main__": - download() + app() From e33ab8cc8f1b5f926ad258cbb860a89701c91341 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:29:58 +0530 Subject: [PATCH 3/9] refactor: address CodeRabbit review comments (2) --- databusclient/api/download.py | 40 ++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 76ff3c0..e01f3b7 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -238,7 +238,8 @@ def _download_file( # reuse compute_sha256_and_length from webdav extension try: actual, _ = compute_sha256_and_length(filename) - except Exception: + except (OSError, IOError) as e: + print(f"WARNING: error computing checksum for {filename}: {e}") actual = None if expected_checksum is None: @@ -451,6 +452,42 @@ def _download_collection( file_urls = _get_file_download_urls_from_sparql_query( endpoint, query, databus_key=databus_key ) + + # If checksum validation requested, attempt to build url->checksum mapping + # by fetching the Version JSON-LD for each file's version. We group files + # by their version URI to avoid fetching the same metadata repeatedly. 
+ checksums: dict = {} + if validate_checksum: + # Map version_uri -> list of file urls + versions_map: dict = {} + for fu in file_urls: + try: + h, acc, grp, art, ver, f = get_databus_id_parts_from_file_url(fu) + except Exception: + continue + if ver is None: + continue + version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}" + versions_map.setdefault(version_uri, []).append(fu) + + # Fetch each version's JSON-LD once and extract checksums for its files + for version_uri, urls_in_version in versions_map.items(): + try: + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + jd = json.loads(json_str) + graph = jd.get("@graph", []) + for node in graph: + if node.get("@type") == "Part": + file_uri = node.get("file") + if not isinstance(file_uri, str): + continue + expected = _extract_checksum_from_node(node) + if expected and file_uri in urls_in_version: + checksums[file_uri] = expected + except Exception: + # Best-effort: if fetching a version fails, skip it + continue + _download_files( list(file_urls), localDir, @@ -459,6 +496,7 @@ def _download_collection( auth_url=auth_url, client_id=client_id, validate_checksum=validate_checksum, + checksums=checksums if checksums else None, ) From dc51aa9306dcd649bc555530659be626c014c94c Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:36:26 +0530 Subject: [PATCH 4/9] refactor: address CodeRabbit review comments (3) --- databusclient/api/download.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index e01f3b7..9881533 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -467,6 +467,8 @@ def _download_collection( continue if ver is None: continue + if h is None or acc is None or grp is None or art is None: + continue version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}" versions_map.setdefault(version_uri, []).append(fu) From 5875a8210395e5376248d8f8a6e21de3f863b8d3 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Wed, 31 Dec 2025 17:54:05 +0530 Subject: [PATCH 5/9] refactor: address CodeRabbit review comments (4) --- databusclient/api/download.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index 9881533..4af27c4 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -248,6 +248,8 @@ def _download_file( print(f"WARNING: could not compute checksum for {filename}; skipping validation") else: if actual.lower() != expected_checksum.lower(): + try: os.remove(filename) # delete corrupted file + except OSError: pass raise IOError( f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" ) @@ -878,6 +880,8 @@ def download( # query as argument else: print("QUERY {}", databusURI.replace("\n", " ")) + if validate_checksum: + print("WARNING: Checksum validation is not supported for user-defined SPARQL queries.") if uri_endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = _get_file_download_urls_from_sparql_query( From e253b812912834c79ac83c864fd9608fc090eecb Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Sun, 11 Jan 2026 15:36:20 +0530 Subject: [PATCH 6/9] refactor: address review comments (naming, strict checksum extraction) --- databusclient/api/download.py | 167 +++++++++++++++++++--------------- databusclient/api/utils.py | 14 ++- file.txt | 0 3 files changed, 105 insertions(+), 76 deletions(-) create mode 100644 file.txt diff --git 
a/databusclient/api/download.py b/databusclient/api/download.py index 4af27c4..dde1db4 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -1,5 +1,6 @@ import json import os +import re from typing import List from urllib.parse import urlparse @@ -12,7 +13,10 @@ get_databus_id_parts_from_file_url, ) -from databusclient.extensions.webdav import compute_sha256_and_length +from databusclient.api.utils import compute_sha256_and_length + +# compiled regex for SHA-256 hex strings +_SHA256_RE = re.compile(r"^[0-9a-fA-F]{64}$") def _extract_checksum_from_node(node) -> str | None: """ @@ -20,12 +24,12 @@ def _extract_checksum_from_node(node) -> str | None: Handles these common shapes: - checksum or sha256sum fields as plain string - checksum fields as dict with '@value' - - nested values (recursively search strings for a 64-char hex) + - nested values under the allowed keys (lists or '@value' objects) """ def find_in_value(v): if isinstance(v, str): s = v.strip() - if len(s) == 64 and all(c in "0123456789abcdefABCDEF" for c in s): + if _SHA256_RE.match(s): return s if isinstance(v, dict): # common JSON-LD value object @@ -45,18 +49,13 @@ def find_in_value(v): return res return None - # direct keys to try first + # Only inspect the explicitly allowed keys to avoid false positives. for key in ("checksum", "sha256sum", "sha256", "databus:checksum"): if key in node: res = find_in_value(node[key]) if res: return res - # fallback: search all values recursively for a 64-char hex string - for v in node.values(): - res = find_in_value(v) - if res: - return res return None @@ -73,6 +72,67 @@ class DownloadAuthError(Exception): +def _extract_checksums_from_jsonld(json_str: str) -> dict: + """ + Parse a JSON-LD string and return a mapping of file URI (and @id) -> checksum. + + Uses the existing _extract_checksum_from_node logic to extract checksums + from `Part` nodes. Both the node's `file` and `@id` (if present and a + string) are mapped to the checksum to preserve existing lookup behavior. + """ + try: + jd = json.loads(json_str) + except Exception: + return {} + graph = jd.get("@graph", []) + checksums: dict = {} + for node in graph: + if node.get("@type") == "Part": + expected = _extract_checksum_from_node(node) + if not expected: + continue + file_uri = node.get("file") + if isinstance(file_uri, str): + checksums[file_uri] = expected + node_id = node.get("@id") + if isinstance(node_id, str): + checksums[node_id] = expected + return checksums + + +def _resolve_checksums_for_urls(file_urls: List[str], databus_key: str | None) -> dict: + """ + Group file URLs by their Version URI, fetch each Version JSON-LD once, + and return a combined url->checksum mapping for the provided URLs. + + Best-effort: failures to fetch or parse individual versions are skipped. 
+ """ + versions_map: dict = {} + for file_url in file_urls: + try: + host, accountId, groupId, artifactId, versionId, fileId = get_databus_id_parts_from_file_url(file_url) + except Exception: + continue + if versionId is None: + continue + if host is None or accountId is None or groupId is None or artifactId is None: + continue + version_uri = f"https://{host}/{accountId}/{groupId}/{artifactId}/{versionId}" + versions_map.setdefault(version_uri, []).append(file_url) + + checksums: dict = {} + for version_uri, urls_in_version in versions_map.items(): + try: + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + extracted_checksums = _extract_checksums_from_jsonld(json_str) + for url in urls_in_version: + if url in extracted_checksums: + checksums[url] = extracted_checksums[url] + except Exception: + # Best-effort: skip versions we cannot fetch or parse + continue + return checksums + def _download_file( url, localDir, @@ -248,8 +308,10 @@ def _download_file( print(f"WARNING: could not compute checksum for {filename}; skipping validation") else: if actual.lower() != expected_checksum.lower(): - try: os.remove(filename) # delete corrupted file - except OSError: pass + try: + os.remove(filename) # delete corrupted file + except OSError: + pass raise IOError( f"Checksum mismatch for {filename}: expected {expected_checksum}, got {actual}" ) @@ -456,41 +518,9 @@ def _download_collection( ) # If checksum validation requested, attempt to build url->checksum mapping - # by fetching the Version JSON-LD for each file's version. We group files - # by their version URI to avoid fetching the same metadata repeatedly. checksums: dict = {} if validate_checksum: - # Map version_uri -> list of file urls - versions_map: dict = {} - for fu in file_urls: - try: - h, acc, grp, art, ver, f = get_databus_id_parts_from_file_url(fu) - except Exception: - continue - if ver is None: - continue - if h is None or acc is None or grp is None or art is None: - continue - version_uri = f"https://{h}/{acc}/{grp}/{art}/{ver}" - versions_map.setdefault(version_uri, []).append(fu) - - # Fetch each version's JSON-LD once and extract checksums for its files - for version_uri, urls_in_version in versions_map.items(): - try: - json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) - jd = json.loads(json_str) - graph = jd.get("@graph", []) - for node in graph: - if node.get("@type") == "Part": - file_uri = node.get("file") - if not isinstance(file_uri, str): - continue - expected = _extract_checksum_from_node(node) - if expected and file_uri in urls_in_version: - checksums[file_uri] = expected - except Exception: - # Best-effort: if fetching a version fails, skip it - continue + checksums = _resolve_checksums_for_urls(list(file_urls), databus_key) _download_files( list(file_urls), @@ -529,16 +559,7 @@ def _download_version( # build url -> checksum mapping from JSON-LD when available checksums: dict = {} try: - json_dict = json.loads(json_str) - graph = json_dict.get("@graph", []) - for node in graph: - if node.get("@type") == "Part": - file_uri = node.get("file") - if not isinstance(file_uri, str): - continue - expected = _extract_checksum_from_node(node) - if expected: - checksums[file_uri] = expected + checksums = _extract_checksums_from_jsonld(json_str) except Exception: checksums = {} @@ -587,16 +608,7 @@ def _download_artifact( # extract checksums for this version checksums: dict = {} try: - jd = json.loads(json_str) - graph = jd.get("@graph", []) - for node in graph: - if node.get("@type") == 
"Part": - file_uri = node.get("file") - if not isinstance(file_uri, str): - continue - expected = _extract_checksum_from_node(node) - if expected: - checksums[file_uri] = expected + checksums = _extract_checksums_from_jsonld(json_str) except Exception: checksums = {} @@ -806,15 +818,13 @@ def download( expected = None if validate_checksum: try: - version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" - json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) - json_dict = json.loads(json_str) - graph = json_dict.get("@graph", []) - for node in graph: - if node.get("file") == databusURI or node.get("@id") == databusURI: - expected = _extract_checksum_from_node(node) - if expected: - break + version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + checks = _extract_checksums_from_jsonld(json_str) + expected = checks.get(databusURI) + if expected is None: + # fallback: try lookup by @id (helper already maps @id too) + expected = checks.get(databusURI) except Exception as e: print(f"WARNING: Could not fetch checksum for single file: {e}") @@ -880,13 +890,19 @@ def download( # query as argument else: print("QUERY {}", databusURI.replace("\n", " ")) - if validate_checksum: - print("WARNING: Checksum validation is not supported for user-defined SPARQL queries.") if uri_endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = _get_file_download_urls_from_sparql_query( uri_endpoint, databusURI, databus_key=databus_key ) + + # If checksum validation requested, try to build url->checksum mapping + checksums: dict = {} + if validate_checksum: + checksums = _resolve_checksums_for_urls(res, databus_key) + if not checksums: + print("WARNING: Checksum validation enabled but no checksums found for query results.") + _download_files( res, localDir, @@ -895,4 +911,5 @@ def download( auth_url=auth_url, client_id=client_id, validate_checksum=validate_checksum, + checksums=checksums if checksums else None, ) diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index 7e27ff3..a1a1063 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -1,5 +1,5 @@ from typing import Optional, Tuple - +import hashlib import requests @@ -48,3 +48,15 @@ def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: response.raise_for_status() return response.text + +def compute_sha256_and_length(filepath): + sha256 = hashlib.sha256() + total_length = 0 + with open(filepath, "rb") as f: + while True: + chunk = f.read(4096) + if not chunk: + break + sha256.update(chunk) + total_length += len(chunk) + return sha256.hexdigest(), total_length diff --git a/file.txt b/file.txt new file mode 100644 index 0000000..e69de29 From 278ee5e93489a9fb0c024034550ff0811c78bf4b Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Sun, 11 Jan 2026 16:29:40 +0530 Subject: [PATCH 7/9] refactor: addressed coderabbit suggestions --- databusclient/api/download.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index dde1db4..a5d130c 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -84,7 +84,13 @@ def _extract_checksums_from_jsonld(json_str: str) -> dict: jd = json.loads(json_str) except Exception: return {} - graph = jd.get("@graph", []) + if isinstance(jd, list): + graph = jd.get("@graph", []) + elif 
isinstance(jd, list): + graph = jd + else: + return{} + checksums: dict = {} for node in graph: if node.get("@type") == "Part": @@ -248,7 +254,7 @@ def _download_file( # for user-friendly CLI output. vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id) headers["Authorization"] = f"Bearer {vault_token}" - headers.pop("Accept-Encoding", None) + headers["Accept-Encoding"] = "identity" # Retry with token response = requests.get(url, headers=headers, stream=True, timeout=30) @@ -818,13 +824,13 @@ def download( expected = None if validate_checksum: try: + if version is not None: version_uri = f"https://{host}/{account}/{group}/{artifact}/{version}" json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) checks = _extract_checksums_from_jsonld(json_str) - expected = checks.get(databusURI) - if expected is None: - # fallback: try lookup by @id (helper already maps @id too) - expected = checks.get(databusURI) + expected = checks.get(databusURI) or checks.get( + "https://" + databusURI.removeprefix("http://").removeprefix("https://") + ) except Exception as e: print(f"WARNING: Could not fetch checksum for single file: {e}") From ab28258880319bb2a31b6c22d6c241aa2f564fca Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Sun, 11 Jan 2026 16:36:22 +0530 Subject: [PATCH 8/9] refactor: final edits --- databusclient/api/download.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/databusclient/api/download.py b/databusclient/api/download.py index a5d130c..60b845b 100644 --- a/databusclient/api/download.py +++ b/databusclient/api/download.py @@ -84,7 +84,7 @@ def _extract_checksums_from_jsonld(json_str: str) -> dict: jd = json.loads(json_str) except Exception: return {} - if isinstance(jd, list): + if isinstance(jd, dict): graph = jd.get("@graph", []) elif isinstance(jd, list): graph = jd @@ -830,7 +830,7 @@ def download( checks = _extract_checksums_from_jsonld(json_str) expected = checks.get(databusURI) or checks.get( "https://" + databusURI.removeprefix("http://").removeprefix("https://") - ) + ) except Exception as e: print(f"WARNING: Could not fetch checksum for single file: {e}") From 18250caa875aa407662fc67e1241c276d87a31b7 Mon Sep 17 00:00:00 2001 From: agrim-git-hub Date: Sun, 11 Jan 2026 16:38:06 +0530 Subject: [PATCH 9/9] fix: removing file.txt --- file.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 file.txt diff --git a/file.txt b/file.txt deleted file mode 100644 index e69de29..0000000
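
Note: end to end, the series keeps validation strictly opt-in. Without
--validate-checksum the download path is unchanged; with it, a file whose
computed digest differs from the metadata checksum is deleted and an IOError
is raised, while files with no usable checksum in their metadata download
normally after a warning. Assuming the package's console entry point is named
databusclient (the entry-point name is not visible in this diff), a typical
invocation with a placeholder URI would be:

    databusclient download --validate-checksum \
        https://databus.dbpedia.org/exampleuser/examplegroup/exampleartifact/2024.01.01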