diff --git a/cycode/cli/apps/report/sbom/repository_url/repository_url_command.py b/cycode/cli/apps/report/sbom/repository_url/repository_url_command.py index 9e2f4885..e0955871 100644 --- a/cycode/cli/apps/report/sbom/repository_url/repository_url_command.py +++ b/cycode/cli/apps/report/sbom/repository_url/repository_url_command.py @@ -8,6 +8,10 @@ from cycode.cli.utils.get_api_client import get_report_cycode_client from cycode.cli.utils.progress_bar import SbomReportProgressBarSection from cycode.cli.utils.sentry import add_breadcrumb +from cycode.cli.utils.url_utils import sanitize_repository_url +from cycode.logger import get_logger + +logger = get_logger('Repository URL Command') def repository_url_command( @@ -28,8 +32,13 @@ def repository_url_command( start_scan_time = time.time() report_execution_id = -1 + # Sanitize repository URL to remove any embedded credentials/tokens before sending to API + sanitized_uri = sanitize_repository_url(uri) + if sanitized_uri != uri: + logger.debug('Sanitized repository URL to remove credentials') + try: - report_execution = client.request_sbom_report_execution(report_parameters, repository_url=uri) + report_execution = client.request_sbom_report_execution(report_parameters, repository_url=sanitized_uri) report_execution_id = report_execution.id create_sbom_report(progress_bar, client, report_execution_id, output_file, output_format) diff --git a/cycode/cli/apps/scan/remote_url_resolver.py b/cycode/cli/apps/scan/remote_url_resolver.py index 967e6ea0..870115e2 100644 --- a/cycode/cli/apps/scan/remote_url_resolver.py +++ b/cycode/cli/apps/scan/remote_url_resolver.py @@ -3,6 +3,7 @@ from cycode.cli import consts from cycode.cli.utils.git_proxy import git_proxy from cycode.cli.utils.shell_executor import shell +from cycode.cli.utils.url_utils import sanitize_repository_url from cycode.logger import get_logger logger = get_logger('Remote URL Resolver') @@ -102,7 +103,11 @@ def _try_get_git_remote_url(path: str) -> Optional[str]: 
# Schemes that Git treats as SSH transports; the login name is part of the address there.
_SSH_LIKE_SCHEMES = frozenset({'ssh', 'git+ssh', 'ssh+git'})
# Characters allowed in a URL scheme in addition to ASCII letters and digits (RFC 3986, section 3.1).
_SCHEME_EXTRA_CHARS = frozenset('+-.')


def _is_url_scheme(candidate: str) -> bool:
    """Return True if *candidate* is a syntactically valid RFC 3986 URL scheme."""
    if not candidate or not candidate.isascii() or not candidate[0].isalpha():
        return False
    return all(ch.isalnum() or ch in _SCHEME_EXTRA_CHARS for ch in candidate)


def sanitize_repository_url(url: Optional[str]) -> Optional[str]:
    """Remove embedded credentials (username, password, tokens) from a repository URL.

    The userinfo part of the URL authority (``user:secret@``) is stripped textually,
    so the function never raises and never falls back to returning a URL that still
    contains a secret. Everything else (scheme, host spelling and case, IPv6
    brackets, port, path, query, fragment) is preserved byte-for-byte, and a URL
    that carries no userinfo is returned unchanged.

    For SSH transports (``ssh://``, ``git+ssh://``, ``ssh+git://``) the login name is
    part of the address and is kept; only a password, if present, is removed.
    Strings without a ``scheme://`` prefix (scp-style ``git@host:path``, local paths,
    arbitrary text) have no authority component and are returned as-is.

    Args:
        url: Repository URL that may contain credentials
            (e.g., https://token@github.com/user/repo.git).

    Returns:
        The URL without credentials (e.g., https://github.com/user/repo.git).
        ``None`` or an empty string is returned unchanged.

    Examples:
        >>> sanitize_repository_url('https://token@github.com/user/repo.git')
        'https://github.com/user/repo.git'
        >>> sanitize_repository_url('https://user:token@github.com/user/repo.git')
        'https://github.com/user/repo.git'
        >>> sanitize_repository_url('ssh://git:secret@github.com/user/repo.git')
        'ssh://git@github.com/user/repo.git'
        >>> sanitize_repository_url('git@github.com:user/repo.git')
        'git@github.com:user/repo.git'
        >>> sanitize_repository_url(None) is None
        True
    """
    if not url:
        return url

    scheme, separator, remainder = url.partition('://')
    if not separator or not _is_url_scheme(scheme):
        # No authority component: scp-style address, local path, or not a URL at all.
        return url

    # The authority ends at the first '/', '?' or '#' (RFC 3986, section 3.2).
    delimiter_positions = [pos for pos in (remainder.find(delim) for delim in '/?#') if pos != -1]
    authority_end = min(delimiter_positions, default=len(remainder))
    authority, tail = remainder[:authority_end], remainder[authority_end:]

    # Split on the LAST '@' so that a password containing '@' is removed completely.
    userinfo, at_sign, host_and_port = authority.rpartition('@')
    if not at_sign:
        # Nothing to strip; return the caller's exact string so equality checks stay meaningful.
        return url

    if scheme.lower() in _SSH_LIKE_SCHEMES:
        # Keep the SSH login name (e.g. "git"), drop only the password after the first ':'.
        login = userinfo.partition(':')[0]
        if login:
            host_and_port = f'{login}@{host_and_port}'

    return f'{scheme}://{host_and_port}{tail}'
def test_sanitize_repository_url_with_token() -> None:
    """A bare PAT token in the userinfo part of an HTTPS URL is stripped."""
    assert sanitize_repository_url('https://token@github.com/user/repo.git') == 'https://github.com/user/repo.git'


def test_sanitize_repository_url_with_username_and_token() -> None:
    """A ``user:token`` pair in an HTTPS URL is stripped entirely."""
    cleaned = sanitize_repository_url('https://user:token@github.com/user/repo.git')
    assert cleaned == 'https://github.com/user/repo.git'


def test_sanitize_repository_url_with_port() -> None:
    """An explicit port survives credential removal."""
    cleaned = sanitize_repository_url('https://token@github.com:443/user/repo.git')
    assert cleaned == 'https://github.com:443/user/repo.git'


def test_sanitize_repository_url_ssh_format() -> None:
    """An scp-style SSH address comes back exactly as given."""
    scp_style = 'git@github.com:user/repo.git'
    assert sanitize_repository_url(scp_style) == scp_style


def test_sanitize_repository_url_ssh_protocol() -> None:
    """An ``ssh://`` URL comes back exactly as given."""
    ssh_url = 'ssh://git@github.com/user/repo.git'
    assert sanitize_repository_url(ssh_url) == ssh_url


def test_sanitize_repository_url_no_credentials() -> None:
    """A URL that has no credentials is returned unchanged."""
    plain_url = 'https://github.com/user/repo.git'
    assert sanitize_repository_url(plain_url) == plain_url


def test_sanitize_repository_url_none() -> None:
    """``None`` in, ``None`` out."""
    assert sanitize_repository_url(None) is None


def test_sanitize_repository_url_empty_string() -> None:
    """An empty string is returned as an empty string."""
    assert sanitize_repository_url('') == ''


def test_sanitize_repository_url_gitlab() -> None:
    """GitLab-style ``oauth2:token`` credentials are stripped."""
    cleaned = sanitize_repository_url('https://oauth2:token@gitlab.com/user/repo.git')
    assert cleaned == 'https://gitlab.com/user/repo.git'


def test_sanitize_repository_url_bitbucket() -> None:
    """Bitbucket-style ``x-token-auth:token`` credentials are stripped."""
    cleaned = sanitize_repository_url('https://x-token-auth:token@bitbucket.org/user/repo.git')
    assert cleaned == 'https://bitbucket.org/user/repo.git'


def test_sanitize_repository_url_with_path_and_query() -> None:
    """Path, query string and fragment are kept when credentials are stripped."""
    cleaned = sanitize_repository_url('https://token@github.com/user/repo.git?ref=main#section')
    assert cleaned == 'https://github.com/user/repo.git?ref=main#section'


def test_sanitize_repository_url_invalid_url() -> None:
    """A string that is not a URL comes back unchanged and no exception is raised."""
    not_a_url = 'not-a-valid-url'
    # The call itself must not raise; the value must equal the input.
    outcome = sanitize_repository_url(not_a_url)
    assert outcome == not_a_url