Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 30 additions & 2 deletions seqBackupLib/illumina.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,45 @@
import csv
import re
import warnings
from io import TextIOWrapper
from pathlib import Path
from urllib.error import URLError
from urllib.request import urlopen


MACHINE_TYPES = {
MACHINE_TYPES_FALLBACK = {
"VH": "Illumina-NextSeq",
"D": "Illumina-HiSeq",
"M": "Illumina-MiSeq",
"A": "Illumina-NovaSeq",
"NB": "Illumina-MiniSeq",
"LH": "Illumina-NovaSeqX",
"SH": "Illumina-MiSeq",
}
} # Fallback mapping if machine_types.tsv is unavailable.
MACHINE_TYPES_URL = (
"https://raw.githubusercontent.com/PennChopMicrobiomeProgram/"
"SampleRegistry/master/sample_registry/data/machine_types.tsv"
)
try:
with urlopen(MACHINE_TYPES_URL, timeout=10) as response:
rows = list(
csv.reader(response.read().decode("utf-8").splitlines(), delimiter="\t")
)
if rows and rows[0] and rows[0][0].lower() in {"instrument_code", "code"}:
rows = rows[1:]
MACHINE_TYPES = {
row[0].strip(): row[1].strip()
for row in rows
if len(row) >= 2 and row[0].strip() and row[1].strip()
}
if not MACHINE_TYPES:
raise ValueError("machine_types.tsv contained no usable rows")
except (URLError, TimeoutError, ValueError) as exc:
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exception handler should catch OSError instead of just URLError and TimeoutError. While URLError is a subclass of OSError and will catch many network errors, other OSError subclasses like ConnectionRefusedError, ConnectionResetError, and SSL-related errors can also be raised by urlopen. Catching OSError would provide more comprehensive error handling and ensure the fallback is used for all network-related failures. Alternatively, catch Exception if you want to ensure any failure triggers the fallback.

Suggested change
except (URLError, TimeoutError, ValueError) as exc:
except (OSError, ValueError) as exc:

Copilot uses AI. Check for mistakes.
warnings.warn(
f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}",
RuntimeWarning,
)
MACHINE_TYPES = MACHINE_TYPES_FALLBACK
Comment on lines +37 to +42
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Catching ValueError in the exception handler (line 37) will catch not only the "machine_types.tsv contained no usable rows" error but also any ValueError raised by csv.reader or during string decoding. While this is likely the intended behavior, it may mask unexpected errors. Consider being more specific about which ValueError instances should trigger the fallback, or add logging to distinguish between different failure modes.

Suggested change
except (URLError, TimeoutError, ValueError) as exc:
warnings.warn(
f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}",
RuntimeWarning,
)
MACHINE_TYPES = MACHINE_TYPES_FALLBACK
except (URLError, TimeoutError) as exc:
warnings.warn(
f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}",
RuntimeWarning,
)
MACHINE_TYPES = MACHINE_TYPES_FALLBACK
except ValueError as exc:
if str(exc) == "machine_types.tsv contained no usable rows":
warnings.warn(
f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}",
RuntimeWarning,
)
MACHINE_TYPES = MACHINE_TYPES_FALLBACK
else:
raise

Copilot uses AI. Check for mistakes.
Comment on lines +23 to +42
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The import-time network fetch is the stated design goal, but be aware of operational implications: (1) Python caches modules after first import, so subsequent imports in the same process won't make new requests; (2) However, every new Python process will make a request, which could cause issues in high-frequency scenarios like AWS Lambda cold starts or parallel test execution; (3) The 10-second timeout means slow networks could delay application startup. These tradeoffs are acceptable for many use cases but worth documenting.

Copilot uses AI. Check for mistakes.


def extract_instrument_code(instrument: str) -> str:
Expand Down
97 changes: 86 additions & 11 deletions test/test_illumina.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import gzip
import importlib
from urllib.error import URLError

import pytest
from pathlib import Path

from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE
from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES


machine_fixtures = {
Expand All @@ -16,8 +18,44 @@
}


@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
def test_illumina_fastq(machine_type, request):
@pytest.fixture
def illumina_module(monkeypatch):
tsv_rows = ["instrument_code\tmachine_type"]
fallback = {
"VH": "Illumina-NextSeq",
"D": "Illumina-HiSeq",
"M": "Illumina-MiSeq",
"A": "Illumina-NovaSeq",
"NB": "Illumina-MiniSeq",
"LH": "Illumina-NovaSeqX",
"SH": "Illumina-MiSeq",
}
tsv_rows.extend(f"{code}\t{machine}" for code, machine in fallback.items())
tsv = "\n".join(tsv_rows) + "\n"

class FakeResponse:
def __init__(self, data: str):
self._data = data

def read(self):
return self._data.encode("utf-8")

def __enter__(self):
return self

def __exit__(self, exc_type, exc, tb):
return False

monkeypatch.setattr(
"urllib.request.urlopen", lambda *args, **kwargs: FakeResponse(tsv)
)
import seqBackupLib.illumina as illumina

return importlib.reload(illumina)
Comment on lines +22 to +54
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Using importlib.reload() modifies global module state and can cause test isolation issues if tests run in parallel. Consider using a more isolated approach such as extracting the TSV loading logic into a testable function, or document that these tests must run serially. The current approach where illumina_module fixture reloads the module means any test using this fixture will affect the global MACHINE_TYPES state.

Copilot uses AI. Check for mistakes.


@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The parametrization was changed from MACHINE_TYPES.keys() to machine_fixtures.keys(). This means if new machine types are added to MACHINE_TYPES_FALLBACK but not to machine_fixtures, they won't be tested. Consider adding a separate test that verifies all keys in MACHINE_TYPES_FALLBACK have corresponding fixtures in machine_fixtures, or document that machine_fixtures must be kept in sync with MACHINE_TYPES_FALLBACK.

Copilot uses AI. Check for mistakes.
def test_illumina_fastq(machine_type, request, illumina_module):
fixture_name = machine_fixtures.get(machine_type)
if not fixture_name:
raise ValueError(
Expand All @@ -27,18 +65,18 @@ def test_illumina_fastq(machine_type, request):
fp = request.getfixturevalue(fixture_name)

with gzip.open(fp / "Undetermined_S0_L001_R1_001.fastq.gz", "rt") as f:
r1 = IlluminaFastq(f)
r1 = illumina_module.IlluminaFastq(f)

print("FASTQ info: ", r1.fastq_info, "\nFolder info: ", r1.folder_info)
assert r1.machine_type == MACHINE_TYPES[machine_type]
assert r1.machine_type == illumina_module.MACHINE_TYPES[machine_type]
assert r1.check_fp_vs_content()[0], r1.check_fp_vs_content()
assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE)
assert r1.check_file_size(100)
assert r1.check_index_read_exists()


@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
def test_illumina_dir(machine_type, request):
@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
def test_illumina_dir(machine_type, request, illumina_module):
fixture_name = machine_fixtures.get(machine_type)
if not fixture_name:
raise ValueError(
Expand All @@ -47,14 +85,51 @@ def test_illumina_dir(machine_type, request):

fp = request.getfixturevalue(fixture_name)

d = IlluminaDir(fp.name)
d = illumina_module.IlluminaDir(fp.name)
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Variable d is not used.

Suggested change
d = illumina_module.IlluminaDir(fp.name)
illumina_module.IlluminaDir(fp.name)

Copilot uses AI. Check for mistakes.


def test_illumina_fastq_without_lane(novaseq_dir):
def test_illumina_fastq_without_lane(novaseq_dir, illumina_module):
original = novaseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz"
renamed = novaseq_dir / "Undetermined_S0_R1_001.fastq.gz"
original.rename(renamed)
with gzip.open(renamed, "rt") as f:
r1 = IlluminaFastq(f)
r1 = illumina_module.IlluminaFastq(f)
assert r1.check_fp_vs_content()[0]
assert r1.build_archive_dir().endswith("L001")


def test_load_machine_types_from_tsv(monkeypatch):
tsv = "instrument_code\tmachine_type\nZZ\tIllumina-Test\n"

class FakeResponse:
def __init__(self, data: str):
self._data = data

def read(self):
return self._data.encode("utf-8")

def __enter__(self):
return self

def __exit__(self, exc_type, exc, tb):
return False
Comment on lines +104 to +115
Copy link

Copilot AI Jan 30, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The FakeResponse class is duplicated in both illumina_module fixture and test_load_machine_types_from_tsv. Consider extracting this into a helper function or a separate fixture to reduce duplication and improve maintainability.

Copilot uses AI. Check for mistakes.

monkeypatch.setattr(
"urllib.request.urlopen", lambda *args, **kwargs: FakeResponse(tsv)
)
import seqBackupLib.illumina as illumina

illumina = importlib.reload(illumina)
assert illumina.MACHINE_TYPES["ZZ"] == "Illumina-Test"


def test_load_machine_types_fallback_warning(monkeypatch):
def raise_url_error(*args, **kwargs):
raise URLError("network down")

monkeypatch.setattr("urllib.request.urlopen", raise_url_error)
import seqBackupLib.illumina as illumina

with pytest.warns(RuntimeWarning, match="Falling back to bundled machine types"):
illumina = importlib.reload(illumina)
assert illumina.MACHINE_TYPES == illumina.MACHINE_TYPES_FALLBACK
Loading