From 48f3b6ae01ab1876ad36c870f02a160375b5eb8b Mon Sep 17 00:00:00 2001
From: Charlie
Date: Fri, 30 Jan 2026 17:20:09 -0500
Subject: [PATCH] Simplify machine type loading

---
 seqBackupLib/illumina.py |  39 ++++++++++++++-
 test/test_illumina.py    | 100 +++++++++++++++++++++++++++++++----
 2 files changed, 126 insertions(+), 13 deletions(-)

diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py
index fbe2b8e..c448594 100644
--- a/seqBackupLib/illumina.py
+++ b/seqBackupLib/illumina.py
@@ -1,9 +1,13 @@
+import csv
 import re
+import warnings
+from http.client import HTTPException
 from io import TextIOWrapper
 from pathlib import Path
+from urllib.request import urlopen
 
 
-MACHINE_TYPES = {
+MACHINE_TYPES_FALLBACK = {
     "VH": "Illumina-NextSeq",
     "D": "Illumina-HiSeq",
     "M": "Illumina-MiSeq",
@@ -11,7 +15,38 @@
     "NB": "Illumina-MiniSeq",
     "LH": "Illumina-NovaSeqX",
     "SH": "Illumina-MiSeq",
-}
+}  # Fallback mapping if machine_types.tsv is unavailable.
+MACHINE_TYPES_URL = (
+    "https://raw.githubusercontent.com/PennChopMicrobiomeProgram/"
+    "SampleRegistry/master/sample_registry/data/machine_types.tsv"
+)
+# Load the instrument-code mapping from the SampleRegistry TSV at import time;
+# any network or parsing failure degrades to MACHINE_TYPES_FALLBACK below.
+try:
+    with urlopen(MACHINE_TYPES_URL, timeout=10) as response:
+        rows = list(
+            csv.reader(response.read().decode("utf-8").splitlines(), delimiter="\t")
+        )
+    # Drop the header row when its first column is a known header label.
+    if rows and rows[0] and rows[0][0].lower() in {"instrument_code", "code"}:
+        rows = rows[1:]
+    MACHINE_TYPES = {
+        row[0].strip(): row[1].strip()
+        for row in rows
+        if len(row) >= 2 and row[0].strip() and row[1].strip()
+    }
+    if not MACHINE_TYPES:
+        raise ValueError("machine_types.tsv contained no usable rows")
+except (OSError, HTTPException, csv.Error, ValueError) as exc:
+    # OSError covers URLError/HTTPError, timeouts and dropped connections;
+    # HTTPException covers truncated reads; ValueError covers bad UTF-8 and
+    # the empty-table case raised above.
+    warnings.warn(
+        f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}",
+        RuntimeWarning,
+    )
+    # Copy so later mutation of MACHINE_TYPES cannot alter the fallback table.
+    MACHINE_TYPES = dict(MACHINE_TYPES_FALLBACK)
 
 
 def extract_instrument_code(instrument: str) -> str:
diff --git a/test/test_illumina.py b/test/test_illumina.py
index ada9adc..3ac920e 100644
--- a/test/test_illumina.py
+++ b/test/test_illumina.py
@@ -1,8 +1,10 @@
 import gzip
+import importlib
+from urllib.error import URLError
+
 import pytest
-from pathlib import Path
+
 from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE
-from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES
 
 
 machine_fixtures = {
@@ -16,8 +18,45 @@
 }
 
 
-@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
-def test_illumina_fastq(machine_type, request):
+@pytest.fixture
+def illumina_module(monkeypatch):
+    """Reload the illumina module with urlopen stubbed to serve a fixed TSV."""
+    tsv_rows = ["instrument_code\tmachine_type"]
+    fallback = {
+        "VH": "Illumina-NextSeq",
+        "D": "Illumina-HiSeq",
+        "M": "Illumina-MiSeq",
+        "A": "Illumina-NovaSeq",
+        "NB": "Illumina-MiniSeq",
+        "LH": "Illumina-NovaSeqX",
+        "SH": "Illumina-MiSeq",
+    }
+    tsv_rows.extend(f"{code}\t{machine}" for code, machine in fallback.items())
+    tsv = "\n".join(tsv_rows) + "\n"
+
+    class FakeResponse:
+        def __init__(self, data: str):
+            self._data = data
+
+        def read(self):
+            return self._data.encode("utf-8")
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+    monkeypatch.setattr(
+        "urllib.request.urlopen", lambda *args, **kwargs: FakeResponse(tsv)
+    )
+    import seqBackupLib.illumina as illumina
+
+    return importlib.reload(illumina)
+
+
+@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
+def test_illumina_fastq(machine_type, request, illumina_module):
     fixture_name = machine_fixtures.get(machine_type)
     if not fixture_name:
         raise ValueError(
@@ -27,18 +66,18 @@ def test_illumina_fastq(machine_type, request):
     fp = request.getfixturevalue(fixture_name)
 
     with gzip.open(fp / "Undetermined_S0_L001_R1_001.fastq.gz", "rt") as f:
-        r1 = IlluminaFastq(f)
+        r1 = illumina_module.IlluminaFastq(f)
 
     print("FASTQ info: ", r1.fastq_info, "\nFolder info: ", r1.folder_info)
-    assert r1.machine_type == MACHINE_TYPES[machine_type]
+    assert r1.machine_type == illumina_module.MACHINE_TYPES[machine_type]
     assert r1.check_fp_vs_content()[0], r1.check_fp_vs_content()
     assert not r1.check_file_size(DEFAULT_MIN_FILE_SIZE)
     assert r1.check_file_size(100)
     assert r1.check_index_read_exists()
 
 
-@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
-def test_illumina_dir(machine_type, request):
+@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
+def test_illumina_dir(machine_type, request, illumina_module):
     fixture_name = machine_fixtures.get(machine_type)
     if not fixture_name:
         raise ValueError(
@@ -47,14 +86,53 @@ def test_illumina_dir(machine_type, request):
 
     fp = request.getfixturevalue(fixture_name)
 
-    d = IlluminaDir(fp.name)
+    d = illumina_module.IlluminaDir(fp.name)
 
 
-def test_illumina_fastq_without_lane(novaseq_dir):
+def test_illumina_fastq_without_lane(novaseq_dir, illumina_module):
     original = novaseq_dir / "Undetermined_S0_L001_R1_001.fastq.gz"
     renamed = novaseq_dir / "Undetermined_S0_R1_001.fastq.gz"
     original.rename(renamed)
     with gzip.open(renamed, "rt") as f:
-        r1 = IlluminaFastq(f)
+        r1 = illumina_module.IlluminaFastq(f)
     assert r1.check_fp_vs_content()[0]
     assert r1.build_archive_dir().endswith("L001")
+
+
+def test_load_machine_types_from_tsv(monkeypatch):
+    """MACHINE_TYPES is built from the fetched TSV, skipping the header row."""
+    tsv = "instrument_code\tmachine_type\nZZ\tIllumina-Test\n"
+
+    class FakeResponse:
+        def __init__(self, data: str):
+            self._data = data
+
+        def read(self):
+            return self._data.encode("utf-8")
+
+        def __enter__(self):
+            return self
+
+        def __exit__(self, exc_type, exc, tb):
+            return False
+
+    monkeypatch.setattr(
+        "urllib.request.urlopen", lambda *args, **kwargs: FakeResponse(tsv)
+    )
+    import seqBackupLib.illumina as illumina
+
+    illumina = importlib.reload(illumina)
+    assert illumina.MACHINE_TYPES["ZZ"] == "Illumina-Test"
+
+
+def test_load_machine_types_fallback_warning(monkeypatch):
+    """A URLError from urlopen warns and falls back to the bundled mapping."""
+    def raise_url_error(*args, **kwargs):
+        raise URLError("network down")
+
+    monkeypatch.setattr("urllib.request.urlopen", raise_url_error)
+    import seqBackupLib.illumina as illumina
+
+    with pytest.warns(RuntimeWarning, match="Falling back to bundled machine types"):
+        illumina = importlib.reload(illumina)
+    assert illumina.MACHINE_TYPES == illumina.MACHINE_TYPES_FALLBACK