# Location of the shared instrument-code -> machine-type table (TSV).
MACHINE_TYPES_URL = (
    "https://raw.githubusercontent.com/PennChopMicrobiomeProgram/"
    "SampleRegistry/master/sample_registry/data/machine_types.tsv"
)


class MachineTypesMapping(Mapping):
    """Read-only, lazily loaded mapping of instrument code to machine type.

    Nothing is fetched when the object is created.  The table is loaded on
    first access and then cached on the instance, so lookups, membership
    tests, ``len()`` and iteration share one load instead of issuing a
    network request each.  Call :meth:`refresh` to force a reload.

    Args:
        loader: Optional zero-argument callable returning the table.  When
            omitted, the module-level ``load_machine_types`` is used.
    """

    # Class-level defaults keep instances usable even if __init__ was skipped.
    _loader = None
    _cache = None

    def __init__(self, loader=None) -> None:
        self._loader = loader
        self._cache = None

    def _mapping(self) -> dict[str, str]:
        """Return the cached table, loading it on first use."""
        if self._cache is None:
            # Look the default loader up at call time so that replacing the
            # module-level function (e.g. in tests) still takes effect.
            loader = self._loader if self._loader is not None else load_machine_types
            # Copy so the cache cannot be changed through the loader's result.
            self._cache = dict(loader())
        return self._cache

    def refresh(self) -> None:
        """Drop the cached table; the next access loads it again."""
        self._cache = None

    def __getitem__(self, key: str) -> str:
        return self._mapping()[key]

    def __iter__(self):
        return iter(self._mapping())

    def __len__(self) -> int:
        return len(self._mapping())


def _fallback_machine_types(reason: str) -> dict[str, str]:
    """Warn with *reason* and return a fresh copy of the bundled table."""
    # stacklevel=2 attributes the warning to load_machine_types, not this helper.
    warnings.warn(
        f"Falling back to bundled machine types; {reason}",
        RuntimeWarning,
        stacklevel=2,
    )
    # A copy, so callers that mutate the result cannot corrupt the fallback.
    return dict(MACHINE_TYPES_FALLBACK)


def _rows_to_machine_types(rows: list) -> dict[str, str]:
    """Build the code -> machine-type dict from parsed, non-empty TSV rows."""
    # Skip an optional header row; tolerate padding around the header cell.
    if rows and rows[0][0].strip().lower() in {"instrument_code", "code"}:
        rows = rows[1:]

    mapping: dict[str, str] = {}
    for row in rows:
        if len(row) < 2:
            continue
        code = row[0].strip()
        machine_type = row[1].strip()
        # Rows with a blank code or blank machine type are ignored.
        if code and machine_type:
            mapping[code] = machine_type
    return mapping


def load_machine_types(url: "str | None" = None, timeout: float = 10) -> dict[str, str]:
    """Fetch and parse the machine-types TSV, falling back to the bundled table.

    The first two tab-separated columns of each row are read as instrument
    code and machine type.  A first row whose first cell is
    ``instrument_code`` or ``code`` (case-insensitive) is treated as a header.

    Args:
        url: Location of the TSV; defaults to ``MACHINE_TYPES_URL``.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        A new dict mapping instrument code to machine type.  When the
        download fails, the payload cannot be decoded or parsed, or it holds
        no usable rows, a ``RuntimeWarning`` is issued and a copy of
        ``MACHINE_TYPES_FALLBACK`` is returned instead.
    """
    # Imported locally: only needed for the error handling below.
    from http.client import HTTPException

    source = MACHINE_TYPES_URL if url is None else url
    try:
        with urlopen(source, timeout=timeout) as response:
            # utf-8-sig drops a leading BOM so the header row is still recognised.
            content = response.read().decode("utf-8-sig")
    except (OSError, HTTPException, UnicodeDecodeError) as exc:
        # OSError covers URLError, TimeoutError and other socket failures.
        return _fallback_machine_types(f"unable to load {source}: {exc}")

    try:
        rows = [row for row in csv.reader(content.splitlines(), delimiter="\t") if row]
    except csv.Error as exc:
        return _fallback_machine_types(f"unable to parse machine_types.tsv: {exc}")

    if not rows:
        return _fallback_machine_types("received empty machine_types.tsv.")

    mapping = _rows_to_machine_types(rows)
    if not mapping:
        return _fallback_machine_types("no valid rows in machine_types.tsv.")

    return mapping


# Module-level view of the table; loaded on first access, then cached.
MACHINE_TYPES = MachineTypesMapping()
def test_load_machine_types_from_tsv(monkeypatch):
    """A TSV served through urlopen is parsed into the code -> machine map."""
    payload = "instrument_code\tmachine_type\nZZ\tIllumina-Test\n"

    class FakeResponse:
        """Minimal context-manager stand-in for the object urlopen returns."""

        def __init__(self, data: str):
            self._data = data

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            # Never suppress exceptions raised inside the with-block.
            return False

        def read(self):
            return self._data.encode("utf-8")

    def fake_urlopen(*args, **kwargs):
        return FakeResponse(payload)

    monkeypatch.setattr("seqBackupLib.illumina.urlopen", fake_urlopen)

    assert load_machine_types()["ZZ"] == "Illumina-Test"


def test_load_machine_types_fallback_warning(monkeypatch):
    """A URLError from urlopen yields a RuntimeWarning and the bundled table."""

    def raise_url_error(*args, **kwargs):
        raise URLError("network down")

    monkeypatch.setattr("seqBackupLib.illumina.urlopen", raise_url_error)

    with pytest.warns(RuntimeWarning, match="Falling back to bundled machine types"):
        loaded = load_machine_types()

    assert loaded == MACHINE_TYPES_FALLBACK