From e2d52becce96b643bff512fd495b756ee318b9c0 Mon Sep 17 00:00:00 2001 From: Charlie Date: Thu, 29 Jan 2026 16:26:24 -0500 Subject: [PATCH 1/4] Load machine types from remote TSV --- seqBackupLib/illumina.py | 73 ++++++++++++++++++++++++++++++++++++---- 1 file changed, 67 insertions(+), 6 deletions(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index fbe2b8e..b049ba0 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -1,9 +1,13 @@ +import csv import re +import warnings from io import TextIOWrapper from pathlib import Path +from urllib.error import URLError +from urllib.request import urlopen -MACHINE_TYPES = { +MACHINE_TYPES_FALLBACK = { "VH": "Illumina-NextSeq", "D": "Illumina-HiSeq", "M": "Illumina-MiSeq", @@ -11,7 +15,62 @@ "NB": "Illumina-MiniSeq", "LH": "Illumina-NovaSeqX", "SH": "Illumina-MiSeq", -} +} # Fallback mapping if machine_types.tsv is unavailable. +MACHINE_TYPES_URL = ( + "https://raw.githubusercontent.com/PennChopMicrobiomeProgram/" + "SampleRegistry/master/sample_registry/data/machine_types.tsv" +) +_machine_types_cache: dict[str, str] | None = None + + +def load_machine_types() -> dict[str, str]: + global _machine_types_cache + if _machine_types_cache is not None: + return _machine_types_cache + + try: + with urlopen(MACHINE_TYPES_URL, timeout=10) as response: + content = response.read().decode("utf-8") + except (URLError, TimeoutError) as exc: + warnings.warn( + f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}", + RuntimeWarning, + ) + _machine_types_cache = MACHINE_TYPES_FALLBACK + return _machine_types_cache + + reader = csv.reader(content.splitlines(), delimiter="\t") + rows = [row for row in reader if row] + if not rows: + warnings.warn( + "Falling back to bundled machine types; received empty machine_types.tsv.", + RuntimeWarning, + ) + _machine_types_cache = MACHINE_TYPES_FALLBACK + return _machine_types_cache + + if rows[0][0].lower() in {"instrument_code", "code"}: + rows = rows[1:] + + mapping: dict[str, str] = {} + for row in rows: + if len(row) < 2: + continue + code = row[0].strip() + machine_type = row[1].strip() + if code and machine_type: + mapping[code] = machine_type + + if not mapping: + warnings.warn( + "Falling back to bundled machine types; no valid rows in machine_types.tsv.", + RuntimeWarning, + ) + _machine_types_cache = MACHINE_TYPES_FALLBACK + return _machine_types_cache + + _machine_types_cache = mapping + return _machine_types_cache def extract_instrument_code(instrument: str) -> str: @@ -37,9 +96,10 @@ def _parse_folder(self) -> dict[str, str]: instrument = parts[1] instrument_code = extract_instrument_code(instrument) - if instrument_code not in MACHINE_TYPES: + machine_types = load_machine_types() + if instrument_code not in machine_types: raise ValueError(f"Invalid instrument code in run name: {instrument}") - self.machine_type = MACHINE_TYPES[instrument_code] + self.machine_type = machine_types[instrument_code] run_number = parts[2] if not run_number.isdigit(): @@ -129,7 +189,8 @@ def filepath(self) -> Path: @property def machine_type(self) -> str: - return MACHINE_TYPES[extract_instrument_code(self.fastq_info["instrument"])] + machine_types = load_machine_types() + return machine_types[extract_instrument_code(self.fastq_info["instrument"])] @property def run_name(self) -> str: @@ -138,7 +199,7 @@ def run_name(self) -> str: if ( len(segments) >= 4 and segments[0].isdigit() - and extract_instrument_code(segments[1]) in MACHINE_TYPES + and extract_instrument_code(segments[1]) in load_machine_types() and segments[2].isdigit() ): return part From 492861ffcc33b9ea7b2d0fe4a9138dc3a59e82e4 Mon Sep 17 00:00:00 2001 From: Charlie Date: Fri, 30 Jan 2026 13:39:14 -0500 Subject: [PATCH 2/4] Load machine types via lazy mapping --- seqBackupLib/illumina.py | 28 +++++++++++++++---- test/test_illumina.py | 60 +++++++++++++++++++++++++++++++++++++--- 2 files changed, 78 insertions(+), 10 deletions(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index b049ba0..dc6847b 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -1,6 +1,7 @@ import csv import re import warnings +from collections.abc import Mapping from io import TextIOWrapper from pathlib import Path from urllib.error import URLError @@ -23,6 +24,20 @@ _machine_types_cache: dict[str, str] | None = None +class MachineTypesMapping(Mapping): + def _mapping(self) -> dict[str, str]: + return load_machine_types() + + def __getitem__(self, key: str) -> str: + return self._mapping()[key] + + def __iter__(self): + return iter(self._mapping()) + + def __len__(self) -> int: + return len(self._mapping()) + + def load_machine_types() -> dict[str, str]: global _machine_types_cache if _machine_types_cache is not None: @@ -73,6 +88,9 @@ def load_machine_types() -> dict[str, str]: return _machine_types_cache +MACHINE_TYPES = MachineTypesMapping() + + def extract_instrument_code(instrument: str) -> str: return "".join(filter(lambda x: not x.isdigit(), instrument)) @@ -96,10 +114,9 @@ def _parse_folder(self) -> dict[str, str]: instrument = parts[1] instrument_code = extract_instrument_code(instrument) - machine_types = load_machine_types() - if instrument_code not in machine_types: + if instrument_code not in MACHINE_TYPES: raise ValueError(f"Invalid instrument code in run name: {instrument}") - self.machine_type = machine_types[instrument_code] + self.machine_type = MACHINE_TYPES[instrument_code] run_number = parts[2] if not run_number.isdigit(): @@ -189,8 +206,7 @@ def filepath(self) -> Path: @property def machine_type(self) -> str: - machine_types = load_machine_types() - return machine_types[extract_instrument_code(self.fastq_info["instrument"])] + return MACHINE_TYPES[extract_instrument_code(self.fastq_info["instrument"])] @property def run_name(self) -> str: @@ -199,7 +215,7 @@ def run_name(self) -> str: if ( len(segments) >= 4 and segments[0].isdigit() - and extract_instrument_code(segments[1]) in load_machine_types() + and extract_instrument_code(segments[1]) in MACHINE_TYPES and segments[2].isdigit() ): return part diff --git a/test/test_illumina.py b/test/test_illumina.py index ada9adc..e9f52f9 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -1,8 +1,16 @@ import gzip +from urllib.error import URLError + import pytest -from pathlib import Path + from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE -from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES +from seqBackupLib.illumina import ( + IlluminaDir, + IlluminaFastq, + MACHINE_TYPES, + MACHINE_TYPES_FALLBACK, + load_machine_types, +) machine_fixtures = { @@ -16,7 +24,14 @@ } -@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys()) +@pytest.fixture(autouse=True) +def machine_types_cache(monkeypatch): + monkeypatch.setattr( + "seqBackupLib.illumina._machine_types_cache", MACHINE_TYPES_FALLBACK + ) + + +@pytest.mark.parametrize("machine_type", machine_fixtures.keys()) def test_illumina_fastq(machine_type, request): fixture_name = machine_fixtures.get(machine_type) if not fixture_name: @@ -37,7 +52,7 @@ def test_illumina_fastq(machine_type, request): assert r1.check_index_read_exists() -@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys()) +@pytest.mark.parametrize("machine_type", machine_fixtures.keys()) def test_illumina_dir(machine_type, request): fixture_name = machine_fixtures.get(machine_type) if not fixture_name: @@ -58,3 +73,40 @@ def test_illumina_fastq_without_lane(novaseq_dir): r1 = IlluminaFastq(f) assert r1.check_fp_vs_content()[0] assert r1.build_archive_dir().endswith("L001") + + +def test_load_machine_types_from_tsv(monkeypatch): + tsv = "instrument_code\tmachine_type\nZZ\tIllumina-Test\n" + + class FakeResponse: + def __init__(self, data: str): + self._data = data + + def read(self): + return self._data.encode("utf-8") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + monkeypatch.setattr( + "seqBackupLib.illumina.urlopen", lambda *args, **kwargs: FakeResponse(tsv) + ) + monkeypatch.setattr("seqBackupLib.illumina._machine_types_cache", None) + + machine_types = load_machine_types() + assert machine_types["ZZ"] == "Illumina-Test" + + +def test_load_machine_types_fallback_warning(monkeypatch): + def raise_url_error(*args, **kwargs): + raise URLError("network down") + + monkeypatch.setattr("seqBackupLib.illumina.urlopen", raise_url_error) + monkeypatch.setattr("seqBackupLib.illumina._machine_types_cache", None) + + with pytest.warns(RuntimeWarning, match="Falling back to bundled machine types"): + machine_types = load_machine_types() + assert machine_types == MACHINE_TYPES_FALLBACK From 415aa5f4f12c89d14c4330c9faeb0eedc89938c2 Mon Sep 17 00:00:00 2001 From: Charlie Date: Fri, 30 Jan 2026 13:52:09 -0500 Subject: [PATCH 3/4] Fix Python 3.9 typing in machine types loader --- seqBackupLib/illumina.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index dc6847b..fdba4bf 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -2,6 +2,7 @@ import re import warnings from collections.abc import Mapping +from typing import Optional from io import TextIOWrapper from pathlib import Path from urllib.error import URLError @@ -21,7 +22,7 @@ "https://raw.githubusercontent.com/PennChopMicrobiomeProgram/" "SampleRegistry/master/sample_registry/data/machine_types.tsv" ) -_machine_types_cache: dict[str, str] | None = None +_machine_types_cache: Optional[dict[str, str]] = None class MachineTypesMapping(Mapping): From f35d9b2e54857c456f0af4fa6b0e4381b0db6bcf Mon Sep 17 00:00:00 2001 From: Charlie Date: Fri, 30 Jan 2026 17:19:52 -0500 Subject: [PATCH 4/4] Remove machine type cache --- seqBackupLib/illumina.py | 18 ++++-------------- test/test_illumina.py | 25 +++++++++++++++++++++---- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/seqBackupLib/illumina.py b/seqBackupLib/illumina.py index fdba4bf..931dd19 100644 --- a/seqBackupLib/illumina.py +++ b/seqBackupLib/illumina.py @@ -2,7 +2,6 @@ import re import warnings from collections.abc import Mapping -from typing import Optional from io import TextIOWrapper from pathlib import Path from urllib.error import URLError @@ -22,7 +21,6 @@ "https://raw.githubusercontent.com/PennChopMicrobiomeProgram/" "SampleRegistry/master/sample_registry/data/machine_types.tsv" ) -_machine_types_cache: Optional[dict[str, str]] = None class MachineTypesMapping(Mapping): @@ -40,10 +38,6 @@ def __len__(self) -> int: def load_machine_types() -> dict[str, str]: - global _machine_types_cache - if _machine_types_cache is not None: - return _machine_types_cache - try: with urlopen(MACHINE_TYPES_URL, timeout=10) as response: content = response.read().decode("utf-8") @@ -52,8 +46,7 @@ def load_machine_types() -> dict[str, str]: f"Falling back to bundled machine types; unable to load {MACHINE_TYPES_URL}: {exc}", RuntimeWarning, ) - _machine_types_cache = MACHINE_TYPES_FALLBACK - return _machine_types_cache + return MACHINE_TYPES_FALLBACK reader = csv.reader(content.splitlines(), delimiter="\t") rows = [row for row in reader if row] @@ -62,8 +55,7 @@ def load_machine_types() -> dict[str, str]: "Falling back to bundled machine types; received empty machine_types.tsv.", RuntimeWarning, ) - _machine_types_cache = MACHINE_TYPES_FALLBACK - return _machine_types_cache + return MACHINE_TYPES_FALLBACK if rows[0][0].lower() in {"instrument_code", "code"}: rows = rows[1:] @@ -82,11 +74,9 @@ def load_machine_types() -> dict[str, str]: "Falling back to bundled machine types; no valid rows in machine_types.tsv.", RuntimeWarning, ) - _machine_types_cache = MACHINE_TYPES_FALLBACK - return _machine_types_cache + return MACHINE_TYPES_FALLBACK - _machine_types_cache = mapping - return _machine_types_cache + return mapping MACHINE_TYPES = MachineTypesMapping() diff --git a/test/test_illumina.py b/test/test_illumina.py index e9f52f9..22592d0 100644 --- a/test/test_illumina.py +++ b/test/test_illumina.py @@ -25,9 +25,28 @@ @pytest.fixture(autouse=True) -def machine_types_cache(monkeypatch): +def machine_types_urlopen(monkeypatch): + tsv_rows = ["instrument_code\tmachine_type"] + tsv_rows.extend( + f"{code}\t{machine}" for code, machine in MACHINE_TYPES_FALLBACK.items() + ) + tsv = "\n".join(tsv_rows) + "\n" + + class FakeResponse: + def __init__(self, data: str): + self._data = data + + def read(self): + return self._data.encode("utf-8") + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + monkeypatch.setattr( - "seqBackupLib.illumina._machine_types_cache", MACHINE_TYPES_FALLBACK + "seqBackupLib.illumina.urlopen", lambda *args, **kwargs: FakeResponse(tsv) ) @@ -94,7 +113,6 @@ def __exit__(self, exc_type, exc, tb): monkeypatch.setattr( "seqBackupLib.illumina.urlopen", lambda *args, **kwargs: FakeResponse(tsv) ) - monkeypatch.setattr("seqBackupLib.illumina._machine_types_cache", None) machine_types = load_machine_types() assert machine_types["ZZ"] == "Illumina-Test" @@ -105,7 +123,6 @@ def raise_url_error(*args, **kwargs): raise URLError("network down") monkeypatch.setattr("seqBackupLib.illumina.urlopen", raise_url_error) - monkeypatch.setattr("seqBackupLib.illumina._machine_types_cache", None) with pytest.warns(RuntimeWarning, match="Falling back to bundled machine types"): machine_types = load_machine_types()