Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 70 additions & 2 deletions seqBackupLib/illumina.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,85 @@
import csv
import re
import warnings
from collections.abc import Mapping
from io import TextIOWrapper
from pathlib import Path
from urllib.error import URLError
from urllib.request import urlopen


MACHINE_TYPES = {
MACHINE_TYPES_FALLBACK = {
"VH": "Illumina-NextSeq",
"D": "Illumina-HiSeq",
"M": "Illumina-MiSeq",
"A": "Illumina-NovaSeq",
"NB": "Illumina-MiniSeq",
"LH": "Illumina-NovaSeqX",
"SH": "Illumina-MiSeq",
}
} # Fallback mapping if machine_types.tsv is unavailable.
# Raw GitHub URL of the machine_types.tsv file that load_machine_types()
# downloads; the two string literals are concatenated into a single URL.
MACHINE_TYPES_URL = (
"https://raw.githubusercontent.com/PennChopMicrobiomeProgram/"
"SampleRegistry/master/sample_registry/data/machine_types.tsv"
)


class MachineTypesMapping(Mapping):
    """Read-only mapping of instrument codes to machine type names.

    Entries are obtained from ``load_machine_types()``. They are loaded
    lazily on first access and then reused for the lifetime of the instance,
    so lookups, iteration and ``len()`` all observe the same snapshot and do
    not each trigger a separate download (and, when offline, a separate
    fallback warning). Call ``refresh()`` to force a reload.
    """

    def __init__(self) -> None:
        # None means "not loaded yet"; filled in by the first _mapping() call.
        self._cache = None

    def _mapping(self) -> dict[str, str]:
        """Return the cached entries, loading them on first use."""
        if self._cache is None:
            self._cache = load_machine_types()
        return self._cache

    def refresh(self) -> None:
        """Discard the cached entries so the next access reloads them."""
        self._cache = None

    def __getitem__(self, key: str) -> str:
        return self._mapping()[key]

    def __iter__(self):
        return iter(self._mapping())

    def __len__(self) -> int:
        return len(self._mapping())


def load_machine_types() -> dict[str, str]:
    """Download the instrument-code to machine-type table, with a fallback.

    Fetches ``MACHINE_TYPES_URL`` (10 second timeout) and parses the payload
    as tab-separated text: the first column is the instrument code and the
    second the machine type. A first row whose first cell is
    ``instrument_code`` or ``code`` (case-insensitive, surrounding whitespace
    ignored) is treated as a header and skipped. Rows with fewer than two
    columns, or with a blank code or machine type, are ignored.

    Returns:
        The parsed mapping, or a copy of ``MACHINE_TYPES_FALLBACK`` when the
        download fails, the payload is not valid UTF-8, or no usable rows
        are found.

    Warns:
        RuntimeWarning: whenever the bundled fallback is used.
    """
    # Local import: low-level HTTP failures while reading the body (e.g.
    # IncompleteRead) are HTTPException subclasses and are not all OSErrors.
    from http.client import HTTPException

    def _fallback(reason: str) -> dict[str, str]:
        # stacklevel=2 attributes the warning to the calling line inside
        # load_machine_types rather than to this helper.
        warnings.warn(
            f"Falling back to bundled machine types; {reason}",
            RuntimeWarning,
            stacklevel=2,
        )
        # Hand out a copy so callers cannot mutate the module-level fallback.
        return dict(MACHINE_TYPES_FALLBACK)

    try:
        with urlopen(MACHINE_TYPES_URL, timeout=10) as response:
            content = response.read().decode("utf-8")
    except (OSError, HTTPException, UnicodeDecodeError) as exc:
        # URLError and TimeoutError are both OSError subclasses, so the
        # failures that were handled before are still handled here.
        return _fallback(f"unable to load {MACHINE_TYPES_URL}: {exc}")

    # csv.reader yields an empty list for blank lines; drop those.
    rows = [row for row in csv.reader(content.splitlines(), delimiter="\t") if row]
    if not rows:
        return _fallback("received empty machine_types.tsv.")

    if rows[0][0].strip().lower() in {"instrument_code", "code"}:
        rows = rows[1:]

    mapping: dict[str, str] = {}
    for row in rows:
        if len(row) < 2:
            continue
        code = row[0].strip()
        machine_type = row[1].strip()
        if code and machine_type:
            mapping[code] = machine_type

    if not mapping:
        return _fallback("no valid rows in machine_types.tsv.")

    return mapping


# Module-level mapping object; its entries are obtained through
# load_machine_types() rather than from a literal dict.
MACHINE_TYPES = MachineTypesMapping()


def extract_instrument_code(instrument: str) -> str:
Expand Down
77 changes: 73 additions & 4 deletions test/test_illumina.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
import gzip
from urllib.error import URLError

import pytest
from pathlib import Path

from seqBackupLib.backup import DEFAULT_MIN_FILE_SIZE
from seqBackupLib.illumina import IlluminaDir, IlluminaFastq, MACHINE_TYPES
from seqBackupLib.illumina import (
IlluminaDir,
IlluminaFastq,
MACHINE_TYPES,
MACHINE_TYPES_FALLBACK,
load_machine_types,
)


machine_fixtures = {
Expand All @@ -16,7 +24,33 @@
}


@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
@pytest.fixture(autouse=True)
def machine_types_urlopen(monkeypatch):
    """Swap ``seqBackupLib.illumina.urlopen`` for an offline stub in every test.

    The stub ignores its arguments and serves a TSV document built from
    ``MACHINE_TYPES_FALLBACK``: a header row followed by one row per entry.
    """
    header = "instrument_code\tmachine_type"
    entry_lines = [
        f"{code}\t{machine}" for code, machine in MACHINE_TYPES_FALLBACK.items()
    ]
    payload = "\n".join([header, *entry_lines]) + "\n"

    class StubResponse:
        """Minimal context-manager stand-in for the object urlopen returns."""

        def __init__(self, data: str):
            self._data = data

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            # Returning False lets any exception raised in the body propagate.
            return False

        def read(self):
            return self._data.encode("utf-8")

    def fake_urlopen(*args, **kwargs):
        return StubResponse(payload)

    monkeypatch.setattr("seqBackupLib.illumina.urlopen", fake_urlopen)


@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
def test_illumina_fastq(machine_type, request):
fixture_name = machine_fixtures.get(machine_type)
if not fixture_name:
Expand All @@ -37,7 +71,7 @@ def test_illumina_fastq(machine_type, request):
assert r1.check_index_read_exists()


@pytest.mark.parametrize("machine_type", MACHINE_TYPES.keys())
@pytest.mark.parametrize("machine_type", machine_fixtures.keys())
def test_illumina_dir(machine_type, request):
fixture_name = machine_fixtures.get(machine_type)
if not fixture_name:
Expand All @@ -58,3 +92,38 @@ def test_illumina_fastq_without_lane(novaseq_dir):
r1 = IlluminaFastq(f)
assert r1.check_fp_vs_content()[0]
assert r1.build_archive_dir().endswith("L001")


def test_load_machine_types_from_tsv(monkeypatch):
    """A code present only in the served TSV is returned by load_machine_types()."""

    class StubResponse:
        """Context-manager stand-in whose read() returns the UTF-8 payload."""

        def __init__(self, data: str):
            self._data = data

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            return False

        def read(self):
            return self._data.encode("utf-8")

    payload = "instrument_code\tmachine_type\nZZ\tIllumina-Test\n"

    def fake_urlopen(*args, **kwargs):
        return StubResponse(payload)

    monkeypatch.setattr("seqBackupLib.illumina.urlopen", fake_urlopen)

    loaded = load_machine_types()
    assert loaded["ZZ"] == "Illumina-Test"


def test_load_machine_types_fallback_warning(monkeypatch):
    """A URLError from urlopen yields a RuntimeWarning and the fallback mapping."""

    def unreachable(*args, **kwargs):
        raise URLError("network down")

    monkeypatch.setattr("seqBackupLib.illumina.urlopen", unreachable)

    expected_message = "Falling back to bundled machine types"
    with pytest.warns(RuntimeWarning, match=expected_message):
        loaded = load_machine_types()

    assert loaded == MACHINE_TYPES_FALLBACK
Loading