From dc021ef316b1fbe00a15ca7e3f51aa96d2c8a45c Mon Sep 17 00:00:00 2001 From: Dimitri Yatsenko Date: Fri, 16 Jan 2026 14:37:56 -0600 Subject: [PATCH] docs: auto-generate llms-full.txt during build with version metadata - Add version metadata (timestamp, commit, branch) to generated file - Generate llms-full.txt during docker-compose BUILD mode - Exclude llms-full.txt from git tracking (30,869+ lines removed) - Add to .gitignore to prevent future commits Benefits: - Cleaner git history without 30K line diffs on doc changes - No merge conflicts from generated files - Always fresh with current build metadata - Follows documentation best practices --- .gitignore | 6 +- docker-compose.yaml | 2 + scripts/gen_llms_full.py | 43 +- src/llms-full.txt | 30869 ------------------------------------- 4 files changed, 49 insertions(+), 30871 deletions(-) delete mode 100644 src/llms-full.txt diff --git a/.gitignore b/.gitignore index e1dff8dc..25f19a19 100644 --- a/.gitignore +++ b/.gitignore @@ -4,4 +4,8 @@ site temp* # DataJoint secrets (credentials) -.secrets/ \ No newline at end of file +.secrets/ + +# Generated documentation files +src/llms-full.txt +site/llms-full.txt \ No newline at end of file diff --git a/docker-compose.yaml b/docker-compose.yaml index 6e80e3e0..acfaa903 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -67,6 +67,8 @@ services: # BUILD mode: build static site from pre-executed notebooks # Install datajoint-python for mkdocstrings (needs to import for API docs) pip install -e /datajoint-python + # Generate llms-full.txt with current git info + python scripts/gen_llms_full.py mkdocs build --config-file ./mkdocs.yaml else echo "Unexpected mode..." diff --git a/scripts/gen_llms_full.py b/scripts/gen_llms_full.py index 01a0c00a..a8d11616 100644 --- a/scripts/gen_llms_full.py +++ b/scripts/gen_llms_full.py @@ -4,9 +4,14 @@ This script concatenates all markdown documentation into a single file optimized for LLM consumption. + +The generated file is NOT committed to git - it's auto-generated during +the build process with current version metadata. """ import json +import subprocess +from datetime import datetime, timezone from pathlib import Path # Documentation root @@ -24,6 +29,10 @@ HEADER = """# DataJoint Documentation (Full) +Generated: {timestamp} +Commit: {commit} +Branch: {branch} + > DataJoint is a Python framework for building scientific data pipelines with automated computation, integrity constraints, and seamless integration of relational databases with object storage. This documentation covers DataJoint 2.0. > This file contains the complete documentation for LLM consumption. 
For an index with links, see /llms.txt @@ -33,6 +42,35 @@ """ +def get_git_info() -> dict[str, str]: + """Get current git commit hash and branch name.""" + try: + commit = subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + cwd=Path(__file__).parent.parent, + stderr=subprocess.DEVNULL, + ).decode().strip() + except (subprocess.CalledProcessError, FileNotFoundError): + commit = "unknown" + + try: + branch = subprocess.check_output( + ["git", "rev-parse", "--abbrev-ref", "HEAD"], + cwd=Path(__file__).parent.parent, + stderr=subprocess.DEVNULL, + ).decode().strip() + except (subprocess.CalledProcessError, FileNotFoundError): + branch = "unknown" + + timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC") + + return { + "timestamp": timestamp, + "commit": commit, + "branch": branch, + } + + def read_markdown_file(filepath: Path) -> str: """Read a markdown file and return its content.""" try: @@ -79,7 +117,10 @@ def get_doc_files(directory: Path) -> list[Path]: def generate_llms_full(): """Generate the llms-full.txt file.""" - content_parts = [HEADER] + # Get current git info for version metadata + git_info = get_git_info() + header = HEADER.format(**git_info) + content_parts = [header] for section_name, section_dir in SECTIONS: section_path = DOCS_DIR / section_dir diff --git a/src/llms-full.txt b/src/llms-full.txt deleted file mode 100644 index 30a57336..00000000 --- a/src/llms-full.txt +++ /dev/null @@ -1,30869 +0,0 @@ -# DataJoint Documentation (Full) - -> DataJoint is a Python framework for building scientific data pipelines with automated computation, integrity constraints, and seamless integration of relational databases with object storage. This documentation covers DataJoint 2.0. - -> This file contains the complete documentation for LLM consumption. For an index with links, see /llms.txt - ---- - - -============================================================ -# Concepts -============================================================ - - ---- -## File: explanation/computation-model.md - -# Computation Model - -DataJoint's computation model enables automated, reproducible data processing -through the `populate()` mechanism and Jobs 2.0 system. - -## AutoPopulate: The Core Concept - -Tables that inherit from `dj.Imported` or `dj.Computed` can automatically -populate themselves based on upstream data. - -```python -@schema -class Segmentation(dj.Computed): - definition = """ - -> Scan - --- - num_cells : uint32 - cell_masks : - """ - - def make(self, key): - # key contains primary key of one Scan - scan_data = (Scan & key).fetch1('image_data') - - # Your computation - masks, num_cells = segment_cells(scan_data) - - # Insert result - self.insert1({ - **key, - 'num_cells': num_cells, - 'cell_masks': masks - }) -``` - -## The `make()` Contract - -The `make(self, key)` method: - -1. **Receives** the primary key of one upstream entity -2. **Computes** results for that entity -3. 
**Inserts** results into the table - -DataJoint guarantees: - -- `make()` is called once per upstream entity -- Failed computations can be retried -- Parallel execution is safe - -## Key Source - -The **key source** determines what needs to be computed: - -```python -# Default: all upstream keys not yet in this table -key_source = Scan - Segmentation - -# Custom key source -@property -def key_source(self): - return (Scan & 'quality > 0.8') - self -``` - -## Calling `populate()` - -```python -# Populate all missing entries -Segmentation.populate() - -# Populate specific subset -Segmentation.populate(restriction) - -# Limit number of jobs -Segmentation.populate(limit=100) - -# Show progress -Segmentation.populate(display_progress=True) - -# Suppress errors, continue processing -Segmentation.populate(suppress_errors=True) -``` - -## Jobs 2.0: Distributed Computing - -For parallel and distributed execution, Jobs 2.0 provides: - -### Job States - -```mermaid -stateDiagram-v2 - [*] --> pending : key_source - table - pending --> reserved : reserve() - reserved --> success : complete() - reserved --> error : error() - reserved --> pending : timeout - success --> [*] - error --> pending : ignore/clear -``` - -### Job Table - -Each auto-populated table has an associated jobs table: - -```python -# View job status -Segmentation.jobs() - -# View errors -Segmentation.jobs & 'status = "error"' - -# Clear errors to retry -(Segmentation.jobs & 'status = "error"').delete() -``` - -### Parallel Execution - -```python -# Multiple workers can run simultaneously -# Each reserves different keys - -# Worker 1 -Segmentation.populate(reserve_jobs=True) - -# Worker 2 (different process/machine) -Segmentation.populate(reserve_jobs=True) -``` - -Jobs are reserved atomically—no two workers process the same key. - -### Error Handling - -```python -# Populate with error suppression -Segmentation.populate(suppress_errors=True) - -# Check what failed -errors = (Segmentation.jobs & 'status = "error"').to_dicts() - -# Clear specific error to retry -(Segmentation.jobs & error_key).delete() - -# Clear all errors -(Segmentation.jobs & 'status = "error"').delete() -``` - -## Imported vs. Computed - -| Aspect | `dj.Imported` | `dj.Computed` | -|--------|---------------|---------------| -| Data source | External (files, APIs) | Other tables | -| Typical use | Load raw data | Derive results | -| Diagram color | Blue | Red | - -Both use the same `make()` mechanism. - -## Workflow Integrity - -The computation model maintains **workflow integrity**: - -1. **Dependency order** — Upstream tables populate before downstream -2. **Cascade deletes** — Deleting upstream deletes downstream -3. 
**Recomputation** — Delete and re-populate to update results

```python
# Correct an upstream error
(Scan & problem_key).delete()  # Cascades to Segmentation

# Reinsert corrected data
Scan.insert1(corrected_data)

# Recompute
Segmentation.populate()
```

## Job Metadata (Optional)

Track computation metadata with hidden columns:

```python
dj.config['jobs.add_job_metadata'] = True
```

This adds to computed tables:

- `_job_start_time` — When computation started
- `_job_duration` — How long it took
- `_job_version` — Code version (if configured)

## The Three-Part Make Model

For long-running computations (hours or days), holding a database transaction
open for the entire duration causes problems:

- Database locks block other operations
- Transaction timeouts may occur
- Resources are held unnecessarily

The **three-part make pattern** solves this by separating the computation from
the transaction:

```python
@schema
class SignalAverage(dj.Computed):
    definition = """
    -> RawSignal
    ---
    avg_signal : float64
    """

    def make_fetch(self, key):
        """Step 1: Fetch input data (outside transaction)"""
        raw_signal = (RawSignal & key).fetch1("signal")
        return (raw_signal,)

    def make_compute(self, key, fetched):
        """Step 2: Perform computation (outside transaction)"""
        (raw_signal,) = fetched
        avg = raw_signal.mean()
        return (avg,)

    def make_insert(self, key, fetched, computed):
        """Step 3: Insert results (inside brief transaction)"""
        (avg,) = computed
        self.insert1({**key, "avg_signal": avg})
```

### How It Works

DataJoint executes the three parts with verification:

```
fetched = make_fetch(key)               # Outside transaction
computed = make_compute(key, fetched)   # Outside transaction

fetched_again = make_fetch(key)         # Re-fetch to verify
if fetched != fetched_again:
    # Inputs changed—abort
else:
    make_insert(key, fetched, computed)
```

The key insight: **the computation runs outside any transaction**, but
referential integrity is preserved by re-fetching and verifying inputs before
insertion. If upstream data changed during computation, the job is cancelled
rather than inserting inconsistent results.

### Benefits

| Aspect | Standard `make()` | Three-Part Pattern |
|--------|-------------------|--------------------|
| Transaction duration | Entire computation | Only final insert |
| Database locks | Held throughout | Minimal |
| Suitable for | Short computations | Hours/days |
| Integrity guarantee | Transaction | Re-fetch verification |

### When to Use Each Pattern

| Computation Time | Pattern | Rationale |
|------------------|---------|-----------|
| Seconds to minutes | Standard `make()` | Simple, transaction overhead acceptable |
| Minutes to hours | Three-part | Avoid long transactions |
| Hours to days | Three-part | Essential for stability |

The three-part pattern trades off fetching data twice for dramatically reduced
transaction duration. Use it when computation time significantly exceeds fetch
time.

## Best Practices

### 1. Keep `make()` Focused

```python
def make(self, key):
    # Good: One clear computation
    data = (UpstreamTable & key).fetch1('data')
    result = process(data)
    self.insert1({**key, 'result': result})
```

### 2. Handle Large Data Efficiently

```python
def make(self, key):
    # Stream large data instead of loading all at once
    for row in (LargeTable & key):
        process_chunk(row['data'])
```

### 3.
Use Transactions for Multi-Row Inserts - -```python -def make(self, key): - results = compute_multiple_results(key) - - # All-or-nothing insertion - with dj.conn().transaction: - self.insert(results) -``` - -### 4. Test with Single Keys First - -```python -# Test make() on one key -key = (Scan - Segmentation).fetch1('KEY') -Segmentation().make(key) - -# Then populate all -Segmentation.populate() -``` - -## Summary - -1. **`make(key)`** — Computes one entity at a time -2. **`populate()`** — Executes `make()` for all missing entities -3. **Jobs 2.0** — Enables parallel, distributed execution -4. **Three-part make** — For long computations without long transactions -5. **Cascade deletes** — Maintain workflow integrity -6. **Error handling** — Robust retry mechanisms - - ---- -## File: explanation/custom-codecs.md - -# Extending DataJoint with Custom Codecs - -DataJoint's type system is extensible through **codecs**—plugins that define -how domain-specific Python objects are stored and retrieved. This enables -seamless integration of specialized data types without modifying DataJoint itself. - -## Why Codecs? - -Scientific computing involves diverse data types: - -- **Neuroscience**: Spike trains, neural networks, connectivity graphs -- **Imaging**: Medical images, microscopy stacks, point clouds -- **Genomics**: Sequence alignments, phylogenetic trees, variant calls -- **Physics**: Simulation meshes, particle systems, field data - -Rather than forcing everything into NumPy arrays or JSON, codecs let you work -with native data structures while DataJoint handles storage transparently. - -## The Codec Contract - -A codec defines two operations: - -```mermaid -graph LR - A[Python Object] -->|encode| B[Storable Form] - B -->|decode| A -``` - -| Method | Input | Output | When Called | -|--------|-------|--------|-------------| -| `encode()` | Python object | bytes, dict, or another codec's input | On `insert()` | -| `decode()` | Stored data | Python object | On `fetch()` | - -## Creating a Custom Codec - -### Basic Structure - -```python -import datajoint as dj - -class MyCodec(dj.Codec): - """Store custom objects.""" - name = "mytype" # Used as in definitions - - def get_dtype(self, is_external: bool) -> str: - """Return storage type.""" - return "" # Chain to blob serialization - - def encode(self, value, *, key=None, store_name=None): - """Convert Python object to storable form.""" - return serialize(value) - - def decode(self, stored, *, key=None): - """Convert stored form back to Python object.""" - return deserialize(stored) -``` - -### Auto-Registration - -Codecs register automatically when the class is defined—no decorator needed: - -```python -class GraphCodec(dj.Codec): - name = "graph" # Immediately available as - ... 
- -# Check registration -assert "graph" in dj.list_codecs() -``` - -## Example: NetworkX Graphs - -```python -import networkx as nx -import datajoint as dj - -class GraphCodec(dj.Codec): - """Store NetworkX graphs as adjacency data.""" - name = "graph" - - def get_dtype(self, is_external: bool) -> str: - # Store as blob (internal) or hash-addressed (external) - return "" if is_external else "" - - def encode(self, graph, *, key=None, store_name=None): - """Serialize graph to dict.""" - return { - 'directed': graph.is_directed(), - 'nodes': list(graph.nodes(data=True)), - 'edges': list(graph.edges(data=True)), - } - - def decode(self, stored, *, key=None): - """Reconstruct graph from dict.""" - cls = nx.DiGraph if stored['directed'] else nx.Graph - G = cls() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G -``` - -Usage: - -```python -@schema -class Connectivity(dj.Computed): - definition = """ - -> Neurons - --- - network : # Small graphs in database - full_network : # Large graphs in object storage - """ - - def make(self, key): - # Build connectivity graph - G = nx.DiGraph() - G.add_edges_from(compute_connections(key)) - - self.insert1({**key, 'network': G, 'full_network': G}) - -# Fetch returns NetworkX graph directly -graph = (Connectivity & key).fetch1('network') -print(f"Nodes: {graph.number_of_nodes()}") -``` - -## Example: Domain-Specific Formats - -### Genomics: Pysam Alignments - -```python -import pysam -import tempfile -from pathlib import Path - -class BamCodec(dj.Codec): - """Store BAM alignments.""" - name = "bam" - - def get_dtype(self, is_external: bool) -> str: - if not is_external: - raise dj.DataJointError(" requires external storage: use ") - return "" # Path-addressed storage for file structure - - def encode(self, alignments, *, key=None, store_name=None): - """Write alignments to BAM format.""" - # alignments is a pysam.AlignmentFile or list of reads - # Storage handled by codec - return alignments - - def decode(self, stored, *, key=None): - """Return ObjectRef for lazy BAM access.""" - return stored # ObjectRef with .open() method -``` - -### Medical Imaging: SimpleITK - -```python -import SimpleITK as sitk -import io - -class MedicalImageCodec(dj.Codec): - """Store medical images with metadata.""" - name = "medimg" - - def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "" - - def encode(self, image, *, key=None, store_name=None): - """Serialize SimpleITK image.""" - # Preserve spacing, origin, direction - buffer = io.BytesIO() - sitk.WriteImage(image, buffer, imageIO='NrrdImageIO') - return { - 'data': buffer.getvalue(), - 'spacing': image.GetSpacing(), - 'origin': image.GetOrigin(), - } - - def decode(self, stored, *, key=None): - """Reconstruct SimpleITK image.""" - buffer = io.BytesIO(stored['data']) - return sitk.ReadImage(buffer) -``` - -## Codec Chaining - -Codecs can chain to other codecs via `get_dtype()`: - -```mermaid -graph LR - A["‹graph›"] -->|get_dtype| B["‹blob›"] - B -->|get_dtype| C["bytes"] - C -->|MySQL| D["LONGBLOB"] -``` - -```python -class CompressedGraphCodec(dj.Codec): - name = "cgraph" - - def get_dtype(self, is_external: bool) -> str: - return "" # Chain to graph codec - - def encode(self, graph, *, key=None, store_name=None): - # Simplify before passing to graph codec - return nx.to_sparse6_bytes(graph) - - def decode(self, stored, *, key=None): - return nx.from_sparse6_bytes(stored) -``` - -## Storage Mode Support - -### Internal Only - -```python -class 
SmallDataCodec(dj.Codec): - name = "small" - - def get_dtype(self, is_external: bool) -> str: - if is_external: - raise dj.DataJointError(" is internal-only") - return "json" -``` - -### External Only - -```python -class LargeDataCodec(dj.Codec): - name = "large" - - def get_dtype(self, is_external: bool) -> str: - if not is_external: - raise dj.DataJointError(" requires @: use ") - return "" -``` - -### Both Modes - -```python -class FlexibleCodec(dj.Codec): - name = "flex" - - def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "" -``` - -## Validation - -Add validation to catch errors early: - -```python -class StrictGraphCodec(dj.Codec): - name = "strictgraph" - - def validate(self, value): - """Called before encode().""" - if not isinstance(value, nx.Graph): - raise dj.DataJointError( - f"Expected NetworkX graph, got {type(value).__name__}" - ) - if value.number_of_nodes() == 0: - raise dj.DataJointError("Graph must have at least one node") - - def encode(self, graph, *, key=None, store_name=None): - self.validate(graph) - return {...} -``` - -## Best Practices - -### 1. Choose Appropriate Storage - -| Data Size | Recommendation | -|-----------|----------------| -| < 1 KB | `json` or `` | -| 1 KB - 10 MB | `` or `` | -| > 10 MB | `` or `` | -| File structures | `` | - -### 2. Preserve Metadata - -```python -def encode(self, obj, *, key=None, store_name=None): - return { - 'data': serialize(obj), - 'version': '1.0', # For future compatibility - 'dtype': str(obj.dtype), - 'shape': obj.shape, - } -``` - -### 3. Handle Versioning - -```python -def decode(self, stored, *, key=None): - version = stored.get('version', '0.9') - if version == '1.0': - return deserialize_v1(stored) - else: - return deserialize_legacy(stored) -``` - -### 4. Document Your Codec - -```python -class WellDocumentedCodec(dj.Codec): - """ - Store XYZ data structures. - - Supports both internal () and external () storage. - - Examples - -------- - >>> @schema - ... class Results(dj.Computed): - ... definition = ''' - ... -> Experiment - ... --- - ... output : - ... ''' - """ - name = "xyz" -``` - -## Summary - -Custom codecs enable: - -1. **Domain-specific types** — Work with native data structures -2. **Transparent storage** — DataJoint handles serialization -3. **Flexible backends** — Internal, external, or both -4. **Composability** — Chain codecs for complex transformations -5. **Validation** — Catch errors before storage - -The codec system makes DataJoint extensible to any scientific domain without -modifying the core framework. - - ---- -## File: explanation/data-pipelines.md - -# Scientific Data Pipelines - -A **scientific data pipeline** extends beyond a database with computations. 
It is a comprehensive system that: - -- Manages the complete lifecycle of scientific data from acquisition to delivery -- Integrates diverse tools for data entry, visualization, and analysis -- Provides infrastructure for secure, scalable computation -- Enables collaboration across teams and institutions -- Supports reproducibility and provenance tracking throughout - -## Pipeline Architecture - -A DataJoint pipeline integrates three core components: - -![DataJoint Platform Architecture](../images/dj-platform.png) - -| Component | Purpose | -|-----------|---------| -| **Code Repository** | Version-controlled pipeline definitions, `make` methods, configuration | -| **Relational Database** | System of record for metadata, relationships, and integrity enforcement | -| **Object Store** | Scalable storage for large scientific data (images, recordings, signals) | - -These components work together: code defines the schema and computations, the database tracks all metadata and relationships, and object storage holds the large scientific data files. - -## Pipeline as a DAG - -A DataJoint pipeline forms a **Directed Acyclic Graph (DAG)** at two levels: - -![Pipeline DAG Structure](../images/pipeline-illustration.png) - -**Nodes** represent Python modules, which correspond to database schemas. - -**Edges** represent: - -- Python import dependencies between modules -- Bundles of foreign key references between schemas - -This dual structure ensures that both code dependencies and data dependencies flow in the same direction. - -### DAG Constraints - -> **All foreign key relationships within a schema MUST form a DAG.** -> -> **Dependencies between schemas (foreign keys + imports) MUST also form a DAG.** - -This constraint is fundamental to DataJoint's design. It ensures: - -- **Unidirectional data flow** — Data enters at the top and flows downstream -- **Clear provenance** — Every result traces back to its inputs -- **Safe deletion** — Cascading deletes follow the DAG without cycles -- **Predictable computation** — `populate()` can determine correct execution order - -## The Relational Workflow Model - -DataJoint pipelines are built on the [Relational Workflow Model](relational-workflow-model.md)—a paradigm that extends relational databases with native support for computational workflows. In this model: - -- **Tables represent workflow steps**, not just data storage -- **Foreign keys encode dependencies**, prescribing the order of operations -- **Table tiers** (Lookup, Manual, Imported, Computed) classify how data enters the pipeline -- **The schema forms a DAG** that defines valid execution sequences - -This model treats the database schema as an **executable workflow specification**—defining not just what data exists but when and how it comes into existence. - -## Schema Organization - -Each schema corresponds to a dedicated Python module. The module import structure mirrors the foreign key dependencies between schemas: - -![Schema Structure](../images/schema-illustration.png) - -``` -my_pipeline/ -├── src/ -│ └── my_pipeline/ -│ ├── __init__.py -│ ├── subject.py # subject schema (no dependencies) -│ ├── session.py # session schema (depends on subject) -│ ├── acquisition.py # acquisition schema (depends on session) -│ └── analysis.py # analysis schema (depends on acquisition) -``` - -For practical guidance on organizing multi-schema pipelines, configuring repositories, and managing team access, see [Manage a Pipeline Project](../how-to/manage-pipeline-project.md). 
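To make the parallel between module imports and foreign keys concrete, here is a minimal sketch of what a `session.py` module might look like under the layout above. The module, schema, and attribute names are illustrative, not prescribed by DataJoint:

```python
# src/my_pipeline/session.py (illustrative sketch)
import datajoint as dj

from . import subject  # the Python import mirrors the foreign key dependency

schema = dj.Schema("my_pipeline_session")  # hypothetical schema name


@schema
class Session(dj.Manual):
    definition = """
    -> subject.Subject      # foreign key into the upstream schema
    session_idx : uint16    # session number within the subject
    ---
    session_date : date
    experimenter : varchar(60)
    """
```

Because `session` imports `subject` and `Session` references `subject.Subject`, the code dependency and the data dependency point in the same direction, keeping both levels of the DAG aligned.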
- -## Object-Augmented Schemas - -Scientific data often includes large objects—images, recordings, time series, instrument outputs—that don't fit efficiently in relational tables. DataJoint addresses this through **Object-Augmented Schemas (OAS)**, a hybrid storage architecture that preserves relational semantics while handling arbitrarily large data. - -### The OAS Philosophy - -**1. The database remains the system of record.** - -All metadata, relationships, and query logic live in the relational database. The schema defines what data exists, how entities relate, and what computations produce them. Queries operate on the relational structure; results are consistent and reproducible. - -**2. Large objects live in external stores.** - -Object storage (filesystems, S3, GCS, Azure Blob, MinIO) holds the actual bytes—arrays, images, files. The database stores only lightweight references (paths, checksums, metadata). This separation lets the database stay fast while data scales to terabytes. - -**3. Transparent access through codecs.** - -DataJoint's [type system](type-system.md) provides codec types that bridge Python objects and storage: - -| Codec | Purpose | -|-------|---------| -| `` | Serialize Python objects (NumPy arrays, dicts) | -| `` | Same, but stored externally | -| `` | Store files with preserved filenames | -| `` | Path-addressed storage for complex structures (Zarr, HDF5) | -| `` | References to externally-managed files | - -Users work with native Python objects; serialization and storage routing are invisible. - -**4. Referential integrity extends to objects.** - -When a database row is deleted, its associated external objects are garbage-collected. Foreign key cascades work correctly—delete upstream data and downstream results (including their objects) disappear. The database and object store remain synchronized without manual cleanup. - -**5. Multiple storage tiers support diverse access patterns.** - -Different attributes can route to different stores: - -```python -class Recording(dj.Imported): - definition = """ - -> Session - --- - raw_data : # Hot storage for active analysis - archive : # Cold storage for long-term retention - """ -``` - -This architecture lets teams work with terabyte-scale datasets while retaining the query power, integrity guarantees, and reproducibility of the relational model. - -## Pipeline Workflow - -A typical data pipeline workflow: - -1. **Acquisition** — Data is collected from instruments, experiments, or external sources. Raw files land in object storage; metadata populates Manual tables. - -2. **Import** — Automated processes parse raw data, extract signals, and populate Imported tables with structured results. - -3. **Computation** — The `populate()` mechanism identifies new data and triggers downstream processing. Compute resources execute transformations and populate Computed tables. - -4. **Query & Analysis** — Users query results across the pipeline, combining data from multiple stages to generate insights, reports, or visualizations. - -5. **Collaboration** — Team members access the same database concurrently, building on shared results. Foreign key constraints maintain consistency. - -6. **Delivery** — Processed results are exported, integrated into downstream systems, or archived according to project requirements. - -Throughout this process, the schema definition remains the single source of truth. 
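As a small sketch of step 4, a downstream analysis can combine stages with the query operators and fetch the result. The tables and attributes below reuse the earlier `Scan`/`Segmentation` examples and are illustrative:

```python
# Restrict acquisitions by metadata, join with computed results,
# and keep only the attributes of interest (illustrative names)
high_quality = Scan & 'quality > 0.8'
summary = (Segmentation * high_quality).proj('num_cells', 'quality')

df = summary.to_pandas()   # fetch into a pandas DataFrame for analysis
```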
- -## Comparing Approaches - -| Aspect | File-Based Approach | DataJoint Pipeline | -|--------|--------------------|--------------------| -| **Data Structure** | Implicit in filenames/folders | Explicit in schema definition | -| **Dependencies** | Encoded in scripts | Declared through foreign keys | -| **Provenance** | Manual tracking | Automatic through referential integrity | -| **Reproducibility** | Requires careful discipline | Built into the model | -| **Collaboration** | File sharing/conflicts | Concurrent database access | -| **Queries** | Custom scripts per question | Composable query algebra | -| **Scalability** | Limited by filesystem | Database + object-augmented storage | - -The pipeline approach requires upfront investment in schema design. This investment pays dividends through reduced errors, improved reproducibility, and efficient collaboration as projects scale. - -## Summary - -Scientific data pipelines extend the Relational Workflow Model into complete data operations systems: - -- **Pipeline Architecture** — Code repository, relational database, and object store working together -- **DAG Structure** — Unidirectional flow of data and dependencies -- **Object-Augmented Schemas** — Scalable storage with relational semantics - -The schema remains central—defining data structures, dependencies, and computational flow. This pipeline-centric approach lets teams focus on their science while the system handles data integrity, provenance, and reproducibility automatically. - -## See Also - -- [Relational Workflow Model](relational-workflow-model.md) — The conceptual foundation -- [Entity Integrity](entity-integrity.md) — Primary keys and dimensions -- [Type System](type-system.md) — Codec types and storage modes -- [Manage a Pipeline Project](../how-to/manage-pipeline-project.md) — Practical project organization - - ---- -## File: explanation/entity-integrity.md - -# Entity Integrity - -**Entity integrity** ensures a one-to-one correspondence between real-world -entities and their database records. This is the foundation of reliable data -management. - -## The Core Guarantee - -- Each real-world entity → exactly one database record -- Each database record → exactly one real-world entity - -Without entity integrity, databases become unreliable: - -| Integrity Failure | Consequence | -|-------------------|-------------| -| Same entity, multiple records | Fragmented data, conflicting information | -| Multiple entities, same record | Mixed data, privacy violations | -| Cannot match entity to record | Lost data, broken workflows | - -## The Three Questions - -When designing a primary key, answer these three questions: - -### 1. How do I prevent duplicate records? - -Ensure the same entity cannot appear twice in the table. - -### 2. How do I prevent record sharing? - -Ensure different entities cannot share the same record. - -### 3. How do I match entities to records? - -When an entity arrives, how do I find its corresponding record? - -## Example: Laboratory Mouse Database - -Consider a neuroscience lab tracking mice: - -| Question | Answer | -|----------|--------| -| Prevent duplicates? | Each mouse gets a unique ear tag at arrival; database rejects duplicate tags | -| Prevent sharing? | Ear tags are never reused; retired tags are archived | -| Match entities? 
| Read the ear tag → look up record by primary key | - -```python -@schema -class Mouse(dj.Manual): - definition = """ - ear_tag : char(6) # unique ear tag (e.g., 'M00142') - --- - date_of_birth : date - sex : enum('M', 'F', 'U') - strain : varchar(50) - """ -``` - -The database enforces the first two questions through the primary key constraint. -The third question requires a **physical identification system**—ear tags, -barcodes, or RFID chips that link physical entities to database records. - -## Primary Key Requirements - -In DataJoint, every table must have a primary key. Primary key attributes: - -- **Cannot be NULL** — Every entity must be identifiable -- **Must be unique** — No two entities share the same key -- **Cannot be changed** — Keys are immutable after insertion -- **Declared above the `---` line** — Syntactic convention - -## Natural Keys vs. Surrogate Keys - -### Natural Keys - -Use attributes that naturally identify entities in your domain: - -```python -@schema -class Gene(dj.Lookup): - definition = """ - gene_symbol : varchar(20) # Official gene symbol (e.g., 'BRCA1') - --- - full_name : varchar(200) - chromosome : varchar(5) - """ -``` - -**Advantages:** - -- Meaningful to humans -- Self-documenting -- No additional lookup needed - -### Surrogate Keys - -A **surrogate key** is an identifier used *primarily inside* the database, with minimal or no exposure to end users. Users typically don't search for entities by surrogate keys or use them in conversation. - -```python -@schema -class InternalRecord(dj.Manual): - definition = """ - record_id : uuid # internal identifier, not exposed to users - --- - created_timestamp : datetime(3) - data : - """ -``` - -**Key distinction from natural keys:** Surrogate keys don't require external identification systems because users don't need to match physical entities to records by these keys. - -**When surrogate keys are appropriate:** - -- Entities that exist only within the system (no physical counterpart) -- Privacy-sensitive contexts where natural identifiers shouldn't be stored -- Internal system records that users never reference directly - -**Generating surrogate keys:** DataJoint requires explicit key values rather than database-generated auto-increment. This is intentional: - -- Auto-increment encourages treating keys as "row numbers" rather than entity identifiers -- It's incompatible with composite keys, which DataJoint uses extensively -- It breaks reproducibility (different IDs when rebuilding pipelines) -- It prevents the client-server handshake needed for proper entity integrity - -Use client-side generation instead: - -- **UUIDs** — Generate with `uuid.uuid4()` before insertion -- **ULIDs** — Sortable unique IDs -- **Client-side counters** — Query max value and increment - -**DataJoint recommendation:** Prefer natural keys when they're stable and -meaningful. Use surrogates only when no natural identifier exists or for -privacy-sensitive contexts. - -## Composite Keys - -When no single attribute uniquely identifies an entity, combine multiple -attributes: - -```python -@schema -class Recording(dj.Manual): - definition = """ - -> Session - recording_idx : uint16 # Recording number within session - --- - duration : float32 # seconds - """ -``` - -Here, `(subject_id, session_idx, recording_idx)` together form the primary key. -Neither alone would be unique. - -## Foreign Keys and Dependencies - -Foreign keys in DataJoint serve dual purposes: - -1. **Referential integrity** — Ensures referenced entities exist -2. 
**Workflow dependency** — Declares that this entity depends on another - -```python -@schema -class Segmentation(dj.Computed): - definition = """ - -> Scan # Depends on Scan - --- - num_cells : uint32 - """ -``` - -The arrow `->` inherits the primary key from `Scan` and establishes both -referential integrity and workflow dependency. - -## Schema Dimensions - -A **dimension** is an independent axis of variation in your data. The fundamental principle: - -> **Any table that introduces a new primary key attribute introduces a new dimension.** - -This is true whether the table has only new attributes or also inherits attributes from foreign keys. The key is simply: new primary key attribute = new dimension. - -### Tables That Introduce Dimensions - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) # NEW dimension: subject_id - --- - species : varchar(50) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject # Inherits subject_id - session_idx : uint16 # NEW dimension: session_idx - --- - session_date : date - """ - -@schema -class Trial(dj.Manual): - definition = """ - -> Session # Inherits subject_id, session_idx - trial_idx : uint16 # NEW dimension: trial_idx - --- - outcome : enum('success', 'fail') - """ -``` - -**All three tables introduce dimensions:** - -- `Subject` introduces `subject_id` dimension -- `Session` introduces `session_idx` dimension (even though it also inherits `subject_id`) -- `Trial` introduces `trial_idx` dimension (even though it also inherits `subject_id`, `session_idx`) - -In schema diagrams, tables that introduce at least one new dimension have **underlined names**. - -### Tables That Don't Introduce Dimensions - -A table introduces **no dimensions** when its entire primary key comes from foreign keys: - -```python -@schema -class SubjectProfile(dj.Manual): - definition = """ - -> Subject # Inherits subject_id only - --- - weight : float32 - """ -``` - -`SubjectProfile` doesn't introduce any new primary key attribute—it extends the `Subject` dimension with additional attributes. There's exactly one profile per subject. - -In schema diagrams, these tables have **non-underlined names**. - -### Computed Tables and Dimensions - -**Computed tables never introduce dimensions.** Their primary key is entirely -inherited from their dependencies: - -```python -@schema -class SessionSummary(dj.Computed): - definition = """ - -> Session # PK = (subject_id, session_idx) - --- - num_trials : uint32 - accuracy : float32 - """ -``` - -This makes sense—computed tables derive data from existing entities rather -than introducing new ones. - -### Part Tables CAN Introduce Dimensions - -Unlike computed tables, **part tables can introduce new dimensions**: - -```python -@schema -class Detection(dj.Computed): - definition = """ - -> Image # Inherits image_id - -> DetectionParams # Inherits params_id - --- - num_blobs : uint32 - """ - - class Blob(dj.Part): - definition = """ - -> master # Inherits (image_id, params_id) - blob_idx : uint16 # NEW dimension within detection - --- - x : float32 - y : float32 - """ -``` - -`Detection` inherits dimensions (no underline in diagram), but `Detection.Blob` -introduces a new dimension (`blob_idx`) for individual blobs within each -detection. - -### Dimensions and Attribute Lineage - -Every foreign key attribute traces back to the dimension where it was first -defined. 
This is called **attribute lineage**: - -``` -Subject.subject_id → myschema.subject.subject_id (origin) -Session.subject_id → myschema.subject.subject_id (inherited via foreign key) -Session.session_idx → myschema.session.session_idx (origin) -Trial.subject_id → myschema.subject.subject_id (inherited via foreign key) -Trial.session_idx → myschema.session.session_idx (inherited via foreign key) -Trial.trial_idx → myschema.trial.trial_idx (origin) -``` - -Lineage enables **semantic matching**—DataJoint only joins attributes that -trace back to the same dimension. Two attributes named `id` from different -dimensions cannot be accidentally joined. - -See [Semantic Matching](../reference/specs/semantic-matching.md) for details. - -### Recognizing Dimensions in Diagrams - -In schema diagrams: - -| Visual | Meaning | -|--------|---------| -| **Underlined name** | Introduces at least one new dimension | -| Non-underlined name | All PK attributes inherited (no new dimensions) | -| **Thick solid line** | One-to-one extension (no new dimension) | -| **Thin solid line** | Containment (may introduce dimension) | - -Common dimensions in neuroscience: - -- **Subject** — Who/what is being studied -- **Session** — When data was collected -- **Trial** — Individual experimental unit -- **Modality** — Type of data (ephys, imaging, behavior) -- **Parameter set** — Configuration for analysis - -Understanding dimensions helps design schemas that naturally express your -experimental structure and ensures correct joins through semantic matching. - -## Best Practices - -1. **Answer the three questions** before designing any table -2. **Choose stable identifiers** that won't need to change -3. **Keep keys minimal** — Include only what's necessary for uniqueness -4. **Document key semantics** — Explain what the key represents -5. **Consider downstream queries** — Keys affect join performance - -## Common Mistakes - -### Too few key attributes - -```python -# Wrong: experiment_id alone isn't unique -class Trial(dj.Manual): - definition = """ - experiment_id : uint32 - --- - trial_number : uint16 # Should be part of key! - result : float32 - """ -``` - -### Too many key attributes - -```python -# Wrong: timestamp makes every row unique, losing entity semantics -class Measurement(dj.Manual): - definition = """ - subject_id : uint32 - timestamp : datetime(6) # Microsecond precision - --- - value : float32 - """ -``` - -### Mutable natural keys - -```python -# Risky: names can change -class Patient(dj.Manual): - definition = """ - patient_name : varchar(100) # What if they change their name? - --- - date_of_birth : date - """ -``` - -## Summary - -Entity integrity is maintained by: - -1. **Primary keys** that uniquely identify each entity -2. **Foreign keys** that establish valid references -3. **Physical systems** that link real-world entities to records - -The three questions framework ensures your primary keys provide meaningful, -stable identification for your domain entities. - - ---- -## File: explanation/faq.md - -# Frequently Asked Questions - -## Why Does DataJoint Have Its Own Definition and Query Language? - -DataJoint provides a custom data definition language and [query algebra](query-algebra.md) rather than using raw SQL or Object-Relational Mapping (ORM) patterns. This design reflects DataJoint's purpose: enabling research teams to build **[relational workflows](relational-workflow-model.md) with embedded computations** with maximum clarity. 
These concepts were first formalized in [Yatsenko et al., 2018](https://doi.org/10.48550/arXiv.1807.11104). - -### The Definition Language - -DataJoint's [definition language](../reference/specs/table-declaration.md) is a standalone scripting language for declaring table schemas — not Python syntax embedded in strings. It is designed for uniform support across multiple host languages (Python, MATLAB, and potentially others). The same definition works identically regardless of which language you use. - -### Composite Primary Keys: A Clarity Comparison - -Scientific workflows frequently use composite primary keys built from foreign keys. Compare how different approaches handle this common pattern: - -```python -# DataJoint - two characters declare dependency, foreign key, and inherit primary key -class Scan(dj.Manual): - definition = """ - -> Session - scan_idx : int16 - --- - depth : float32 - """ -``` - -```python -# SQLAlchemy - verbose, scattered, error-prone -class Scan(Base): - subject_id = Column(Integer, primary_key=True) - session_date = Column(Date, primary_key=True) - scan_idx = Column(SmallInteger, primary_key=True) - depth = Column(Float) - __table_args__ = ( - ForeignKeyConstraint( - ['subject_id', 'session_date'], - ['session.subject_id', 'session.session_date'] - ), - ) -``` - -```sql --- Raw SQL - maximum verbosity -CREATE TABLE scan ( - subject_id INT NOT NULL, - session_date DATE NOT NULL, - scan_idx SMALLINT NOT NULL, - depth FLOAT, - PRIMARY KEY (subject_id, session_date, scan_idx), - FOREIGN KEY (subject_id, session_date) - REFERENCES session(subject_id, session_date) -); -``` - -The `-> Session` syntax in DataJoint: - -- Inherits all primary key attributes from Session -- Declares the foreign key constraint -- Establishes the computational dependency (for `populate()`) -- Documents the data lineage - -All in two characters. As pipelines grow to dozens of tables with deep dependency chains, this clarity compounds. - -### Why Multiline Strings? - -| Aspect | Benefit | -|--------|---------| -| **Readable** | Looks like a specification: `---` separates primary from secondary attributes, `#` for comments | -| **Concise** | `mouse_id : int32` vs `mouse_id = Column(Integer, primary_key=True)` | -| **Database-first** | `table.describe()` shows the same format; virtual schemas reconstruct definitions from database metadata | -| **Language-agnostic** | Same syntax for Python, MATLAB, future implementations | -| **Separation of concerns** | Definition string = structure (what); class = behavior (how: `make()` methods) | - -The definition string **is** the specification — a declarative language that describes entities and their relationships, independent of any host language's syntax. - -### Why Custom Query Algebra? - -DataJoint's operators implement **[semantic matching](../reference/specs/semantic-matching.md)** — joins and restrictions match only on attributes connected through the foreign key graph, not arbitrary columns that happen to share a name. This prevents: -- Accidental Cartesian products -- Joins on unrelated columns -- Silent incorrect results - -Every query result has a defined **[entity type](entity-integrity.md)** with a specific [primary key](../reference/specs/primary-keys.md) (algebraic closure). SQL results are untyped bags of rows; DataJoint results are entity sets you can continue to query and compose. - -### Object-Augmented Schemas - -Object-Relational Mappers treat large objects as opaque binary blobs or leave file management to the application. 
DataJoint's object store **extends the relational schema** (see [Type System](type-system.md)): - -- Relational semantics apply: referential integrity, cascading deletes, query filtering -- Multiple access patterns: lazy `ObjectRef`, streaming via fsspec, explicit download -- Two addressing modes: path-addressed (by primary key) and hash-addressed (deduplicated) - -The object store is part of the relational model — queryable and integrity-protected like any other attribute. - -### Summary - -| Aspect | Raw SQL | Object-Relational Mappers | DataJoint | -|--------|---------|---------------------------|-----------| -| Schema definition | SQL Data Definition Language | Host language classes | Standalone definition language | -| Composite foreign keys | Verbose, repetitive | Verbose, scattered | `-> TableName` | -| Query model | SQL strings | Object navigation | Relational algebra operators | -| Dependencies | Implicit in application | Implicit in application | Explicit in schema | -| Large objects | Binary blobs / manual | Binary blobs / manual | Object-Augmented Schema | -| Computation | External to database | External to database | First-class ([Computed tables](computation-model.md)) | -| Target audience | Database administrators | Web developers | Research teams | - ---- - -## Is DataJoint an ORM? - -**Object-Relational Mapping (ORM)** is a technique for interacting with relational databases through object-oriented programming, abstracting direct SQL queries. Popular Python ORMs include SQLAlchemy, Django ORM, and Peewee, often used in web development. - -DataJoint shares certain ORM characteristics—tables are defined as Python classes, and queries return Python objects. However, DataJoint is fundamentally a **computational database framework** designed for scientific workflows: - -| Aspect | Traditional ORMs | DataJoint | -|--------|-----------------|-----------| -| Primary use case | Web applications | Scientific data pipelines | -| Focus | Simplify database CRUD | Data integrity + computation | -| Dependencies | Implicit (application logic) | Explicit (foreign keys define data flow) | -| Computation | External to database | First-class citizen in schema | -| Query model | Object navigation | Relational algebra | - -DataJoint can be considered an **ORM specialized for scientific databases**—purpose-built for structured experimental data and computational workflows where reproducibility and [data integrity](entity-integrity.md) are paramount. - -## Is DataJoint a Workflow Management System? - -Not exactly. 
DataJoint and workflow management systems (Airflow, Prefect, Flyte, Nextflow, Snakemake) solve related but distinct problems: - -| Aspect | Workflow Managers | DataJoint | -|--------|-------------------|-----------| -| Core abstraction | Tasks and DAGs | Tables and dependencies | -| State management | External (files, databases) | Integrated (relational database) | -| Scheduling | Built-in schedulers | External (or manual `populate()`) | -| Distributed execution | Built-in | Via external tools | -| Data model | Unstructured (files, blobs) | Structured (relational schema) | -| Query capability | Limited | Full relational algebra | - -**DataJoint excels at:** - -- Defining *what* needs to be computed based on data dependencies -- Ensuring computations are never duplicated -- Maintaining referential integrity across pipeline stages -- Querying intermediate and final results - -**Workflow managers excel at:** - -- Scheduling and orchestrating job execution -- Distributing work across clusters -- Retry logic and failure handling -- Resource management - -**They complement each other.** DataJoint formalizes data dependencies so that external schedulers can effectively manage computational tasks. A common pattern: - -1. DataJoint defines the pipeline structure and tracks what's computed -2. A workflow manager (or simple cron/SLURM scripts) calls [`populate()`](computation-model.md) on a schedule -3. DataJoint determines what work remains and executes it - -## Is DataJoint a Lakehouse? - -DataJoint and lakehouses share goals—integrating structured data management with scalable storage and computation. However, they differ in approach: - -| Aspect | Lakehouse | DataJoint | -|--------|-----------|-----------| -| Data model | Flexible (structured + semi-structured) | Strict relational schema | -| Schema enforcement | Schema-on-read optional | Schema-on-write enforced | -| Primary use | Analytics on diverse data | Scientific workflows | -| Computation model | SQL/Spark queries | Declarative `make()` methods | -| Dependency tracking | Limited | Explicit via foreign keys | - -A **lakehouse** merges data lake flexibility with data warehouse structure, optimized for analytics workloads. - -**DataJoint** prioritizes: - -- Rigorous schema definitions -- Explicit computational dependencies -- Data integrity and reproducibility -- Traceability within structured scientific datasets - -DataJoint can complement lakehouse architectures—using object storage for large files while maintaining relational structure for metadata and provenance. - -## Does DataJoint Require SQL Knowledge? - -No. DataJoint provides a Python API that abstracts SQL: - -| SQL | DataJoint | -|-----|-----------| -| `CREATE TABLE` | Define tables as Python classes | -| `INSERT INTO` | `.insert()` method | -| `SELECT * FROM` | `.to_arrays()`, `.to_dicts()`, `.to_pandas()` | -| `JOIN` | `table1 * table2` | -| `WHERE` | `table & condition` | -| `GROUP BY` | `.aggr()` | - -Understanding relational concepts ([primary keys](entity-integrity.md), foreign keys, [normalization](normalization.md)) is helpful but not required to start. The [tutorials](../tutorials/index.md) teach these concepts progressively. - -Since DataJoint uses standard database backends (MySQL, PostgreSQL), data remains accessible via SQL for users who prefer it or need integration with other tools. - -## How Does DataJoint Handle Large Files? 
- -DataJoint uses a hybrid storage model called **Object-Augmented Schemas (OAS)**: - -- **Relational database**: Stores metadata, parameters, and relationships -- **Object storage**: Stores large files (images, recordings, arrays) - -The database maintains references to external objects, preserving: - -- Referential integrity (files deleted with their parent records) -- Query capability (filter by metadata, join across tables) -- Deduplication (identical content stored once) - -See [Object Storage](../how-to/use-object-storage.md) for details. - -## Can Multiple Users Share a Pipeline? - -Yes. DataJoint pipelines are inherently collaborative: - -- **Shared database**: All users connect to the same MySQL/PostgreSQL instance -- **Shared schema**: Table definitions are stored in the database -- **Concurrent access**: ACID transactions prevent conflicts -- **Job reservation**: `populate()` coordinates work across processes - -Teams typically: - -1. Share pipeline code via Git -2. Connect to a shared database server -3. Run `populate()` from multiple machines simultaneously - -See [Distributed Computing](../how-to/distributed-computing.md) for multi-process patterns. - - ---- -## File: explanation/index.md - -# Concepts - -Understanding the principles behind DataJoint. - -DataJoint implements the **Relational Workflow Model**—a paradigm that extends -relational databases with native support for computational workflows. This section -explains the core concepts that make DataJoint pipelines reliable, reproducible, -and scalable. - -## Core Concepts - -
- -- :material-sitemap: **[Relational Workflow Model](relational-workflow-model.md)** - - How DataJoint differs from traditional databases. The paradigm shift from - storage to workflow. - -- :material-key: **[Entity Integrity](entity-integrity.md)** - - Primary keys and the three questions. Ensuring one-to-one correspondence - between entities and records. - -- :material-table-split-cell: **[Normalization](normalization.md)** - - Schema design principles. Organizing tables around workflow steps to - minimize redundancy. - -- :material-set-split: **[Query Algebra](query-algebra.md)** - - The five operators: restriction, join, projection, aggregation, union. - Workflow-aware query semantics. - -- :material-code-tags: **[Type System](type-system.md)** - - Three-layer architecture: native, core, and codec types. Internal and - external storage modes. - -- :material-cog-play: **[Computation Model](computation-model.md)** - - AutoPopulate and Jobs 2.0. Automated, reproducible, distributed computation. - -- :material-puzzle: **[Custom Codecs](custom-codecs.md)** - - Extend DataJoint with domain-specific types. The codec extensibility system. - -- :material-pipe: **[Data Pipelines](data-pipelines.md)** - - From workflows to complete data operations systems. Project structure and - object-augmented schemas. - -- :material-frequently-asked-questions: **[FAQ](faq.md)** - - How DataJoint compares to ORMs, workflow managers, and lakehouses. - Common questions answered. - -
- -## Why These Concepts Matter - -Traditional databases store data. DataJoint pipelines **process** data. Understanding -the Relational Workflow Model helps you: - -- Design schemas that naturally express your workflow -- Write queries that are both powerful and intuitive -- Build computations that scale from laptop to cluster -- Maintain data integrity throughout the pipeline lifecycle - - ---- -## File: explanation/normalization.md - -# Schema Normalization - -Schema normalization ensures data integrity by organizing tables to minimize -redundancy and prevent update anomalies. DataJoint's workflow-centric approach -makes normalization intuitive. - -## The Workflow Normalization Principle - -> **"Every table represents an entity type that is created at a specific step -> in a workflow, and all attributes describe that entity as it exists at that -> workflow step."** - -This principle naturally leads to well-normalized schemas. - -## The Intrinsic Attributes Principle - -> **"Each entity should contain only its intrinsic attributes—properties that are inherent to the entity itself. Relationships, assignments, and events that happen over time belong in separate tables."** - -**Full workflow entity normalization** is achieved when: - -1. Each row represents a single, well-defined entity -2. Each entity is entered once when first tracked -3. Events that happen at later stages belong in separate tables - -## Why Normalization Matters - -Without normalization, databases suffer from: - -- **Redundancy** — Same information stored multiple times -- **Update anomalies** — Changes require updating multiple rows -- **Insertion anomalies** — Can't add data without unrelated data -- **Deletion anomalies** — Deleting data loses unrelated information - -## DataJoint's Approach - -Traditional normalization analyzes **functional dependencies** to determine -table structure. DataJoint takes a different approach: design tables around -**workflow steps**. - -### Example: Mouse Housing - -**Problem: Cage is not intrinsic to a mouse.** A mouse's cage can change over time. The cage assignment is an **event** that happens after the mouse is first tracked. - -**Denormalized (problematic):** - -```python -# Wrong: cage info repeated for every mouse -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - cage_id : int32 - cage_location : varchar(50) # Redundant! - cage_temperature : float32 # Redundant! - weight : float32 - """ -``` - -**Partially normalized (better, but not complete):** - -```python -@schema -class Cage(dj.Manual): - definition = """ - cage_id : int32 - --- - location : varchar(50) - """ - -@schema -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - -> Cage # Still treats cage as static attribute - """ -``` - -**Fully normalized (correct):** - -```python -@schema -class Cage(dj.Manual): - definition = """ - cage_id : int32 - --- - location : varchar(50) - """ - -@schema -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - date_of_birth : date - sex : enum('M', 'F') - # Note: NO cage reference here! 
- # Cage is not intrinsic to the mouse - """ - -@schema -class CageAssignment(dj.Manual): - definition = """ - -> Mouse - assignment_date : date - --- - -> Cage - removal_date=null : date - """ - -@schema -class MouseWeight(dj.Manual): - definition = """ - -> Mouse - weigh_date : date - --- - weight : float32 - """ -``` - -This fully normalized design: - -- **Intrinsic attributes only** — `Mouse` contains only attributes determined at creation (birth date, sex) -- **Cage assignment as event** — `CageAssignment` tracks the temporal relationship between mice and cages -- **Single entity per row** — Each mouse is entered once when first tracked -- **Later events separate** — Cage assignments, weight measurements happen after initial tracking -- **History preserved** — Can track cage moves over time without data loss - -## The Workflow Test - -Ask these questions to determine table structure: - -### 1. "Is this an intrinsic attribute of the entity?" - -An intrinsic attribute is inherent to the entity itself and determined when the entity is first created. - -- **Intrinsic:** Mouse's date of birth, sex, genetic strain -- **Not intrinsic:** Mouse's cage (assignment that changes), weight (temporal measurement) - -If not intrinsic → separate table for the relationship or event - -### 2. "At which workflow step is this attribute determined?" - -- If an attribute is determined at a **different step**, it belongs in a **different table** -- If an attribute **changes over time**, it needs its own table with a **temporal key** - -### 3. "Is this a relationship or event?" - -- **Relationships** (cage assignment, group membership) → association table with temporal keys -- **Events** (measurements, observations) → separate table with event date/time -- **States** (approval status, processing stage) → state transition table - -## Common Patterns - -### Lookup Tables - -Store reference data that doesn't change: - -```python -@schema -class Species(dj.Lookup): - definition = """ - species : varchar(50) - --- - common_name : varchar(100) - """ - contents = [ - ('Mus musculus', 'House mouse'), - ('Rattus norvegicus', 'Brown rat'), - ] -``` - -### Parameter Sets - -Store versioned configurations: - -```python -@schema -class AnalysisParams(dj.Lookup): - definition = """ - params_id : int32 - --- - threshold : float32 - window_size : int32 - """ -``` - -### Temporal Tracking - -Track measurements or observations over time: - -```python -@schema -class SubjectWeight(dj.Manual): - definition = """ - -> Subject - weight_date : date - --- - weight : float32 # grams - """ -``` - -### Temporal Associations - -Track relationships or assignments that change over time: - -```python -@schema -class GroupAssignment(dj.Manual): - definition = """ - -> Subject - assignment_date : date - --- - -> ExperimentalGroup - removal_date=null : date - """ - -@schema -class HousingAssignment(dj.Manual): - definition = """ - -> Animal - move_date : date - --- - -> Cage - move_reason : varchar(200) - """ -``` - -**Key pattern:** The relationship itself (subject-to-group, animal-to-cage) is **not intrinsic** to either entity. It's a temporal event that happens during the workflow. - -## Benefits in DataJoint - -1. **Natural from workflow thinking** — Designing around workflow steps - naturally produces normalized schemas - -2. **Cascade deletes** — Normalization + foreign keys enable safe cascade - deletes that maintain consistency - -3. 
**Join efficiency** — Normalized tables with proper keys enable efficient - joins through the workflow graph - -4. **Clear provenance** — Each table represents a distinct workflow step, - making data lineage clear - -## Summary - -**Core principles:** - -1. **Intrinsic attributes only** — Each entity contains only properties inherent to itself -2. **One entity, one entry** — Each entity entered once when first tracked -3. **Events separate** — Relationships, assignments, measurements that happen later belong in separate tables -4. **Workflow steps** — Design tables around the workflow step that creates each entity -5. **Temporal keys** — Relationships and observations that change over time need temporal keys (dates, timestamps) - -**Ask yourself:** - -- Is this attribute intrinsic to the entity? (No → separate table) -- Does this attribute change over time? (Yes → temporal table) -- Is this a relationship or event? (Yes → association/event table) - -Following these principles achieves **full workflow entity normalization** where each table represents a single, well-defined entity type entered at a specific workflow step. - - ---- -## File: explanation/query-algebra.md - -# Query Algebra - -DataJoint provides a powerful query algebra built on five core operators: restriction, join, projection, aggregation, and union. These operators work on **entity sets** (query expressions) and always return entity sets, enabling arbitrary composition. - -## Algebraic Closure - -A fundamental property of DataJoint's query algebra is **algebraic closure**: every query result is itself a valid entity set with a well-defined **entity type** — you always know what kind of entity the result represents, identified by a specific primary key. Unlike SQL where query results are unstructured "bags of rows," DataJoint determines the entity type of each result based on the operator and the functional dependencies between operands. - -This means operators can be chained indefinitely — the output of any operation is a valid input to any other operation. See [Primary Keys](../reference/specs/primary-keys.md) for the precise rules. - -## Core Operators - -```mermaid -graph LR - A[Entity Set] --> R[Restriction &] - A --> J[Join *] - A --> E[Extend .extend] - A --> P[Projection .proj] - A --> G[Aggregation .aggr] - A --> U[Union +] - R --> B[Entity Set] - J --> B - E --> B - P --> B - G --> B - U --> B -``` - -## Restriction (`&` and `-`) - -Filter entities based on conditions. - -### Include (`&`) - -```python -# Mice born after 2024 -Mouse & 'date_of_birth > "2024-01-01"' - -# Sessions for a specific mouse -Session & {'mouse_id': 42} - -# Sessions matching a query -Session & (Mouse & 'strain = "C57BL/6"') -``` - -### Exclude (`-`) - -```python -# Mice NOT in the study -Mouse - StudyMouse - -# Sessions without recordings -Session - Recording -``` - -### Top N (`dj.Top`) - -Select a limited number of entities with ordering: - -```python -# Most recent 10 sessions -Session & dj.Top(10, 'session_date DESC') - -# First session by primary key -Session & dj.Top() -``` - -The `order_by` parameter accepts attribute names with optional `DESC`/`ASC`. The special value `"KEY"` is an alias for all primary key attributes (e.g., `"KEY DESC"` for reverse primary key order). - -## Join (`*`) - -Combine entity sets along shared attributes. 
- -```python -# All session-recording pairs -Session * Recording - -# Chain through workflow -Mouse * Session * Scan * Segmentation -``` - -DataJoint joins are **natural joins** that: - -- Match on attributes with the same name **and** lineage -- Respect declared dependencies (no accidental matches) -- Produce the intersection of matching entities - -### Extend (`.extend()`) - -Add attributes from another entity set while preserving all entities in the original set. - -```python -# Add session info to each trial -Trial.extend(Session) # Adds session_date, subject_id to Trial - -# Add neuron properties to spike times -SpikeTime.extend(Neuron) # Adds cell_type, depth to SpikeTime -``` - -**How it differs from join:** - -- **Join (`*`)**: Returns only matching entities (inner join), primary key is the union of both PKs -- **Extend**: Returns all entities from the left side (left join), primary key stays as the left side's PK - -**Primary key formation:** - -```python -# Join: PK is union of both primary keys -result = Session * Trial -# PK: (session_id, trial_num) - -# Extend: PK stays as left side's PK -result = Trial.extend(Session) -# PK: (session_id, trial_num) - same as Trial -# session_date is added as a non-primary attribute -``` - -**Requirement:** The left side must **determine** the right side. This means all primary key attributes from the right side must exist in the left side. This requirement ensures: - -1. Every entity in the left side can match at most one entity in the right side -2. The left side's primary key uniquely identifies entities in the result -3. No NULL values appear in the result's primary key - -```python -# Valid: Trial determines Session -# (session_id is in Trial's primary key) -Trial.extend(Session) ✓ -# Each trial belongs to exactly one session -# Result PK: (session_id, trial_num) - -# Invalid: Session does NOT determine Trial -# (trial_num is not in Session) -Session.extend(Trial) ✗ # Error: trial_num not in Session -# A session has multiple trials - PK would be ambiguous -``` - -**Why use extend?** - -1. **Preserve all entities**: When you need attributes from a parent but want to keep all children (even orphans) -2. **Clear intent**: Expresses "add attributes" rather than "combine entity sets" -3. **No filtering**: Guarantees the same number of entities in the result - -Think of extend as projection-like: it adds attributes to existing entities without changing which entities are present. - -## Projection (`.proj()`) - -Select and transform attributes. - -### Select attributes - -```python -# Only mouse_id and strain -Mouse.proj('strain') - -# Rename attributes -Mouse.proj(animal_id='mouse_id') -``` - -### Compute new attributes - -```python -# Calculate age -Mouse.proj( - age='DATEDIFF(CURDATE(), date_of_birth)' -) - -# Combine attributes -Session.proj( - session_label='CONCAT(subject_id, "-", session_date)' -) -``` - -### Aggregate in projection - -```python -# Count recordings per session -Session.aggr(Recording, n_recordings='COUNT(*)') -``` - -## Aggregation (`.aggr()`) - -Summarize across groups. - -```python -# Average spike rate per neuron -Neuron.aggr( - SpikeTime, - avg_rate='AVG(spike_rate)', - total_spikes='COUNT(*)' -) - -# Statistics per session -Session.aggr( - Trial, - n_trials='COUNT(*)', - success_rate='AVG(success)' -) -``` - -## Union (`+`) - -Combine entity sets with the same attributes. 
- -```python -# All subjects from two studies -StudyA_Subjects + StudyB_Subjects - -# Combine results from different analyses -AnalysisV1 + AnalysisV2 -``` - -Requirements: - -- Same primary key structure -- Compatible attribute types - -## Operator Composition - -Operators compose freely: - -```python -# Complex query -result = ( - (Mouse & 'strain = "C57BL/6"') # Filter mice - * Session # Join sessions - * Scan # Join scans - .proj('scan_date', 'depth') # Select attributes - & 'depth > 200' # Filter by depth -) -``` - -## Workflow-Aware Joins - -Unlike SQL's natural joins that match on **any** shared column name, DataJoint -joins match on **semantic lineage**. Two attributes match only if they: - -1. Have the same name -2. Trace back to the same source definition - -This prevents accidental joins on coincidentally-named columns. - -## Fetching Results - -Query expressions are lazy—they build SQL but don't execute until you fetch: - -```python -# Fetch as NumPy recarray -data = query.to_arrays() - -# Fetch as list of dicts -data = query.to_dicts() - -# Fetch as pandas DataFrame -df = query.to_pandas() - -# Fetch specific attributes -ids, dates = query.to_arrays('mouse_id', 'session_date') - -# Fetch single row -row = (query & key).fetch1() -``` - -## Summary - -| Operator | Symbol/Method | Purpose | -|----------|---------------|---------| -| Restriction | `&`, `-` | Filter entities | -| Join | `*` | Combine entity sets (inner join) | -| Extend | `.extend()` | Add attributes (left join) | -| Projection | `.proj()` | Select/transform attributes | -| Aggregation | `.aggr()` | Summarize groups | -| Union | `+` | Combine parallel sets | - -These core operators, combined with workflow-aware join semantics, provide -complete query capability for scientific data pipelines. - - ---- -## File: explanation/relational-workflow-model.md - -# The Relational Workflow Model - -DataJoint implements the **Relational Workflow Model**—a paradigm that extends -relational databases with native support for computational workflows. This model -defines a new class of databases called **Computational Databases**, where -computational transformations are first-class citizens of the data model. - -These concepts, along with DataJoint's schema definition language and query algebra, -were first formalized in [Yatsenko et al., 2018](https://doi.org/10.48550/arXiv.1807.11104). - -## The Problem with Traditional Approaches - -Traditional relational databases excel at storing and querying data but struggle -with computational workflows. They can store inputs and outputs, but: - -- The database doesn't understand that outputs were *computed from* inputs -- It doesn't automatically recompute when inputs change -- It doesn't track provenance - -**DataJoint solves these problems by treating your database schema as an -executable workflow specification.** - -## Three Paradigms Compared - -The relational data model has been interpreted through different conceptual -frameworks, each with distinct strengths and limitations: - -| Aspect | Mathematical (Codd) | Entity-Relationship (Chen) | **Relational Workflow (DataJoint)** | -|--------|---------------------|----------------------------|-------------------------------------| -| **Core Question** | "What functional dependencies exist?" | "What entity types exist?" 
| **"When/how are entities created?"** | -| **Time Dimension** | Not addressed | Not central | **Fundamental** | -| **Implementation Gap** | High (abstract to SQL) | High (ERM to SQL) | **None (unified approach)** | -| **Workflow Support** | None | None | **Native workflow modeling** | - -### Codd's Mathematical Foundation - -Edgar F. Codd's original relational model is rooted in predicate calculus and -set theory. Tables represent logical predicates; rows assert true propositions. -While mathematically rigorous, this approach requires abstract reasoning that -doesn't map to intuitive domain thinking. - -### Chen's Entity-Relationship Model - -Peter Chen's Entity-Relationship Model (ERM) shifted focus to concrete domain -modeling—entities and relationships visualized in diagrams. However, ERM: - -- Creates a gap between conceptual design and SQL implementation -- Lacks temporal dimension ("when" entities are created) -- Treats relationships as static connections, not dynamic processes - -## The Relational Workflow Model - -The Relational Workflow Model introduces four fundamental concepts: - -### 1. Workflow Entities - -Unlike traditional entities that exist independently, **workflow entities** are -artifacts of workflow execution—they represent the products of specific -operations. This temporal dimension allows us to understand not just *what* -exists, but *when* and *how* it came to exist. - -### 2. Workflow Dependencies - -**Workflow dependencies** extend foreign keys with operational semantics. They -don't just ensure referential integrity—they prescribe the order of operations. -Parent entities must be created before child entities. - -```mermaid -graph LR - A[Session] --> B[Scan] - B --> C[Segmentation] - C --> D[Analysis] -``` - -### 3. Workflow Steps (Table Tiers) - -Each table represents a distinct **workflow step** with a specific role: - -```mermaid -graph TD - subgraph "Lookup (Gray)" - L[Parameters] - end - subgraph "Manual (Green)" - M[Subject] - S[Session] - end - subgraph "Imported (Blue)" - I[Recording] - end - subgraph "Computed (Red)" - C[Analysis] - end - - L --> C - M --> S - S --> I - I --> C -``` - -| Tier | Role | Examples | -|------|------|----------| -| **Lookup** | Reference data, parameters | Species, analysis methods | -| **Manual** | Human-entered observations | Subjects, sessions | -| **Imported** | Automated data acquisition | Recordings, images | -| **Computed** | Derived results | Analyses, statistics | - -### 4. Directed Acyclic Graph (DAG) - -The schema forms a **DAG** that: - -- Prohibits circular dependencies -- Ensures valid execution sequences -- Enables efficient parallel execution -- Supports resumable computation - -## The Workflow Normalization Principle - -> **"Every table represents an entity type that is created at a specific step -> in a workflow, and all attributes describe that entity as it exists at that -> workflow step."** - -This principle extends entity normalization with temporal and operational -dimensions. - -## Why This Matters - -### Unified Design and Implementation - -Unlike the ERM-SQL gap, DataJoint provides unified: - -- **Diagramming** — Schema diagrams reflect actual structure -- **Definition** — Table definitions are executable code -- **Querying** — Operators understand workflow semantics - -No translation needed between conceptual design and implementation. 
- -### Temporal and Operational Awareness - -The model captures the dynamic nature of workflows: - -- Data processing sequences -- Computational dependencies -- Operation ordering - -### Immutability and Provenance - -Workflow artifacts are immutable once created: - -- Preserves execution history -- Maintains data provenance -- Enables reproducible science - -When you delete upstream data, dependent results cascade-delete automatically. -To correct errors, you delete, reinsert, and recompute—ensuring every result -represents a consistent computation from valid inputs. - -### Workflow Integrity - -The DAG structure guarantees: - -- No circular dependencies -- Valid operation sequences -- Enforced temporal order -- Computational validity - -## Query Algebra with Workflow Semantics - -DataJoint's five operators provide a complete query algebra: - -| Operator | Symbol | Purpose | -|----------|--------|---------| -| **Restriction** | `&` | Filter entities | -| **Join** | `*` | Combine from converging paths | -| **Projection** | `.proj()` | Select/compute attributes | -| **Aggregation** | `.aggr()` | Summarize groups | -| **Union** | `+` | Combine parallel branches | - -These operators: - -- Take entity sets as input, produce entity sets as output -- Preserve entity integrity -- Respect declared dependencies (no ambiguous joins) - -## From Transactions to Transformations - -The Relational Workflow Model represents a conceptual shift: - -| Traditional View | Workflow View | -|------------------|---------------| -| Tables store data | Entity sets are workflow steps | -| Rows are records | Entities are execution instances | -| Foreign keys enforce consistency | Dependencies specify information flow | -| Updates modify state | Computations create new states | -| Schemas organize storage | Schemas specify pipelines | -| Queries retrieve data | Queries trace provenance | - -This makes DataJoint feel less like a traditional database and more like a -**workflow engine with persistent state**—one that maintains computational -validity while supporting scientific flexibility. - -## Summary - -The Relational Workflow Model: - -1. **Extends** relational theory (doesn't replace it) -2. **Adds** temporal and operational semantics -3. **Eliminates** the design-implementation gap -4. **Enables** reproducible computational workflows -5. **Maintains** mathematical rigor - -It's not a departure from relational databases—it's their evolution for -computational workflows. - - ---- -## File: explanation/type-system.md - -# Type System - -DataJoint's type system provides a three-layer architecture that balances -database efficiency with Python convenience. - -## Three-Layer Architecture - -```mermaid -graph TB - subgraph "Layer 3: Codecs" - blob["‹blob›"] - attach["‹attach›"] - object["‹object@›"] - hash["‹hash@›"] - custom["‹custom›"] - end - subgraph "Layer 2: Core Types" - int32 - float64 - varchar - json - bytes - end - subgraph "Layer 1: Native" - INT["INT"] - DOUBLE["DOUBLE"] - VARCHAR["VARCHAR"] - JSON_N["JSON"] - BLOB["LONGBLOB"] - end - - blob --> bytes - attach --> bytes - object --> json - hash --> json - bytes --> BLOB - json --> JSON_N - int32 --> INT - float64 --> DOUBLE - varchar --> VARCHAR -``` - -## Layer 1: Native Database Types - -Backend-specific types (MySQL, PostgreSQL). 
**Discouraged for direct use.**

```python
# Native types (avoid)
column : TINYINT UNSIGNED
column : MEDIUMBLOB
```

## Layer 2: Core DataJoint Types

Standardized, scientist-friendly types that work identically across backends.

### Numeric Types

| Type | Description | Range |
|------|-------------|-------|
| `int8` | 8-bit signed | -128 to 127 |
| `int16` | 16-bit signed | -32,768 to 32,767 |
| `int32` | 32-bit signed | ±2 billion |
| `int64` | 64-bit signed | ±9 quintillion |
| `uint8` | 8-bit unsigned | 0 to 255 |
| `uint16` | 16-bit unsigned | 0 to 65,535 |
| `uint32` | 32-bit unsigned | 0 to 4 billion |
| `uint64` | 64-bit unsigned | 0 to 18 quintillion |
| `float32` | 32-bit float | ~7 significant digits |
| `float64` | 64-bit float | ~15 significant digits |
| `decimal(n,f)` | Fixed-point | Exact decimal |

### String Types

| Type | Description |
|------|-------------|
| `char(n)` | Fixed-length string |
| `varchar(n)` | Variable-length string |
| `enum(...)` | Enumeration of string labels |

### Other Types

| Type | Description |
|------|-------------|
| `bool` | True/False |
| `date` | Date only |
| `datetime` | Date and time (UTC) |
| `json` | JSON document |
| `uuid` | Universally unique identifier |
| `bytes` | Raw binary |

## Layer 3: Codec Types

Codecs provide `encode()`/`decode()` semantics for complex Python objects.

### Syntax

- **Angle brackets**: `<blob>`, `<attach>`, `<object@>`
- **`@` indicates external storage**: `<blob@>` stores externally
- **Store name**: `<blob@cold>` uses named store "cold"

### Built-in Codecs

| Codec | Internal | External | Returns |
|-------|----------|----------|---------|
| `<blob>` | ✅ | ✅ `<blob@>` | Python object |
| `<attach>` | ✅ | ✅ `<attach@>` | Local file path |
| `<object@>` | ❌ | ✅ | ObjectRef |
| `<hash@>` | ❌ | ✅ | bytes |
| `` | ❌ | ✅ | ObjectRef |

### `<blob>` — Serialized Python Objects

Stores NumPy arrays, dicts, lists, and other Python objects.

```python
class Results(dj.Computed):
    definition = """
    -> Analysis
    ---
    spike_times : <blob>           # In database
    waveforms : <blob@>            # External, default store
    raw_data : <blob@archive>      # External, 'archive' store
    """
```

### `<attach>` — File Attachments

Stores files with filename preserved.

```python
class Config(dj.Manual):
    definition = """
    config_id : int
    ---
    settings : <attach>            # Small config file
    data_file : <attach@>          # Large file, external
    """
```

### `<object@>` — Path-Addressed Storage

For large/complex file structures (Zarr, HDF5). Path derived from primary key.

```python
class ProcessedData(dj.Computed):
    definition = """
    -> Recording
    ---
    zarr_data : <object@>          # Stored at {schema}/{table}/{pk}/
    """
```

### `` — Portable References

References to externally-managed files with portable paths.
- -```python -class RawData(dj.Manual): - definition = """ - session_id : int - --- - recording : # Relative to 'raw' store - """ -``` - -## Storage Modes - -| Mode | Database Storage | External Storage | Use Case | -|------|------------------|------------------|----------| -| Internal | Yes | No | Small data | -| External | Metadata only | Yes | Large data | -| Hash-addressed | Metadata only | Deduplicated | Repeated data | -| Path-addressed | Metadata only | PK-based path | Complex files | - -## Custom Codecs - -Extend the type system for domain-specific data: - -```python -class GraphCodec(dj.Codec): - """Store NetworkX graphs.""" - name = "graph" - - def get_dtype(self, is_external): - return "" - - def encode(self, graph, *, key=None, store_name=None): - return { - 'nodes': list(graph.nodes()), - 'edges': list(graph.edges()) - } - - def decode(self, stored, *, key=None): - import networkx as nx - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G -``` - -Usage: - -```python -class Network(dj.Computed): - definition = """ - -> Analysis - --- - connectivity : - """ -``` - -## Choosing Types - -| Data | Recommended Type | -|------|------------------| -| Small scalars | Core types (`int32`, `float64`) | -| Short strings | `varchar(n)` | -| NumPy arrays (small) | `` | -| NumPy arrays (large) | `` | -| Files to attach | `` or `` | -| Zarr/HDF5 | `` | -| External file refs | `` | -| Custom objects | Custom codec | - -## Summary - -1. **Core types** for simple data — `int32`, `varchar`, `datetime` -2. **``** for Python objects — NumPy arrays, dicts -3. **`@` suffix** for external storage — ``, `` -4. **Custom codecs** for domain-specific types - - ---- -## File: explanation/whats-new-2.md - -# What's New in DataJoint 2.0 - -DataJoint 2.0 is a major release that establishes DataJoint as a mature framework for scientific data pipelines. The version jump from 0.14 to 2.0 reflects the significance of these changes. - -> **📘 Upgrading from legacy DataJoint (pre-2.0)?** -> -> This page summarizes new features and concepts. For step-by-step migration instructions, see the **[Migration Guide](../how-to/migrate-to-v20.md)**. - -## Overview - -DataJoint 2.0 introduces fundamental improvements to type handling, job coordination, and object storage while maintaining compatibility with your existing pipelines during migration. Key themes: - -- **Explicit over implicit**: All type conversions are now explicit through the codec system -- **Better distributed computing**: Per-table job coordination with improved error handling -- **Object storage integration**: Native support for large arrays and files -- **Future-proof architecture**: Portable types preparing for PostgreSQL backend support - -### Breaking Changes at a Glance - -If you're upgrading from legacy DataJoint, these changes require code updates: - -| Area | Legacy | 2.0 | -|------|--------|-----| -| **Fetch API** | `table.fetch()` | `table.to_dicts()` or `.to_arrays()` | -| **Update** | `(table & key)._update('attr', val)` | `table.update1({**key, 'attr': val})` | -| **Join** | `table1 @ table2` | `table1 * table2` (with semantic check) | -| **Type syntax** | `longblob`, `int unsigned` | ``, `uint32` | -| **Jobs** | `~jobs` table | Per-table `~~table_name` | - -See the [Migration Guide](../how-to/migrate-to-v20.md) for complete upgrade steps. - -## Object-Augmented Schema (OAS) - -DataJoint 2.0 unifies relational tables with object storage into a single coherent system. 
The relational database stores metadata and references while large objects (arrays, files, Zarr datasets) are stored in object storage—with full referential integrity maintained across both layers.

→ [Type System Specification](../reference/specs/type-system.md)

**Three storage sections:**

| Section | Addressing | Use Case |
|---------|------------|----------|
| **Internal** | Row-based (in database) | Small objects (< 1 MB) |
| **Hash-addressed** | Content hash | Arrays, files (deduplication) |
| **Path-addressed** | Primary key path | Zarr, HDF5, streaming access |

**New syntax:**

```python
definition = """
recording_id : uuid
---
metadata : <blob>            # Internal storage
raw_data : <blob@store>      # Hash-addressed object storage
zarr_array : <object@store>  # Path-addressed for Zarr/HDF5
"""
```

## Explicit Type System

**Breaking change**: DataJoint 2.0 makes all type conversions explicit through a three-tier architecture.

→ [Type System Specification](../reference/specs/type-system.md) · [Codec API Specification](../reference/specs/codec-api.md)

### What Changed

Legacy DataJoint overloaded MySQL types with implicit conversions:

- `longblob` could be blob serialization OR in-table attachment
- `attach` was implicitly converted to longblob
- `uuid` was used internally for external storage

**DataJoint 2.0 makes everything explicit:**

| Legacy (Implicit) | 2.0 (Explicit) |
|-------------------|----------------|
| `longblob` | `<blob>` |
| `attach` | `<attach>` |
| `blob@store` | `<blob@store>` |
| `int unsigned` | `uint32` |

### Three-Tier Architecture

1. **Native types**: MySQL types (`INT`, `VARCHAR`, `LONGBLOB`)
2. **Core types**: Portable aliases (`int32`, `float64`, `varchar`, `uuid`, `json`)
3. **Codecs**: Serialization for Python objects (`<blob>`, `<attach>`, `<object@store>`)

### Custom Codecs

Replace legacy AdaptedTypes with the new codec API:

```python
class GraphCodec(dj.Codec):
    name = "graph"

    def encode(self, value, **kwargs):
        return list(value.edges)

    def decode(self, stored, **kwargs):
        import networkx as nx
        return nx.Graph(stored)
```

## Jobs 2.0

**Breaking change**: Redesigned job coordination with per-table job management.

→ [AutoPopulate Specification](../reference/specs/autopopulate.md) · [Job Metadata Specification](../reference/specs/job-metadata.md)

### What Changed

| Legacy (Schema-level) | 2.0 (Per-table) |
|----------------------|-----------------|
| One `~jobs` table per schema | One `~~table_name` per Computed/Imported table |
| Opaque hashed keys | Native primary keys (readable) |
| Statuses: `reserved`, `error`, `ignore` | Added: `pending`, `success` |
| No priority support | Priority column (lower = more urgent) |

### New Features

- **Automatic refresh**: Job queue synchronized with pending work automatically
- **Better coordination**: Multiple workers coordinate via database without conflicts
- **Error tracking**: Built-in error table (`Table.jobs.errors`) with full stack traces
- **Priority support**: Control computation order with priority values

```python
# Distributed mode with coordination
Analysis.populate(reserve_jobs=True, processes=4)

# Monitor progress
Analysis.jobs.progress()  # {'pending': 10, 'reserved': 2, 'error': 0}

# Handle errors
Analysis.jobs.errors.to_dicts()

# Set priorities
Analysis.jobs.update({'session_id': 123}, priority=1)  # High priority
```

## Semantic Matching

**Breaking change**: Query operations now use **lineage-based matching** by default.
- -→ [Semantic Matching Specification](../reference/specs/semantic-matching.md) - -### What Changed - -Legacy DataJoint used SQL-style natural joins: attributes matched if they had the same name, regardless of meaning. - -**DataJoint 2.0 validates semantic lineage**: Attributes must share common origin through foreign key chains, not just coincidentally matching names. - -```python -# 2.0: Semantic join (default) - validates lineage -result = TableA * TableB # Only matches attributes with shared origin - -# Legacy behavior (if needed) -result = TableA.join(TableB, semantic_check=False) -``` - -**Why this matters**: Prevents accidental matches between attributes like `session_id` that happen to share a name but refer to different entities in different parts of your schema. - -**During migration**: If semantic matching fails, it often indicates a malformed join that should be reviewed rather than forced. - -## Configuration System - -A cleaner configuration approach with separation of concerns. - -→ [Configuration Reference](../reference/configuration.md) - -- **`datajoint.json`**: Non-sensitive settings (commit to version control) -- **`.secrets/`**: Credentials (never commit) -- **Environment variables**: For CI/CD and production - -```bash -export DJ_HOST=db.example.com -export DJ_USER=myuser -export DJ_PASS=mypassword -``` - -## ObjectRef API (New) - -**New feature**: Path-addressed storage returns `ObjectRef` handles that support streaming access without downloading entire datasets. - -```python -ref = (Dataset & key).fetch1('zarr_array') - -# Direct fsspec access for Zarr/xarray -z = zarr.open(ref.fsmap, mode='r') - -# Or download locally -local_path = ref.download('/tmp/data') - -# Stream chunks without full download -with ref.open('rb') as f: - chunk = f.read(1024) -``` - -This enables efficient access to large datasets stored in Zarr, HDF5, or custom formats. - -## Deprecated and Removed - -### Removed APIs - -- **`.fetch()` method**: Replaced with `.to_dicts()`, `.to_arrays()`, or `.to_pandas()` -- **`._update()` method**: Replaced with `.update1()` -- **`@` operator (natural join)**: Use `*` with semantic matching or `.join(semantic_check=False)` -- **`dj.U() * table` pattern**: Use just `table` (universal set is implicit) - -### Deprecated Features - -- **AdaptedTypes**: Replaced by codec system (still works but migration recommended) -- **Native type syntax**: `int unsigned` → `uint32` (warnings on new tables) -- **Legacy external storage** (`blob@store`): Replaced by `` codec syntax - -### Legacy Support - -During migration (Phases 1-3), both legacy and 2.0 APIs can coexist: -- Legacy clients can still access data -- 2.0 clients understand legacy column types -- Dual attributes enable cross-testing - -After finalization (Phase 4+), only 2.0 clients are supported. - -## License Change - -DataJoint 2.0 is licensed under the **Apache License 2.0** (previously LGPL-2.1). This provides: -- More permissive for commercial and academic use -- Clearer patent grant provisions -- Better compatibility with broader ecosystem - -## Migration Path - -→ **[Complete Migration Guide](../how-to/migrate-to-v20.md)** - -Upgrading from DataJoint 0.x is a **phased process** designed to minimize risk: - -### Phase 1: Code Updates (Reversible) -- Update Python code to 2.0 API patterns (`.fetch()` → `.to_dicts()`, etc.) 
-- Update configuration files (`dj_local_conf.json` → `datajoint.json` + `.secrets/`) -- **No database changes** — legacy clients still work - -### Phase 2: Type Migration (Reversible) -- Update database column comments to use core types (`:uint32:`, `::`) -- Rebuild `~lineage` tables for semantic matching -- Update Python table definitions -- **Legacy clients still work** — only metadata changed - -### Phase 3: External Storage Dual Attributes (Reversible) -- Create `*_v2` attributes alongside legacy external storage columns -- Both APIs can access data during transition -- Enables cross-testing between legacy and 2.0 -- **Legacy clients still work** - -### Phase 4: Finalize (Point of No Return) -- Remove legacy external storage columns -- Drop old `~jobs` and `~external_*` tables -- **Legacy clients stop working** — database backup required - -### Phase 5: Adopt New Features (Optional) -- Use new codecs (``, ``) -- Leverage Jobs 2.0 features (priority, better errors) -- Implement custom codecs for domain-specific types - -### Migration Support - -The migration guide includes: -- **AI agent prompts** for automated migration steps -- **Validation commands** to check migration status -- **Rollback procedures** for each phase -- **Dry-run modes** for all database changes - -Most users complete Phases 1-2 in a single session. Phases 3-4 only apply if you use legacy external storage. - -## See Also - -### Migration -- **[Migration Guide](../how-to/migrate-to-v20.md)** — Complete upgrade instructions -- [Configuration](../how-to/configure-database.md) — Setup new configuration system - -### Core Concepts -- [Type System](type-system.md) — Understand the three-tier type architecture -- [Computation Model](computation-model.md) — Jobs 2.0 and AutoPopulate -- [Query Algebra](query-algebra.md) — Semantic matching and operators - -### Getting Started -- [Installation](../how-to/installation.md) — Install DataJoint 2.0 -- [Tutorials](../tutorials/index.md) — Learn by example - -### Reference -- [Type System Specification](../reference/specs/type-system.md) — Complete type system details -- [Codec API](../reference/specs/codec-api.md) — Build custom codecs -- [AutoPopulate Specification](../reference/specs/autopopulate.md) — Jobs 2.0 reference - - -============================================================ -# Tutorials -============================================================ - - ---- -## File: tutorials/advanced/custom-codecs.ipynb - -# Custom Codecs - -This tutorial covers extending DataJoint's type system. 
You'll learn:

- **Codec basics** — Encoding and decoding
- **Creating codecs** — Domain-specific types
- **Codec chaining** — Composing codecs


```python
import datajoint as dj
import numpy as np

schema = dj.Schema('tutorial_codecs')
```


## Creating a Custom Codec


```python
import networkx as nx

class GraphCodec(dj.Codec):
    """Store NetworkX graphs."""

    name = "graph"  # Use as <graph>

    def get_dtype(self, is_store: bool) -> str:
        return "<blob>"

    def encode(self, value, *, key=None, store_name=None):
        return {'nodes': list(value.nodes(data=True)), 'edges': list(value.edges(data=True))}

    def decode(self, stored, *, key=None):
        g = nx.Graph()
        g.add_nodes_from(stored['nodes'])
        g.add_edges_from(stored['edges'])
        return g

    def validate(self, value):
        if not isinstance(value, nx.Graph):
            raise TypeError(f"Expected nx.Graph, got {type(value).__name__}")
```


```python
@schema
class Connectivity(dj.Manual):
    definition = """
    conn_id : int
    ---
    network : <graph>
    """
```


```python
# Create and insert
g = nx.Graph()
g.add_edges_from([(1, 2), (2, 3), (1, 3)])
Connectivity.insert1({'conn_id': 1, 'network': g})

# Fetch
result = (Connectivity & {'conn_id': 1}).fetch1('network')
print(f"Type: {type(result)}")
print(f"Edges: {list(result.edges())}")
```


## Codec Structure

```python
class MyCodec(dj.Codec):
    name = "mytype"  # Use as <mytype>

    def get_dtype(self, is_store: bool) -> str:
        return "<blob>"  # Storage type

    def encode(self, value, *, key=None, store_name=None):
        return serializable_data

    def decode(self, stored, *, key=None):
        return python_object

    def validate(self, value):  # Optional
        pass
```

## Example: Spike Train


```python
from dataclasses import dataclass

@dataclass
class SpikeTrain:
    times: np.ndarray
    unit_id: int
    quality: str

class SpikeTrainCodec(dj.Codec):
    name = "spike_train"

    def get_dtype(self, is_store: bool) -> str:
        return "<blob>"

    def encode(self, value, *, key=None, store_name=None):
        return {'times': value.times, 'unit_id': value.unit_id, 'quality': value.quality}

    def decode(self, stored, *, key=None):
        return SpikeTrain(times=stored['times'], unit_id=stored['unit_id'], quality=stored['quality'])
```


```python
@schema
class Unit(dj.Manual):
    definition = """
    unit_id : int
    ---
    spikes : <spike_train>
    """

train = SpikeTrain(times=np.sort(np.random.uniform(0, 100, 50)), unit_id=1, quality='good')
Unit.insert1({'unit_id': 1, 'spikes': train})

result = (Unit & {'unit_id': 1}).fetch1('spikes')
print(f"Type: {type(result)}, Spikes: {len(result.times)}")
```


```python
schema.drop(prompt=False)
```


---
## File: tutorials/advanced/distributed.ipynb

# Distributed Computing

This tutorial covers running computations across multiple workers.
You'll learn: - -- **Jobs 2.0** — DataJoint's job coordination system -- **Multi-process** — Parallel workers on one machine -- **Multi-machine** — Cluster-scale computation -- **Error handling** — Recovery and monitoring - - -```python -import datajoint as dj -import numpy as np -import time - -schema = dj.Schema('tutorial_distributed') - -# Clean up from previous runs -schema.drop(prompt=False) -schema = dj.Schema('tutorial_distributed') -``` - - -## Setup - - -```python -@schema -class Experiment(dj.Manual): - definition = """ - exp_id : int - --- - n_samples : int - """ - -@schema -class Analysis(dj.Computed): - definition = """ - -> Experiment - --- - result : float64 - compute_time : float32 - """ - - def make(self, key): - start = time.time() - n = (Experiment & key).fetch1('n_samples') - result = float(np.mean(np.random.randn(n) ** 2)) - time.sleep(0.1) - self.insert1({**key, 'result': result, 'compute_time': time.time() - start}) -``` - - - -```python -Experiment.insert([{'exp_id': i, 'n_samples': 10000} for i in range(20)]) -print(f"To compute: {len(Analysis.key_source - Analysis)}") -``` - - -## Direct vs Distributed Mode - -**Direct mode** (default): No coordination, suitable for single worker. - -**Distributed mode** (`reserve_jobs=True`): Workers coordinate via jobs table. - - -```python -# Distributed mode -Analysis.populate(reserve_jobs=True, max_calls=5, display_progress=True) -``` - - -## The Jobs Table - - -```python -# Refresh job queue -result = Analysis.jobs.refresh() -print(f"Added: {result['added']}") - -# Check status -for status, count in Analysis.jobs.progress().items(): - print(f"{status}: {count}") -``` - - -## Multi-Process and Multi-Machine - -The `processes=N` parameter spawns multiple worker processes on one machine. However, this requires table classes to be defined in importable Python modules (not notebooks), because multiprocessing needs to pickle and transfer the class definitions to worker processes. - -For production use, define your tables in a module and run workers as scripts: - -```python -# pipeline.py - Define your tables -import datajoint as dj -schema = dj.Schema('my_pipeline') - -@schema -class Analysis(dj.Computed): - definition = """...""" - def make(self, key): ... 
-``` - -```python -# worker.py - Run workers -from pipeline import Analysis - -# Single machine, 4 processes -Analysis.populate(reserve_jobs=True, processes=4) - -# Or run this script on multiple machines -while True: - result = Analysis.populate(reserve_jobs=True, max_calls=100, suppress_errors=True) - if result['success_count'] == 0: - break -``` - -In this notebook, we'll demonstrate distributed coordination with a single process: - - -```python -# Complete remaining jobs with distributed coordination -Analysis.populate(reserve_jobs=True, display_progress=True) -print(f"Computed: {len(Analysis())}") -``` - - -## Error Handling - - -```python -# View errors -print(f"Errors: {len(Analysis.jobs.errors)}") - -# Retry failed jobs -Analysis.jobs.errors.delete() -Analysis.populate(reserve_jobs=True, suppress_errors=True) -``` - - -## Quick Reference - -| Option | Description | -|--------|-------------| -| `reserve_jobs=True` | Enable coordination | -| `processes=N` | N worker processes | -| `max_calls=N` | Limit jobs per run | -| `suppress_errors=True` | Continue on errors | - - -```python -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/advanced/json-type.ipynb - -# JSON Data Type - -This tutorial covers the `json` data type in DataJoint, which allows storing semi-structured data within tables. You'll learn: - -- When to use the JSON type -- Defining tables with JSON attributes -- Inserting JSON data -- Querying and filtering JSON fields -- Projecting JSON subfields - -## Prerequisites - -- **MySQL 8.0+** with `JSON_VALUE` function support -- Percona is fully compatible -- MariaDB is **not supported** (different `JSON_VALUE` syntax) - - -```python -import datajoint as dj -``` - - -## Table Definition - -For this exercise, let's imagine we work for an awesome company that is organizing a fun RC car race across various teams in the company. Let's see which team has the fastest car! 🏎️ - -This establishes 2 important entities: a `Team` and a `Car`. Normally the entities are mapped to their own dedicated table, however, let's assume that `Team` is well-structured but `Car` is less structured than we'd prefer. In other words, the structure for what makes up a *car* is varying too much between entries (perhaps because users of the pipeline haven't agreed yet on the definition? 🤷). - -This would make it a good use-case to keep `Team` as a table but make `Car` a `json` type defined within the `Team` table. - -Let's begin. - - -```python -import datajoint as dj - -# Clean up any existing schema from previous runs -schema = dj.Schema('tutorial_json', create_tables=False) -schema.drop() - -# Create fresh schema -schema = dj.Schema('tutorial_json') -``` - - - -```python -@schema -class Team(dj.Lookup): - definition = """ - # A team within a company - name: varchar(40) # team name - --- - car=null: json # A car belonging to a team (null to allow registering first but specifying car later) - - unique index(car.length:decimal(4, 1)) # Add an index if this key is frequently accessed - """ -``` - - -## Insert - -Let's suppose that engineering is first up to register their car. - - -```python -Team.insert1( - { - "name": "engineering", - "car": { - "name": "Rever", - "length": 20.5, - "inspected": True, - "tire_pressure": [32, 31, 33, 34], - "headlights": [ - { - "side": "left", - "hyper_white": None, - }, - { - "side": "right", - "hyper_white": None, - }, - ], - }, - } -) -``` - - -Next, business and marketing teams are up and register their cars. 
- -A few points to notice below: -- The person signing up on behalf of marketing does not know the specifics of the car during registration but another team member will be updating this soon before the race. -- Notice how the `business` and `engineering` teams appear to specify the same property but refer to it as `safety_inspected` and `inspected` respectfully. - - -```python -Team.insert( - [ - { - "name": "marketing", - "car": None, - }, - { - "name": "business", - "car": { - "name": "Chaching", - "length": 100, - "safety_inspected": False, - "tire_pressure": [34, 30, 27, 32], - "headlights": [ - { - "side": "left", - "hyper_white": True, - }, - { - "side": "right", - "hyper_white": True, - }, - ], - }, - }, - ] -) -``` - - -We can preview the table data much like normal but notice how the value of `car` behaves like other BLOB-like attributes. - - -```python -Team() -``` - - -## Restriction - -Now let's see what kinds of queries we can form to demonstrate how we can query this pipeline. - - -```python -# Which team has a `car` equal to 100 inches long? -Team & {"car.length": 100} -``` - - - -```python -# Which team has a `car` less than 50 inches long? -Team & "car->>'$.length' < 50" -``` - - - -```python -# Any team that has had their car inspected? -Team & [{"car.inspected:unsigned": True}, {"car.safety_inspected:unsigned": True}] -``` - - - -```python -# Which teams do not have hyper white lights for their first head light? -Team & {"car.headlights[0].hyper_white": None} -``` - - -Notice that the previous query will satisfy the `None` check if it experiences any of the following scenarios: -- if entire record missing (`marketing` satisfies this) -- JSON key is missing -- JSON value is set to JSON `null` (`engineering` satisfies this) - -## Projection - -Projections can be quite useful with the `json` type since we can extract out just what we need. This allows greater query flexibility but more importantly, for us to be able to fetch only what is pertinent. - - -```python -# Only interested in the car names and the length but let the type be inferred -q_untyped = Team.proj( - car_name="car.name", - car_length="car.length", -) -q_untyped -``` - - - -```python -q_untyped.to_dicts() -``` - - - -```python -# Nevermind, I'll specify the type explicitly -q_typed = Team.proj( - car_name="car.name", - car_length="car.length:float", -) -q_typed -``` - - - -```python -q_typed.to_dicts() -``` - - -## Describe - -Lastly, the `.describe()` function on the `Team` table can help us generate the table's definition. This is useful if we are connected directly to the pipeline without the original source. - - -```python -rebuilt_definition = Team.describe() -print(rebuilt_definition) -``` - - -## Cleanup - -Finally, let's clean up what we created in this tutorial. - - -```python -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/advanced/sql-comparison.ipynb - -# DataJoint for SQL Users - -This tutorial maps SQL concepts to DataJoint for users with relational database experience. You'll see: - -- How DataJoint syntax corresponds to SQL -- What DataJoint adds beyond standard SQL -- When to use each approach - -**Prerequisites:** Familiarity with SQL (SELECT, JOIN, WHERE, GROUP BY). 
- -## Setup - - -```python -import datajoint as dj - -schema = dj.Schema('tutorial_sql_comparison') -``` - - -## Schema Definition - -### SQL -```sql -CREATE TABLE Researcher ( - researcher_id INT NOT NULL, - name VARCHAR(100) NOT NULL, - email VARCHAR(100), - PRIMARY KEY (researcher_id) -); - -CREATE TABLE Subject ( - subject_id INT NOT NULL, - species VARCHAR(32) NOT NULL, - sex ENUM('M', 'F', 'unknown'), - PRIMARY KEY (subject_id) -); - -CREATE TABLE Session ( - subject_id INT NOT NULL, - session_date DATE NOT NULL, - researcher_id INT NOT NULL, - notes VARCHAR(255), - PRIMARY KEY (subject_id, session_date), - FOREIGN KEY (subject_id) REFERENCES Subject(subject_id), - FOREIGN KEY (researcher_id) REFERENCES Researcher(researcher_id) -); -``` - -### DataJoint - - -```python -@schema -class Researcher(dj.Manual): - definition = """ - researcher_id : int32 - --- - name : varchar(100) - email : varchar(100) - """ - -@schema -class Subject(dj.Manual): - definition = """ - subject_id : int32 - --- - species : varchar(32) - sex : enum('M', 'F', 'unknown') - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_date : date - --- - -> Researcher - notes : varchar(255) - """ -``` - - -### Key Differences - -| Aspect | SQL | DataJoint | -|--------|-----|----------| -| Primary key | `PRIMARY KEY (...)` | Above `---` line | -| Foreign key | `FOREIGN KEY ... REFERENCES` | `-> TableName` | -| Types | `INT`, `VARCHAR(n)` | `int32`, `varchar(n)` | -| Table metadata | None | Table tier (`Manual`, `Computed`, etc.) | - -## Insert Sample Data - - -```python -Researcher.insert([ - {'researcher_id': 1, 'name': 'Alice Chen', 'email': 'alice@lab.org'}, - {'researcher_id': 2, 'name': 'Bob Smith', 'email': 'bob@lab.org'}, -]) - -Subject.insert([ - {'subject_id': 1, 'species': 'mouse', 'sex': 'M'}, - {'subject_id': 2, 'species': 'mouse', 'sex': 'F'}, - {'subject_id': 3, 'species': 'rat', 'sex': 'M'}, -]) - -Session.insert([ - {'subject_id': 1, 'session_date': '2024-06-01', - 'researcher_id': 1, 'notes': 'First session'}, - {'subject_id': 1, 'session_date': '2024-06-15', - 'researcher_id': 1, 'notes': 'Follow-up'}, - {'subject_id': 2, 'session_date': '2024-06-10', - 'researcher_id': 2, 'notes': 'Initial'}, - {'subject_id': 3, 'session_date': '2024-06-20', - 'researcher_id': 2, 'notes': 'Rat study'}, -]) -``` - - -## Query Comparison - -### SELECT * FROM table - - -```python -# SQL: SELECT * FROM Subject -Subject() -``` - - -### WHERE — Restriction (`&`) - - -```python -# SQL: SELECT * FROM Subject WHERE sex = 'M' -Subject & {'sex': 'M'} -``` - - - -```python -# SQL: SELECT * FROM Subject WHERE species = 'mouse' AND sex = 'F' -Subject & {'species': 'mouse', 'sex': 'F'} -``` - - - -```python -# SQL: SELECT * FROM Session WHERE session_date > '2024-06-10' -Session & 'session_date > "2024-06-10"' -``` - - -### Restriction by Query (Subqueries) - -In SQL, you often use subqueries to filter based on another table: - -```sql --- Sessions with mice only -SELECT * FROM Session -WHERE subject_id IN (SELECT subject_id FROM Subject WHERE species = 'mouse') -``` - -In DataJoint, you simply restrict by another query — no special subquery syntax needed: - - -```python -# SQL: SELECT * FROM Session -# WHERE subject_id IN (SELECT subject_id FROM Subject WHERE species = 'mouse') - -# DataJoint: restrict Session by a query on Subject -mice = Subject & {'species': 'mouse'} -Session & mice -``` - - - -```python -# Restrict by Alice's sessions (finds subjects she worked with) -alice_sessions = Session & 
(Researcher & {'name': 'Alice Chen'}) -Subject & alice_sessions -``` - - -## Semantic Matching - -A fundamental difference between SQL and DataJoint is **semantic matching** — the principle that attributes acquire meaning through foreign key relationships, and all binary operators use this meaning to determine how tables combine. - -### The Problem with SQL - -SQL requires you to explicitly specify how tables connect: - -```sql -SELECT * FROM Session -JOIN Subject ON Session.subject_id = Subject.subject_id; -``` - -This is verbose and error-prone. Nothing prevents you from joining on unrelated columns that happen to share a name, or accidentally creating a Cartesian product when tables have no common columns. - -Experienced SQL programmers learn to always join through foreign key relationships. DataJoint makes this the **default and enforced behavior**. - -### How Semantic Matching Works - -In DataJoint, when you declare a foreign key with `-> Subject`, the `subject_id` attribute in your table inherits its **meaning** from the `Subject` table. This meaning propagates through the foreign key graph. - -**Semantic matching** means: all binary operators (`*`, `&`, `-`, `+`, `.aggr()`) match attributes based on shared meaning — those connected through foreign keys. If two tables have no semantically matching attributes, the operation raises an error rather than silently producing incorrect results. - - -```python -# All these operations use semantic matching on subject_id: - -# Join: combines Session and Subject on subject_id -Session * Subject - -# Restriction: filters Session to rows matching the Subject query -Session & (Subject & {'species': 'mouse'}) - -# Antijoin: Session rows NOT matching any Subject (none here, all subjects exist) -Session - Subject -``` - - -### One Join Operator Instead of Many - -SQL has multiple join types (`INNER`, `LEFT`, `RIGHT`, `FULL OUTER`, `CROSS`) because it must handle arbitrary column matching. DataJoint's single join operator (`*`) is sufficient because semantic matching is **more restrictive** than SQL's natural joins: - -- SQL natural joins match on **all columns with the same name** — which can accidentally match unrelated columns -- DataJoint semantic joins match only on **attributes connected through foreign keys** — and raise an error if you attempt to join on attributes that shouldn't be joined - -This catches errors at query time rather than producing silently incorrect results. - -### Algebraic Closure - -In standard SQL, query results are just "bags of rows" — they don't have a defined entity type. You cannot know what kind of thing each row represents without external context. - -DataJoint achieves **algebraic closure**: every query result is a valid entity set with a well-defined **entity type**. You always know what kind of entity the result represents, identified by a specific primary key. This means: - -1. **Every operator returns a valid relation** — not just rows, but a set of entities of a known type -2. **Operators compose indefinitely** — you can chain any sequence of operations -3. **Results remain queryable** — a query result can be used as an operand in further operations - -The entity type (and its primary key) is determined by precise rules based on the operator and the functional dependencies between operands. See the [Primary Keys specification](../../reference/specs/primary-keys.md) for details. 
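
For instance, the result of a restriction is still a `Session` entity set with the same primary key, so it can be used directly as an operand in further operations (variable and alias names below are only illustrative):


```python
# A query result is an entity set of a known type: Session, restricted to mouse subjects
mouse_sessions = Session & (Subject & {'species': 'mouse'})

# The result composes with further operators like any base table
Researcher.aggr(mouse_sessions, num_mouse_sessions='count(*)')
(mouse_sessions * Researcher).proj('name', 'session_date')
```
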
- -### SELECT columns — Projection (`.proj()`) - - -```python -# SQL: SELECT name, email FROM Researcher -Researcher.proj('name', 'email') -``` - - - -```python -# SQL: SELECT subject_id, species AS animal_type FROM Subject -Subject.proj(animal_type='species') -``` - - -### JOIN - - -```python -# SQL: SELECT * FROM Session JOIN Subject USING (subject_id) -Session * Subject -``` - - - -```python -# SQL: SELECT session_date, name, species -# FROM Session -# JOIN Subject USING (subject_id) -# JOIN Researcher USING (researcher_id) -(Session * Subject * Researcher).proj('session_date', 'name', 'species') -``` - - -### GROUP BY — Aggregation (`.aggr()`) - - -```python -# SQL: SELECT subject_id, COUNT(*) as num_sessions -# FROM Session GROUP BY subject_id -Subject.aggr(Session, num_sessions='count(*)') -``` - - - -```python -# SQL: SELECT researcher_id, name, COUNT(*) as num_sessions -# FROM Researcher JOIN Session USING (researcher_id) -# GROUP BY researcher_id -Researcher.aggr(Session, num_sessions='count(*)') -``` - - - -```python -# SQL: SELECT AVG(...), COUNT(*) FROM Session (no grouping) -dj.U().aggr(Session, total_sessions='count(*)') -``` - - -### NOT IN — Negative Restriction (`-`) - - -```python -# SQL: SELECT * FROM Subject -# WHERE subject_id NOT IN (SELECT subject_id FROM Session) -# (Subjects with no sessions) -Subject - Session -``` - - -### Combined Example - - -```python -# SQL: SELECT r.name, COUNT(*) as mouse_sessions -# FROM Researcher r -# JOIN Session s USING (researcher_id) -# JOIN Subject sub USING (subject_id) -# WHERE sub.species = 'mouse' -# GROUP BY r.researcher_id - -Researcher.aggr( - Session * (Subject & {'species': 'mouse'}), - mouse_sessions='count(*)' -) -``` - - -## Operator Reference - -| SQL | DataJoint | Notes | -|-----|-----------|-------| -| `SELECT *` | `Table()` | Display table | -| `SELECT cols` | `.proj('col1', 'col2')` | Projection | -| `SELECT col AS alias` | `.proj(alias='col')` | Rename | -| `WHERE condition` | `& {'col': value}` or `& 'expr'` | Restriction | -| `JOIN ... USING` | `Table1 * Table2` | Natural join | -| `GROUP BY ... AGG()` | `.aggr(Table, alias='agg()')` | Aggregation | -| `NOT IN (subquery)` | `Table1 - Table2` | Antijoin | -| `UNION` | `Table1 + Table2` | Union | - -## What DataJoint Adds - -DataJoint is not just "Python syntax for SQL." It adds: - -### 1. Table Tiers - -Tables are classified by their role in the workflow: - -| Tier | Purpose | SQL Equivalent | -|------|---------|----------------| -| `Lookup` | Reference data, parameters | Regular table | -| `Manual` | User-entered data | Regular table | -| `Imported` | Data from external files | Regular table + trigger | -| `Computed` | Derived results | Materialized view + trigger | - -### 2. Automatic Computation - -Computed tables have a `make()` method that runs automatically. An important principle: **`make()` should only fetch data from upstream tables** — those declared as dependencies in the table definition. 
- - -```python -@schema -class SessionAnalysis(dj.Computed): - definition = """ - -> Session # depends on Session - --- - day_of_week : varchar(10) - """ - - def make(self, key): - # Fetch only from upstream tables (Session and its dependencies) - date = (Session & key).fetch1('session_date') - self.insert1({**key, 'day_of_week': date.strftime('%A')}) -``` - - - -```python -# Automatically compute for all sessions -SessionAnalysis.populate(display_progress=True) -SessionAnalysis() -``` - - -In SQL, you'd need triggers, stored procedures, or external scheduling to achieve this. - -### 3. Cascading Deletes - -DataJoint enforces referential integrity with automatic cascading: - - -```python -# Deleting a session would delete its computed analysis -# (Session & {'subject_id': 1, 'session_date': '2024-06-01'}).delete() # Uncomment to try -``` - - -### 4. Schema as Workflow - -The diagram shows the computational workflow, not just relationships: - - -```python -dj.Diagram(schema) -``` - - -- **Green** = Manual (input) -- **Red** = Computed (derived) -- Arrows show dependency/execution order - -### 5. Object Storage Integration - -Store large objects (arrays, files) with relational semantics: - -```python -class Recording(dj.Imported): - definition = """ - -> Session - --- - raw_data : # NumPy array stored in database - video : # Large file stored externally - """ -``` - -SQL has no standard way to handle this. - -## When to Use Raw SQL - -DataJoint generates SQL under the hood. Sometimes raw SQL is useful: - - -```python -# See the generated SQL for any query -query = Session * Subject & {'species': 'mouse'} -print(query.make_sql()) -``` - - - -```python -# Execute raw SQL when needed -# result = dj.conn().query('SELECT * FROM ...') -``` - - -## Summary - -| Feature | SQL | DataJoint | -|---------|-----|----------| -| Query language | SQL strings | Python operators | -| Schema definition | DDL | Python classes | -| Foreign keys | Manual declaration | `->` syntax | -| Table purpose | Implicit | Explicit tiers | -| Automatic computation | Triggers/procedures | `populate()` | -| Large objects | BLOBs (limited) | Codec system | -| Workflow visualization | None | `dj.Diagram()` | - -DataJoint uses SQL databases (MySQL/PostgreSQL) underneath but provides: -- **Pythonic syntax** for queries -- **Workflow semantics** for scientific pipelines -- **Automatic computation** via `populate()` -- **Object storage** for large scientific data - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/01-first-pipeline.ipynb - -# A Simple Pipeline - -This tutorial introduces DataJoint by building a simple research lab database. You'll learn to: - -- Define tables with primary keys and dependencies -- Insert and query data -- Use the four core operations: restriction, projection, join, aggregation -- Understand the schema diagram - -We'll work with **Manual tables** only—tables where you enter data directly. Later tutorials introduce automated computation. 
- -For complete working examples, see: -- [University Database](../examples/university.ipynb) — Academic records with complex queries -- [Blob Detection](../examples/blob-detection.ipynb) — Image processing with computation - -## Setup - - -```python -import datajoint as dj - -schema = dj.Schema('tutorial_first_pipeline') -``` - - -## The Domain: A Research Lab - -We'll model a research lab that: -- Has **researchers** who conduct experiments -- Works with **subjects** (e.g., mice) -- Runs **sessions** where data is collected -- Collects **recordings** during each session - -```mermaid -flowchart TD - Researcher --> Session - Subject --> Session - Session --> Recording -``` - -## Defining Tables - -Each table is a Python class. The `definition` string specifies: -- **Primary key** (above `---`) — uniquely identifies each row -- **Attributes** (below `---`) — additional data for each row -- **Dependencies** (`->`) — references to other tables - - -```python -@schema -class Researcher(dj.Manual): - definition = """ - researcher_id : int32 - --- - researcher_name : varchar(100) - email : varchar(100) - """ -``` - - - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : int32 - --- - species : varchar(32) - date_of_birth : date - sex : enum('M', 'F', 'unknown') - """ -``` - - -### Dependencies - -A `Session` involves one researcher and one subject. The `->` syntax creates a **dependency** (foreign key): - - -```python -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_date : date - --- - -> Researcher - session_notes : varchar(255) - """ -``` - - -The `-> Subject` in the primary key means: -- `subject_id` is automatically included in Session's primary key -- Combined with `session_date`, each session is uniquely identified -- You cannot create a session for a non-existent subject - -The `-> Researcher` below the line is a non-primary dependency—it records who ran the session but isn't part of the unique identifier. - - -```python -@schema -class Recording(dj.Manual): - definition = """ - -> Session - recording_id : int16 - --- - duration : float32 # recording duration (seconds) - quality : enum('good', 'fair', 'poor') - """ -``` - - -## Schema Diagram - -The diagram shows tables and their dependencies: - - -```python -dj.Diagram(schema) -``` - - -**Reading the diagram:** -- **Green boxes** = Manual tables (you enter data) -- **Solid lines** = Primary key dependencies (part of identity) -- **Dashed lines** = Non-primary dependencies (references) - -## Inserting Data - -Data must be inserted in dependency order—you can't reference something that doesn't exist. 
- - -```python -# First: tables with no dependencies -Researcher.insert([ - {'researcher_id': 1, 'researcher_name': 'Alice Chen', - 'email': 'alice@lab.org'}, - {'researcher_id': 2, 'researcher_name': 'Bob Smith', - 'email': 'bob@lab.org'}, -]) - -Subject.insert([ - {'subject_id': 1, 'species': 'mouse', - 'date_of_birth': '2024-01-15', 'sex': 'M'}, - {'subject_id': 2, 'species': 'mouse', - 'date_of_birth': '2024-01-20', 'sex': 'F'}, - {'subject_id': 3, 'species': 'mouse', - 'date_of_birth': '2024-02-01', 'sex': 'M'}, -]) -``` - - - -```python -# Then: tables that depend on others -Session.insert([ - {'subject_id': 1, 'session_date': '2024-06-01', - 'researcher_id': 1, 'session_notes': 'First session'}, - {'subject_id': 1, 'session_date': '2024-06-15', - 'researcher_id': 1, 'session_notes': 'Follow-up'}, - {'subject_id': 2, 'session_date': '2024-06-10', - 'researcher_id': 2, 'session_notes': 'Initial recording'}, -]) -``` - - - -```python -# Finally: tables at the bottom of the hierarchy -Recording.insert([ - {'subject_id': 1, 'session_date': '2024-06-01', 'recording_id': 1, - 'duration': 300.5, 'quality': 'good'}, - {'subject_id': 1, 'session_date': '2024-06-01', 'recording_id': 2, - 'duration': 450.0, 'quality': 'good'}, - {'subject_id': 1, 'session_date': '2024-06-15', 'recording_id': 1, - 'duration': 600.0, 'quality': 'fair'}, - {'subject_id': 2, 'session_date': '2024-06-10', 'recording_id': 1, - 'duration': 350.0, 'quality': 'good'}, -]) -``` - - -## Viewing Data - -Display a table by calling it: - - -```python -Subject() -``` - - - -```python -Recording() -``` - - -## The Four Core Operations - -DataJoint queries use four fundamental operations. These compose to answer any question about your data. - -### 1. Restriction (`&`) — Filter rows - -Keep only rows matching a condition: - - -```python -# Subjects that are male -Subject & {'sex': 'M'} -``` - - - -```python -# Recordings with good quality -Recording & 'quality = "good"' -``` - - - -```python -# Sessions for subject 1 -Session & {'subject_id': 1} -``` - - -### 2. Projection (`.proj()`) — Select columns - -Choose which attributes to return, or compute new ones: - - -```python -# Just names and emails -Researcher.proj('researcher_name', 'email') -``` - - - -```python -# Compute duration in minutes -Recording.proj(duration_min='duration / 60') -``` - - -### 3. Join (`*`) — Combine tables - -Merge data from related tables: - - -```python -# Sessions with subject info -Session * Subject -``` - - - -```python -# Full recording details with subject and researcher -(Recording * Session * Subject * Researcher).proj( - 'researcher_name', 'species', 'duration', 'quality' -) -``` - - -### 4. 
Aggregation (`.aggr()`) — Summarize groups - -Compute statistics across groups of rows: - - -```python -# Count recordings per session -Session.aggr(Recording, num_recordings='count(*)') -``` - - - -```python -# Total recording time per subject -Subject.aggr(Recording, total_duration='sum(duration)') -``` - - - -```python -# Average duration across all recordings -dj.U().aggr(Recording, avg_duration='avg(duration)') -``` - - -## Combining Operations - -Operations chain together to answer complex questions: - - -```python -# Good-quality recordings for male subjects, with researcher name -( - Recording - & 'quality = "good"' - & (Subject & {'sex': 'M'}) -) * Session * Researcher.proj('researcher_name') -``` - - - -```python -# Count of good recordings per researcher -Researcher.aggr( - Session * (Recording & 'quality = "good"'), - good_recordings='count(*)' -) -``` - - -## Fetching Data - -To get data into Python, use fetch methods: - - -```python -# Fetch as list of dicts -Subject.to_dicts() -``` - - - -```python -# Fetch specific attributes as arrays -durations = (Recording & 'quality = "good"').to_arrays('duration') -print(f"Good recording durations: {durations}") -``` - - - -```python -# Fetch one row -one_subject = (Subject & {'subject_id': 1}).fetch1() -print(f"Subject 1: {one_subject}") -``` - - -## Deleting Data - -Deleting respects dependencies—downstream data is deleted automatically: - - -```python -# This would delete the session AND all its recordings -# (Session & {'subject_id': 2, 'session_date': '2024-06-10'}).delete() - -# Uncomment to try (will prompt for confirmation) -``` - - -## Summary - -You've learned the fundamentals of DataJoint: - -| Concept | Description | -|---------|-------------| -| **Tables** | Python classes with a `definition` string | -| **Primary key** | Above `---`, uniquely identifies rows | -| **Dependencies** | `->` creates foreign keys | -| **Restriction** | `&` filters rows | -| **Projection** | `.proj()` selects/computes columns | -| **Join** | `*` combines tables | -| **Aggregation** | `.aggr()` summarizes groups | - -### Next Steps - -- [Schema Design](02-schema-design.ipynb) — Primary keys, relationships, table tiers -- [Queries](04-queries.ipynb) — Advanced query patterns -- [Computation](05-computation.ipynb) — Automated processing with Imported/Computed tables - -### Complete Examples - -- [University Database](../examples/university.ipynb) — Complex queries on academic records -- [Blob Detection](../examples/blob-detection.ipynb) — Image processing pipeline with computation - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/02-schema-design.ipynb - -# Schema Design - -This tutorial covers how to design DataJoint schemas effectively. You'll learn: - -- **Table tiers** — Manual, Lookup, Imported, and Computed tables -- **Primary keys** — Uniquely identifying entities -- **Foreign keys** — Creating dependencies between tables -- **Relationship patterns** — One-to-many, one-to-one, and many-to-many - -We'll build a schema for a neuroscience experiment tracking subjects, sessions, and trials. 
- - -```python -import datajoint as dj - -schema = dj.Schema('tutorial_design') -``` - - -## Table Tiers - -DataJoint has four table tiers, each serving a different purpose: - -| Tier | Class | Purpose | Data Entry | -|------|-------|---------|------------| -| **Manual** | `dj.Manual` | Core experimental data | Inserted by operators or instruments | -| **Lookup** | `dj.Lookup` | Reference/configuration data | Pre-populated, rarely changes | -| **Imported** | `dj.Imported` | Data from external files | Auto-populated via `make()` | -| **Computed** | `dj.Computed` | Derived/processed data | Auto-populated via `make()` | - -**Manual** tables are not necessarily populated by hand—they contain data entered into the pipeline by operators, instruments, or ingestion scripts using `insert` commands. In contrast, **Imported** and **Computed** tables are auto-populated by calling the `.populate()` method, which invokes the `make()` callback for each missing entry. - -### Manual Tables - -Manual tables store data that is inserted directly—the starting point of your pipeline. - - -```python -@schema -class Lab(dj.Manual): - definition = """ - # Research laboratory - lab_id : varchar(16) # short identifier (e.g., 'tolias') - --- - lab_name : varchar(100) - institution : varchar(100) - created_at = CURRENT_TIMESTAMP : datetime # when record was created - """ - -@schema -class Subject(dj.Manual): - definition = """ - # Experimental subject - subject_id : varchar(16) - --- - -> Lab - species : varchar(50) - date_of_birth : date - sex : enum('M', 'F', 'U') - """ -``` - - -### Lookup Tables - -Lookup tables store reference data that rarely changes. Use the `contents` attribute to pre-populate them. - - -```python -@schema -class TaskType(dj.Lookup): - definition = """ - # Types of behavioral tasks - task_type : varchar(32) - --- - description : varchar(255) - """ - contents = [ - {'task_type': 'go_nogo', 'description': 'Go/No-Go discrimination task'}, - {'task_type': '2afc', 'description': 'Two-alternative forced choice'}, - {'task_type': 'foraging', 'description': 'Foraging/exploration task'}, - ] - -@schema -class SessionStatus(dj.Lookup): - definition = """ - # Session status codes - status : varchar(16) - """ - contents = [ - {'status': 'scheduled'}, - {'status': 'in_progress'}, - {'status': 'completed'}, - {'status': 'aborted'}, - ] -``` - - - -```python -# Lookup tables are automatically populated -TaskType() -``` - - -## Primary Keys - -The **primary key** uniquely identifies each row. Attributes above the `---` line form the primary key. - -### Design Principles - -1. **Entity integrity** — Each row represents exactly one real-world entity -2. **No duplicates** — The primary key prevents inserting the same entity twice -3. **Minimal** — Include only attributes necessary for uniqueness - -### Natural vs Surrogate Keys - -- **Natural key**: An identifier used *outside* the database to refer to entities in the real world. Requires a real-world mechanism to establish and maintain the association (e.g., ear tags, cage labels, barcodes). Example: `subject_id = 'M001'` where M001 is printed on the animal's cage. - -- **Surrogate key**: An identifier used *only inside* the database, with minimal or no exposure to end users. Users don't search by surrogate keys or use them in conversation. Example: internal record IDs, auto-generated UUIDs for system tracking. - -DataJoint works well with both. Natural keys make data more interpretable and enable identification of physical entities. 
Surrogate keys are appropriate when entities exist only within the system or when natural identifiers shouldn't be stored (e.g., privacy). - - -```python -@schema -class Session(dj.Manual): - definition = """ - # Experimental session - -> Subject - session_idx : uint16 # session number for this subject - --- - -> TaskType - -> SessionStatus - session_date : date - session_notes = '' : varchar(1000) - task_params = NULL : json # task-specific parameters (nullable) - """ - - class Trial(dj.Part): - definition = """ - # Individual trial within a session - -> master - trial_idx : uint16 - --- - stimulus : varchar(50) - response : varchar(50) - correct : bool - reaction_time : float32 # seconds - """ -``` - - -The primary key of `Session` is `(subject_id, session_idx)` — a **composite key**. This means: -- Each subject can have multiple sessions (1, 2, 3, ...) -- Session 1 for subject A is different from session 1 for subject B - -## Foreign Keys - -The `->` syntax creates a **foreign key** dependency. Foreign keys: - -1. **Import attributes** — Primary key attributes are inherited from the parent -2. **Enforce referential integrity** — Can't insert a session for a non-existent subject -3. **Enable cascading deletes** — Deleting a subject removes all its sessions -4. **Define workflow** — The parent must exist before the child - - -```python -# Let's insert some data to see how foreign keys work -Lab.insert1({ - 'lab_id': 'tolias', - 'lab_name': 'Tolias Lab', - 'institution': 'Baylor College of Medicine' -}) -# Note: created_at is auto-populated with CURRENT_TIMESTAMP - -Subject.insert1({ - 'subject_id': 'M001', - 'lab_id': 'tolias', - 'species': 'Mus musculus', - 'date_of_birth': '2026-01-15', - 'sex': 'M' -}) - -Subject() -``` - - - -```python -# Insert sessions for this subject -Session.insert([ - {'subject_id': 'M001', 'session_idx': 1, 'task_type': 'go_nogo', - 'status': 'completed', 'session_date': '2026-01-06', - 'task_params': {'go_probability': 0.5, 'timeout_sec': 2.0}}, - {'subject_id': 'M001', 'session_idx': 2, 'task_type': 'go_nogo', - 'status': 'completed', 'session_date': '2026-01-07', - 'task_params': {'go_probability': 0.7, 'timeout_sec': 1.5}}, - {'subject_id': 'M001', 'session_idx': 3, 'task_type': '2afc', - 'status': 'in_progress', 'session_date': '2026-01-08', - 'task_params': None}, # NULL - no parameters for this session -]) - -Session() -``` - - - -```python -# This would fail - referential integrity prevents invalid foreign keys -try: - Session.insert1({'subject_id': 'INVALID', 'session_idx': 1, - 'task_type': 'go_nogo', 'status': 'completed', - 'session_date': '2026-01-06'}) -except Exception as e: - print(f"Error: {type(e).__name__}") - print("Cannot insert session for non-existent subject!") -``` - - -## Relationship Patterns - -### One-to-Many (Hierarchical) - -When a foreign key is part of the primary key, it creates a **one-to-many** relationship: -- One subject → many sessions -- One session → many trials - -### Master-Part (Compositional Integrity) - -A **part table** provides **compositional integrity**: master and parts are inserted and deleted as an atomic unit. 
Part tables: -- Reference the master with `-> master` -- Are inserted together with the master atomically -- Are deleted when the master is deleted -- Can be one-to-many or one-to-one with the master -- A master can have multiple part tables, which may reference each other - -We defined `Session.Trial` as a part table because trials belong to their session: -- A session and all its trials should be entered together -- Deleting a session removes all its trials -- Downstream computations can assume all trials are present once the session exists - -Use part tables when components must be complete before processing can begin. - - -```python -# Access the part table -Session.Trial() -``` - - -### One-to-One (Extension) - -When the child's primary key exactly matches the parent's, it creates a **one-to-one** relationship. This is useful for: -- Extending a table with optional or computed data -- Separating computed results from source data - -`SessionSummary` below has a one-to-one relationship with `Session`—each session has exactly one summary. - - -```python -@schema -class SessionSummary(dj.Computed): - definition = """ - # Summary statistics for a session - -> Session - --- - num_trials : uint16 - num_correct : uint16 - accuracy : float32 - mean_reaction_time : float32 - """ - - def make(self, key): - correct_vals, rt_vals = (Session.Trial & key).to_arrays('correct', 'reaction_time') - n_trials = len(correct_vals) - n_correct = sum(correct_vals) if n_trials else 0 - - self.insert1({ - **key, - 'num_trials': n_trials, - 'num_correct': n_correct, - 'accuracy': n_correct / n_trials if n_trials else 0.0, - 'mean_reaction_time': sum(rt_vals) / n_trials if n_trials else 0.0 - }) -``` - - -### Optional Foreign Keys (Nullable) - -Use `[nullable]` for optional relationships: - - -```python -@schema -class Experimenter(dj.Manual): - definition = """ - # Lab member who runs experiments - experimenter_id : uuid # anonymized identifier - --- - full_name : varchar(100) - email = '' : varchar(100) - """ - -@schema -class SessionExperimenter(dj.Manual): - definition = """ - # Links sessions to experimenters (optional) - -> Session - --- - -> [nullable] Experimenter # experimenter may be unknown - """ -``` - - -### Many-to-Many (Association Tables) - -For many-to-many relationships, create an association table with foreign keys to both parents: - - -```python -@schema -class Protocol(dj.Lookup): - definition = """ - # Experimental protocols - protocol_id : varchar(32) - --- - protocol_name : varchar(100) - version : varchar(16) - """ - contents = [ - {'protocol_id': 'iacuc_2024_01', 'protocol_name': 'Mouse Behavior', 'version': '1.0'}, - {'protocol_id': 'iacuc_2024_02', 'protocol_name': 'Imaging Protocol', 'version': '2.1'}, - ] - -@schema -class SubjectProtocol(dj.Manual): - definition = """ - # Protocols assigned to subjects (many-to-many) - -> Subject - -> Protocol - --- - assignment_date : date - """ -``` - - -## View the Schema - -DataJoint can visualize the schema as a diagram: - - -```python -dj.Diagram(schema) -``` - - -### Reading the Diagram - -DataJoint diagrams show tables as nodes and foreign keys as edges. The notation conveys relationship semantics at a glance. 
- -**Line Styles:** - -| Line | Style | Relationship | Meaning | -|------|-------|--------------|---------| -| ━━━ | Thick solid | Extension | FK **is** entire PK (one-to-one) | -| ─── | Thin solid | Containment | FK **in** PK with other fields (one-to-many) | -| ┄┄┄ | Dashed | Reference | FK in secondary attributes (one-to-many) | - -**Visual Indicators:** - -| Indicator | Meaning | -|-----------|---------| -| **Underlined name** | Introduces new dimension (new PK attributes) | -| Non-underlined name | Inherits all dimensions (PK entirely from FKs) | -| **Green** | Manual table | -| **Gray** | Lookup table | -| **Red** | Computed table | -| **Blue** | Imported table | -| **Orange dots** | Renamed foreign keys (via `.proj()`) | - -**Key principle:** Solid lines mean the parent's identity becomes part of the child's identity. Dashed lines mean the child maintains independent identity. - -**Note:** Diagrams do NOT show `[nullable]` or `[unique]` modifiers—check table definitions for these constraints. - -See [How to Read Diagrams](../../how-to/read-diagrams.ipynb) for diagram operations and comparison to ER notation. - -## Insert Test Data and Populate - - -```python -# Insert trials for the first session -import random -random.seed(42) - -trials = [] -for i in range(20): - correct = random.random() > 0.3 - trials.append({ - 'subject_id': 'M001', - 'session_idx': 1, - 'trial_idx': i + 1, - 'stimulus': random.choice(['left', 'right']), - 'response': random.choice(['go', 'nogo']), - 'correct': correct, - 'reaction_time': random.uniform(0.2, 0.8) - }) - -Session.Trial.insert(trials, skip_duplicates=True) -print(f"Inserted {len(Session.Trial())} trials") -``` - - - -```python -# Populate the computed summary -SessionSummary.populate(display_progress=True) -SessionSummary() -``` - - -## Best Practices - -### 1. Choose Meaningful Primary Keys -- Use natural identifiers when possible (`subject_id = 'M001'`) -- Keep keys minimal but sufficient for uniqueness - -### 2. Use Appropriate Table Tiers -- **Manual**: Data entered by operators or instruments -- **Lookup**: Configuration, parameters, reference data -- **Imported**: Data read from files (recordings, images) -- **Computed**: Derived analyses and summaries - -### 3. Normalize Your Data -- Don't repeat information across rows -- Create separate tables for distinct entities -- Use foreign keys to link related data - -### 4. Use Core DataJoint Types - -DataJoint has a three-layer type architecture (see [Type System Specification](../reference/specs/type-system.md)): - -1. **Native database types** (Layer 1): Backend-specific types like `INT`, `FLOAT`, `TINYINT UNSIGNED`. These are **discouraged** but allowed for backward compatibility. - -2. **Core DataJoint types** (Layer 2): Standardized, scientist-friendly types that work identically across MySQL and PostgreSQL. **Always prefer these.** - -3. **Codec types** (Layer 3): Types with `encode()`/`decode()` semantics like ``, ``, ``. 
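-
-As a small illustration of the first two layers, here is a hypothetical table (not part of this tutorial's schema) that mixes a core type with a native type; the native attribute is accepted but triggers the declaration-time warning discussed below:
-
-```python
-@schema
-class SessionMetric(dj.Manual):
-    definition = """
-    # Illustrative only -- not used elsewhere in this tutorial
-    -> Session
-    metric_name : varchar(32)
-    ---
-    n_events    : uint32      # core type: explicit size, portable across backends
-    mean_rate   : float32     # core type
-    raw_count   : int         # native type: passed through, warns at declaration
-    """
-```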
- -**Core types used in this tutorial:** - -| Type | Description | Example | -|------|-------------|---------| -| `uint8`, `uint16`, `int32` | Sized integers | `session_idx : uint16` | -| `float32`, `float64` | Sized floats | `reaction_time : float32` | -| `varchar(n)` | Variable-length string | `name : varchar(100)` | -| `bool` | Boolean | `correct : bool` | -| `date` | Date only | `date_of_birth : date` | -| `datetime` | Date and time (UTC) | `created_at : datetime` | -| `enum(...)` | Enumeration | `sex : enum('M', 'F', 'U')` | -| `json` | JSON document | `task_params : json` | -| `uuid` | Universally unique ID | `experimenter_id : uuid` | - -**Why native types are allowed but discouraged:** - -Native types (like `int`, `float`, `tinyint`) are passed through to the database but generate a **warning at declaration time**. They are discouraged because: -- They lack explicit size information -- They are not portable across database backends -- They are not recorded in field metadata for reconstruction - -If you see a warning like `"Native type 'int' used; consider 'int32' instead"`, update your definition to use the corresponding core type. - -### 5. Document Your Tables -- Add comments after `#` in definitions -- Document units in attribute comments - -## Key Concepts Recap - -| Concept | Description | -|---------|-------------| -| **Primary Key** | Attributes above `---` that uniquely identify rows | -| **Secondary Attributes** | Attributes below `---` that store additional data | -| **Foreign Key** (`->`) | Reference to another table, imports its primary key | -| **One-to-Many** | FK in primary key: parent has many children | -| **One-to-One** | FK is entire primary key: exactly one child per parent | -| **Master-Part** | Compositional integrity: master and parts inserted/deleted atomically | -| **Nullable FK** | `[nullable]` makes the reference optional | -| **Lookup Table** | Pre-populated reference data | - -## Next Steps - -- [Data Entry](03-data-entry.ipynb) — Inserting, updating, and deleting data -- [Queries](04-queries.ipynb) — Filtering, joining, and projecting -- [Computation](05-computation.ipynb) — Building computational pipelines - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/03-data-entry.ipynb - -# Data Entry - -This tutorial covers how to manipulate data in DataJoint tables. You'll learn: - -- **Insert** — Adding rows to tables -- **Update** — Modifying existing rows (for corrections) -- **Delete** — Removing rows with cascading -- **Validation** — Checking data before insertion - -DataJoint is designed around **insert** and **delete** as the primary operations. Updates are intentionally limited to surgical corrections. 
- - -```python -import datajoint as dj -import numpy as np - -schema = dj.Schema('tutorial_data_entry') -``` - - - -```python -# Define tables for this tutorial -@schema -class Lab(dj.Manual): - definition = """ - lab_id : varchar(16) - --- - lab_name : varchar(100) - """ - -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - -> Lab - species : varchar(50) - date_of_birth : date - notes = '' : varchar(1000) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 - --- - session_date : date - duration : float32 # minutes - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - outcome : enum('hit', 'miss', 'false_alarm', 'correct_reject') - reaction_time : float32 # seconds - """ - -@schema -class ProcessedData(dj.Computed): - definition = """ - -> Session - --- - hit_rate : float32 - """ - - def make(self, key): - outcomes = (Session.Trial & key).to_arrays('outcome') - n_trials = len(outcomes) - hit_rate = np.sum(outcomes == 'hit') / n_trials if n_trials else 0.0 - self.insert1({**key, 'hit_rate': hit_rate}) -``` - - -## Insert Operations - -### `insert1()` — Single Row - -Use `insert1()` to add a single row as a dictionary: - - -```python -# Insert a single row -Lab.insert1({'lab_id': 'tolias', 'lab_name': 'Tolias Lab'}) - -Subject.insert1({ - 'subject_id': 'M001', - 'lab_id': 'tolias', - 'species': 'Mus musculus', - 'date_of_birth': '2026-01-15' -}) - -Subject() -``` - - -### `insert()` — Multiple Rows - -Use `insert()` to add multiple rows at once. This is more efficient than calling `insert1()` in a loop. - - -```python -# Insert multiple rows as a list of dictionaries -Subject.insert([ - { - 'subject_id': 'M002', - 'lab_id': 'tolias', - 'species': 'Mus musculus', - 'date_of_birth': '2026-02-01' - }, - { - 'subject_id': 'M003', - 'lab_id': 'tolias', - 'species': 'Mus musculus', - 'date_of_birth': '2026-02-15' - }, -]) - -Subject() -``` - - -### Accepted Input Formats - -`insert()` accepts several formats: - -| Format | Example | -|--------|--------| -| List of dicts | `[{'id': 1, 'name': 'A'}, ...]` | -| pandas DataFrame | `pd.DataFrame({'id': [1, 2], 'name': ['A', 'B']})` | -| numpy structured array | `np.array([(1, 'A')], dtype=[('id', int), ('name', 'U10')])` | -| QueryExpression | `OtherTable.proj(...)` (INSERT...SELECT) | - - -```python -# Insert from pandas DataFrame -import pandas as pd - -df = pd.DataFrame({ - 'subject_id': ['M004', 'M005'], - 'lab_id': ['tolias', 'tolias'], - 'species': ['Mus musculus', 'Mus musculus'], - 'date_of_birth': ['2026-03-01', '2026-03-15'] -}) - -Subject.insert(df) -print(f"Total subjects: {len(Subject())}") -``` - - -### Handling Duplicates - -By default, inserting a row with an existing primary key raises an error: - - -```python -# This will raise an error - duplicate primary key -try: - Subject.insert1({'subject_id': 'M001', 'lab_id': 'tolias', - 'species': 'Mus musculus', 'date_of_birth': '2026-01-15'}) -except Exception as e: - print(f"Error: {type(e).__name__}") - print("Cannot insert duplicate primary key!") -``` - - -Use `skip_duplicates=True` to silently skip rows with existing keys: - - -```python -# Skip duplicates - existing row unchanged -Subject.insert1( - {'subject_id': 'M001', 'lab_id': 'tolias', 'species': 'Mus musculus', 'date_of_birth': '2026-01-15'}, - skip_duplicates=True -) -print("Insert completed (duplicate skipped)") -``` - - -**Note:** `replace=True` is also available but has the same caveats as `update1()`—it 
bypasses immutability and can break provenance. Use sparingly for corrections only. - -### Extra Fields - -By default, inserting a row with fields not in the table raises an error: - - -```python -try: - Subject.insert1({'subject_id': 'M006', 'lab_id': 'tolias', - 'species': 'Mus musculus', 'date_of_birth': '2026-04-01', - 'unknown_field': 'some value'}) # Unknown field! -except Exception as e: - print(f"Error: {type(e).__name__}") - print("Field 'unknown_field' not in table!") -``` - - - -```python -# Use ignore_extra_fields=True to silently ignore unknown fields -Subject.insert1( - {'subject_id': 'M006', 'lab_id': 'tolias', 'species': 'Mus musculus', - 'date_of_birth': '2026-04-01', 'unknown_field': 'ignored'}, - ignore_extra_fields=True -) -print(f"Total subjects: {len(Subject())}") -``` - - -## Master-Part Tables and Transactions - -**Compositional integrity** means that a master and all its parts must be inserted (or deleted) as an atomic unit. This ensures downstream computations see complete data. - -- **Auto-populated tables** (Computed, Imported) enforce this automatically—`make()` runs in a transaction -- **Manual tables** require explicit transactions to maintain compositional integrity - -### Inserting Master with Parts - - -```python -# Use a transaction to ensure master and parts are inserted atomically -with dj.conn().transaction: - Session.insert1({ - 'subject_id': 'M001', - 'session_idx': 1, - 'session_date': '2026-01-06', - 'duration': 45.5 - }) - Session.Trial.insert([ - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 1, - 'outcome': 'hit', 'reaction_time': 0.35}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 2, - 'outcome': 'miss', 'reaction_time': 0.82}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 3, - 'outcome': 'hit', 'reaction_time': 0.41}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 4, - 'outcome': 'false_alarm', 'reaction_time': 0.28}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 5, - 'outcome': 'hit', 'reaction_time': 0.39}, - ]) - -# Both master and parts committed together, or neither if error occurred -Session.Trial() -``` - - -## Update Operations - -DataJoint provides only `update1()` for modifying single rows. This is intentional—updates bypass the normal workflow and should be used sparingly for **corrective operations**. - -### When to Use Updates - -**Appropriate uses:** -- Fixing data entry errors (typos, wrong values) -- Adding notes or metadata after the fact -- Administrative corrections - -**Inappropriate uses** (use delete + insert + populate instead): -- Regular workflow operations -- Changes that should trigger recomputation - - -```python -# Update a single row - must provide all primary key values -Subject.update1({'subject_id': 'M001', 'notes': 'Primary subject for behavioral study'}) - -(Subject & 'subject_id="M001"').fetch1() -``` - - - -```python -# Update multiple attributes at once -Subject.update1({ - 'subject_id': 'M002', - 'notes': 'Control group', - 'species': 'Mus musculus (C57BL/6)' # More specific -}) - -(Subject & 'subject_id="M002"').fetch1() -``` - - -### Update Requirements - -1. **Complete primary key**: All PK attributes must be provided -2. **Exactly one match**: Must match exactly one existing row -3. 
**No restrictions**: Cannot call on a restricted table - - -```python -# Error: incomplete primary key -try: - Subject.update1({'notes': 'Missing subject_id!'}) -except Exception as e: - print(f"Error: {type(e).__name__}") - print("Primary key must be complete") -``` - - - -```python -# Error: cannot update restricted table -try: - (Subject & 'subject_id="M001"').update1({'subject_id': 'M001', 'notes': 'test'}) -except Exception as e: - print(f"Error: {type(e).__name__}") - print("Cannot update restricted table") -``` - - -### Reset to Default - -Setting an attribute to `None` resets it to its default value: - - -```python -# Reset notes to default (empty string) -Subject.update1({'subject_id': 'M003', 'notes': None}) - -(Subject & 'subject_id="M003"').fetch1() -``` - - -## Delete Operations - -### Cascading Deletes - -Deleting a row automatically cascades to all dependent tables. This maintains referential integrity across the pipeline. - - -```python -# First, let's see what we have -print(f"Sessions: {len(Session())}") -print(f"Trials: {len(Session.Trial())}") - -# Populate computed table -ProcessedData.populate() -print(f"ProcessedData: {len(ProcessedData())}") -``` - - - -```python -# Delete a session - cascades to Trial and ProcessedData -(Session & {'subject_id': 'M001', 'session_idx': 1}).delete(prompt=False) - -print(f"After delete:") -print(f"Sessions: {len(Session())}") -print(f"Trials: {len(Session.Trial())}") -print(f"ProcessedData: {len(ProcessedData())}") -``` - - -### Prompt Behavior - -The `prompt` parameter controls whether `delete()` asks for confirmation. When `prompt=None` (default), the behavior is determined by `dj.config['safemode']`: - -```python -# Uses config['safemode'] setting (default) -(Table & condition).delete() - -# Explicitly skip confirmation -(Table & condition).delete(prompt=False) - -# Explicitly require confirmation -(Table & condition).delete(prompt=True) -``` - - -```python -# Add more data for demonstration -with dj.conn().transaction: - Session.insert1({ - 'subject_id': 'M002', - 'session_idx': 1, - 'session_date': '2026-01-07', - 'duration': 30.0 - }) - Session.Trial.insert([ - {'subject_id': 'M002', 'session_idx': 1, 'trial_idx': 1, - 'outcome': 'hit', 'reaction_time': 0.40}, - {'subject_id': 'M002', 'session_idx': 1, 'trial_idx': 2, - 'outcome': 'hit', 'reaction_time': 0.38}, - ]) - -# Delete with prompt=False (no confirmation prompt) -(Session & {'subject_id': 'M002', 'session_idx': 1}).delete(prompt=False) -``` - - -### The Recomputation Pattern - -When source data needs to change, the correct pattern is **delete → insert → populate**. This ensures all derived data remains consistent: - - -```python -# Add a session with trials (using transaction for compositional integrity) -with dj.conn().transaction: - Session.insert1({ - 'subject_id': 'M003', - 'session_idx': 1, - 'session_date': '2026-01-08', - 'duration': 40.0 - }) - Session.Trial.insert([ - {'subject_id': 'M003', 'session_idx': 1, 'trial_idx': 1, - 'outcome': 'hit', 'reaction_time': 0.35}, - {'subject_id': 'M003', 'session_idx': 1, 'trial_idx': 2, - 'outcome': 'miss', 'reaction_time': 0.50}, - ]) - -# Compute results -ProcessedData.populate() -print("Before correction:", ProcessedData.fetch1()) -``` - - - -```python -# Suppose we discovered trial 2 was actually a 'hit' not 'miss' -# WRONG: Updating the trial would leave ProcessedData stale! -# Session.Trial.update1({...}) # DON'T DO THIS - -# CORRECT: Delete, reinsert, recompute -key = {'subject_id': 'M003', 'session_idx': 1} - -# 1. 
Delete cascades to ProcessedData -(Session & key).delete(prompt=False) - -# 2. Reinsert with corrected data (using transaction) -with dj.conn().transaction: - Session.insert1({**key, 'session_date': '2026-01-08', 'duration': 40.0}) - Session.Trial.insert([ - {**key, 'trial_idx': 1, 'outcome': 'hit', 'reaction_time': 0.35}, - {**key, 'trial_idx': 2, 'outcome': 'hit', 'reaction_time': 0.50}, - ]) - -# 3. Recompute -ProcessedData.populate() -print("After correction:", ProcessedData.fetch1()) -``` - - -## Validation - -Use `validate()` to check data before insertion: - - -```python -# Validate rows before inserting -rows_to_insert = [ - {'subject_id': 'M007', 'lab_id': 'tolias', 'species': 'Mus musculus', 'date_of_birth': '2026-05-01'}, - {'subject_id': 'M008', 'lab_id': 'tolias', 'species': 'Mus musculus', 'date_of_birth': '2026-05-15'}, -] - -result = Subject.validate(rows_to_insert) - -if result: - Subject.insert(rows_to_insert) - print(f"Inserted {len(rows_to_insert)} rows") -else: - print("Validation failed:") - print(result.summary()) -``` - - - -```python -# Example of validation failure -bad_rows = [ - {'subject_id': 'M009', 'species': 'Mus musculus', 'date_of_birth': '2026-05-20'}, # Missing lab_id! -] - -result = Subject.validate(bad_rows) - -if not result: - print("Validation failed!") - for error in result.errors: - print(f" {error}") -``` - - -## Transactions - -Single operations are atomic by default. Use explicit transactions for: - -1. **Master-part inserts** — Maintain compositional integrity -2. **Multi-table operations** — All succeed or all fail -3. **Complex workflows** — Coordinate related changes - - -```python -# Atomic transaction - all inserts succeed or none do -with dj.conn().transaction: - Session.insert1({ - 'subject_id': 'M007', - 'session_idx': 1, - 'session_date': '2026-01-10', - 'duration': 35.0 - }) - Session.Trial.insert([ - {'subject_id': 'M007', 'session_idx': 1, 'trial_idx': 1, - 'outcome': 'hit', 'reaction_time': 0.33}, - {'subject_id': 'M007', 'session_idx': 1, 'trial_idx': 2, - 'outcome': 'miss', 'reaction_time': 0.45}, - ]) - -print(f"Session inserted with {len(Session.Trial & {'subject_id': 'M007'})} trials") -``` - - -## Best Practices - -### 1. Prefer Insert/Delete Over Update - -When source data changes, delete and reinsert rather than updating. Updates and `replace=True` bypass immutability and break provenance: - -```python -# Good: Delete and reinsert -(Trial & key).delete(prompt=False) -Trial.insert1(corrected_trial) -DerivedTable.populate() - -# Avoid: Update that leaves derived data stale -Trial.update1({**key, 'value': new_value}) -``` - -### 2. Use Transactions for Master-Part Inserts - -```python -# Ensures compositional integrity -with dj.conn().transaction: - Session.insert1(session_data) - Session.Trial.insert(trials) -``` - -### 3. Batch Inserts for Performance - -```python -# Good: Single insert call -Subject.insert(all_rows) - -# Slow: Loop of insert1 calls -for row in all_rows: - Subject.insert1(row) # Creates many transactions -``` - -### 4. Validate Before Insert - -```python -result = Subject.validate(rows) -if not result: - raise ValueError(result.summary()) -Subject.insert(rows) -``` - -### 5. 
Configure Safe Mode for Production - -```python -# In production scripts, explicitly control prompt behavior -(Subject & condition).delete(prompt=False) # No confirmation - -# Or configure globally via settings -dj.config['safemode'] = True # Require confirmation by default -``` - -## Quick Reference - -| Operation | Method | Use Case | -|-----------|--------|----------| -| Insert one | `insert1(row)` | Adding single entity | -| Insert many | `insert(rows)` | Bulk data loading | -| Update one | `update1(row)` | Surgical corrections only | -| Delete | `delete()` | Removing entities (cascades) | -| Delete quick | `delete_quick()` | Internal cleanup (no cascade) | -| Validate | `validate(rows)` | Pre-insert check | - -See the [Data Manipulation Specification](../reference/specs/data-manipulation.md) for complete details. - -## Next Steps - -- [Queries](04-queries.ipynb) — Filtering, joining, and projecting data -- [Computation](05-computation.ipynb) — Building computational pipelines - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/04-queries.ipynb - -# Queries - -This tutorial covers how to query data in DataJoint. You'll learn: - -- **Restriction** (`&`, `-`) — Filtering rows -- **Top** (`dj.Top`) — Limiting and ordering results -- **Projection** (`.proj()`) — Selecting and computing columns -- **Join** (`*`) — Combining tables -- **Extension** (`.extend()`) — Adding optional attributes -- **Aggregation** (`.aggr()`) — Grouping and summarizing -- **Fetching** — Retrieving data in various formats - -DataJoint queries are **lazy**—they build SQL expressions that execute only when you fetch data. - - -```python -import datajoint as dj -import numpy as np - -schema = dj.Schema('tutorial_queries') -``` - - - -```python -# Define tables for this tutorial -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(50) - date_of_birth : date - sex : enum('M', 'F', 'U') - weight : float32 # grams - """ - -@schema -class Experimenter(dj.Manual): - definition = """ - experimenter_id : varchar(16) - --- - full_name : varchar(100) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 - --- - -> Experimenter - session_date : date - duration : float32 # minutes - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - stimulus : varchar(50) - response : varchar(50) - correct : bool - reaction_time : float32 # seconds - """ -``` - - - -```python -# Insert sample data -import random -random.seed(42) - -Experimenter.insert([ - {'experimenter_id': 'alice', 'full_name': 'Alice Smith'}, - {'experimenter_id': 'bob', 'full_name': 'Bob Jones'}, -]) - -subjects = [ - {'subject_id': 'M001', 'species': 'Mus musculus', - 'date_of_birth': '2026-01-15', 'sex': 'M', 'weight': 25.3}, - {'subject_id': 'M002', 'species': 'Mus musculus', - 'date_of_birth': '2026-02-01', 'sex': 'F', 'weight': 22.1}, - {'subject_id': 'M003', 'species': 'Mus musculus', - 'date_of_birth': '2026-02-15', 'sex': 'M', 'weight': 26.8}, - {'subject_id': 'R001', 'species': 'Rattus norvegicus', - 'date_of_birth': '2024-01-01', 'sex': 'F', 'weight': 280.5}, -] -Subject.insert(subjects) - -# Insert sessions -sessions = [ - {'subject_id': 'M001', 'session_idx': 1, 'experimenter_id': 'alice', - 'session_date': '2026-01-06', 'duration': 45.0}, - {'subject_id': 'M001', 'session_idx': 2, 'experimenter_id': 'alice', - 'session_date': '2026-01-07', 'duration': 50.0}, - {'subject_id': 'M002', 
'session_idx': 1, 'experimenter_id': 'bob', - 'session_date': '2026-01-06', 'duration': 40.0}, - {'subject_id': 'M002', 'session_idx': 2, 'experimenter_id': 'bob', - 'session_date': '2026-01-08', 'duration': 55.0}, - {'subject_id': 'M003', 'session_idx': 1, 'experimenter_id': 'alice', - 'session_date': '2026-01-07', 'duration': 35.0}, -] -Session.insert(sessions) - -# Insert trials -trials = [] -for s in sessions: - for i in range(10): - trials.append({ - 'subject_id': s['subject_id'], - 'session_idx': s['session_idx'], - 'trial_idx': i + 1, - 'stimulus': random.choice(['left', 'right']), - 'response': random.choice(['left', 'right']), - 'correct': random.random() > 0.3, - 'reaction_time': random.uniform(0.2, 0.8) - }) -Session.Trial.insert(trials) - -print(f"Subjects: {len(Subject())}, Sessions: {len(Session())}, " - f"Trials: {len(Session.Trial())}") -``` - - -## Restriction (`&` and `-`) - -Restriction filters rows based on conditions. Use `&` to select matching rows, `-` to exclude them. - -### String Conditions - -SQL expressions using attribute names: - - -```python -# Simple comparison -Subject & "weight > 25" -``` - - - -```python -# Date comparison -Session & "session_date > '2026-01-06'" -``` - - - -```python -# Multiple conditions with AND -Subject & "sex = 'M' AND weight > 25" -``` - - -### Dictionary Conditions - -Dictionaries specify exact matches: - - -```python -# Single attribute -Subject & {'sex': 'F'} -``` - - - -```python -# Multiple attributes (AND) -Session & {'subject_id': 'M001', 'session_idx': 1} -``` - - -### Restriction by Query Expression - -Restrict by another query expression. DataJoint uses **semantic matching**: attributes with the same name are matched only if they share the same origin through foreign key lineage. This prevents accidental matches on unrelated attributes that happen to share names (like generic `id` columns in unrelated tables). - -See [Semantic Matching](../reference/specs/semantic-matching.md) for the full specification. - - -```python -# Subjects that have at least one session -Subject & Session -``` - - - -```python -# Subjects without any sessions (R001 has no sessions) -Subject - Session -``` - - -### Collection Conditions (OR) - -Lists create OR conditions: - - -```python -# Either of these subjects -Subject & [{'subject_id': 'M001'}, {'subject_id': 'M002'}] -``` - - -### Chaining Restrictions - -Sequential restrictions combine with AND: - - -```python -# These are equivalent -result1 = Subject & "sex = 'M'" & "weight > 25" -result2 = (Subject & "sex = 'M'") & "weight > 25" - -print(f"Result 1: {len(result1)} rows") -print(f"Result 2: {len(result2)} rows") -``` - - -### Top Restriction (`dj.Top`) - -`dj.Top` is a special restriction that limits and orders query results. Unlike fetch-time `order_by` and `limit`, `dj.Top` applies **within the query itself**, making it composable with other operators. - -```python -query & dj.Top(limit=N, order_by='attr DESC', offset=M) -``` - -This is useful when you need the "top N" rows as part of a larger query—for example, the 5 highest-scoring trials per session. 
-
-```python
-# Top 2 heaviest subjects
-Subject & dj.Top(limit=2, order_by='weight DESC')
-```
-
-
-```python
-# Skip first 2, then get next 2 (pagination)
-Subject & dj.Top(limit=2, order_by='weight DESC', offset=2)
-```
-
-
-```python
-# Combine with other restrictions
-(Subject & "sex = 'M'") & dj.Top(limit=1, order_by='weight DESC')
-```
-
-
-**When to use `dj.Top` vs fetch-time `order_by`/`limit`:**
-
-- Use `dj.Top` when the limited result needs to be **joined or restricted further**
-- Use fetch-time parameters (`to_dicts(order_by=..., limit=...)`) for **final output**
-
-**Note:** Some databases (including MySQL 8.0) don't support LIMIT in certain subquery contexts. If you encounter this limitation, fetch the keys first and use them as a restriction:
-
-
-```python
-# Get trials only from the 2 longest sessions
-# Workaround: fetch keys first, then use as restriction
-longest_session_keys = (Session & dj.Top(limit=2, order_by='duration DESC')).keys()
-Session.Trial & longest_session_keys
-```
-
-
-## Projection (`.proj()`)
-
-Projection selects, renames, or computes attributes.
-
-### Selecting Attributes
-
-
-```python
-# Primary key only (no arguments)
-Subject.proj()
-```
-
-
-```python
-# Primary key + specific attributes
-Subject.proj('species', 'sex')
-```
-
-
-```python
-# All attributes (using ellipsis)
-Subject.proj(...)
-```
-
-
-```python
-# All except specific attributes
-Subject.proj(..., '-weight')
-```
-
-
-### Renaming Attributes
-
-
-```python
-# Rename 'species' to 'animal_species'
-Subject.proj(animal_species='species')
-```
-
-
-### Computed Attributes
-
-
-```python
-# Arithmetic computation
-Subject.proj('species', weight_kg='weight / 1000')
-```
-
-
-```python
-# Date functions
-Session.proj('session_date', year='YEAR(session_date)', month='MONTH(session_date)')
-```
-
-
-## Join (`*`)
-
-Join combines tables on shared attributes. Unlike SQL, which offers many join variants (INNER, LEFT, RIGHT, FULL, CROSS, NATURAL), DataJoint provides **one rigorous join operator** with strict semantic rules.
-
-The `*` operator:
-- Matches only **semantically compatible** attributes (same name AND same origin via foreign key lineage)
-- Produces a result with a **valid primary key** determined by functional dependencies
-- Follows clear algebraic properties
-
-This simplicity makes DataJoint queries unambiguous and composable.
-
-
-```python
-# Join Subject and Session on subject_id
-Subject * Session
-```
-
-
-```python
-# Join then restrict
-(Subject * Session) & "sex = 'M'"
-```
-
-
-```python
-# Restrict then join (equivalent result)
-(Subject & "sex = 'M'") * Session
-```
-
-
-```python
-# Three-way join
-(Subject * Session * Experimenter).proj('species', 'session_date', 'full_name')
-```
-
-
-### Primary Keys in Join Results
-
-Every query result has a valid primary key. For joins, the result's primary key depends on **functional dependencies** between the operands:
-
-| Condition | Result Primary Key |
-|-----------|-------------------|
-| `A → B` (A determines B) | PK(A) |
-| `B → A` (B determines A) | PK(B) |
-| Both | PK(A) |
-| Neither | PK(A) ∪ PK(B) |
-
-**"A determines B"** means all of B's primary key attributes exist in A (as primary or secondary attributes).
-
-In our example:
-- `Session` has PK: `(subject_id, session_idx)`
-- `Trial` has PK: `(subject_id, session_idx, trial_idx)`
-
-Since all of Session's primary key attributes appear in Trial, Trial determines Session (`Trial → Session`). The join `Session * Trial` therefore has the same primary key as Trial: `(subject_id, session_idx, trial_idx)`.
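-
-You can check this on the query expression itself. A minimal sketch, assuming the `primary_key` property is exposed on query expressions as in current DataJoint releases:
-
-```python
-# The join of Session with its Trial part table carries Trial's primary key
-joined = Session * Session.Trial
-print(joined.primary_key)   # expected: ['subject_id', 'session_idx', 'trial_idx']
-```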
- -See the [Query Algebra Specification](../reference/specs/query-algebra.md) for the complete functional dependency rules. - -### Extension (`.extend()`) - -Sometimes you want to add attributes from a related table without losing rows that lack matching entries. The **extend** operator is a specialized join for this purpose. - -`A.extend(B)` is equivalent to a left join: it preserves all rows from A, adding B's attributes where matches exist (with NULL where they don't). - -**Requirement**: A must "determine" B—all of B's primary key attributes must exist in A. This ensures the result maintains A's entity identity. - - -```python -# Session contains experimenter_id (FK to Experimenter) -# extend adds Experimenter's attributes while keeping all Sessions -Session.extend(Experimenter) -``` - - -**Why extend instead of join?** - -A regular join (`*`) would exclude sessions if their experimenter wasn't in the Experimenter table. Extend preserves all sessions, filling in NULL for missing experimenter data. This is essential when you want to add optional attributes without filtering your results. - -## Aggregation (`.aggr()`) - -DataJoint aggregation operates **entity-to-entity**: you aggregate one entity type with respect to another. This differs fundamentally from SQL's `GROUP BY`, which groups by arbitrary attribute sets. - -In DataJoint: -```python -Session.aggr(Trial, n_trials='count(*)') -``` - -This reads: "For each **Session entity**, aggregate its associated **Trial entities**." - -The equivalent SQL would be: -```sql -SELECT session.*, COUNT(*) as n_trials -FROM session -JOIN trial USING (subject_id, session_idx) -GROUP BY session.subject_id, session.session_idx -``` - -The key insight: aggregation always groups by the **primary key of the left operand**. This enforces meaningful groupings—you aggregate over well-defined entities, not arbitrary attribute combinations. - - -```python -# Count trials per session -Session.aggr(Session.Trial, n_trials='count(*)') -``` - - - -```python -# Multiple aggregates -Session.aggr( - Session.Trial, - n_trials='count(*)', - n_correct='sum(correct)', - avg_rt='avg(reaction_time)' -) -``` - - - -```python -# Count sessions per subject -Subject.aggr(Session, n_sessions='count(*)') -``` - - -### The `exclude_nonmatching` Parameter - -By default, aggregation keeps all entities from the grouping table, even those without matches. This ensures you see zeros rather than missing rows. - -However, `count(*)` counts the NULL-joined row as 1. To correctly count 0 for entities without matches, use `count(pk_attribute)` which excludes NULLs: - - -```python -# All subjects, including those without sessions (n_sessions=0) -# count(session_idx) returns 0 for NULLs, unlike count(*) -Subject.aggr(Session, n_sessions='count(session_idx)') -``` - - - -```python -# Only subjects that have at least one session (exclude those without matches) -Subject.aggr(Session, n_sessions='count(session_idx)', exclude_nonmatching=True) -``` - - -### Universal Set (`dj.U()`) - -What if you need to aggregate but there's no appropriate entity to group by? DataJoint provides `dj.U()` (the "universal set") for these cases. - -**`dj.U()`** (no attributes) represents the singleton entity—the "one universe." Aggregating against it produces a single row with global statistics. - -**`dj.U('attr1', 'attr2')`** creates an ad-hoc grouping entity from the specified attributes. This enables aggregation when no table exists with those attributes as its primary key. 
- -For example, suppose you want to count sessions by `session_date`, but no table has `session_date` as its primary key. You can use `dj.U('session_date')` to create the grouping: - - -```python -# Group by session_date (not a primary key in any table) -dj.U('session_date').aggr(Session, n_sessions='count(*)', total_duration='sum(duration)') -``` - - - -```python -# Universal aggregation: dj.U() with no attributes produces one row -# This aggregates against the singleton "universe" -dj.U().aggr(Session, total_sessions='count(*)', avg_duration='avg(duration)') -``` - - - -```python -# Group by experimenter_id (a foreign key in Session, not part of Session's PK) -# Without dj.U(), we couldn't aggregate sessions by experimenter -dj.U('experimenter_id').aggr(Session, n_sessions='count(*)') -``` - - - -```python -# Unique values -dj.U('species') & Subject -``` - - -## Fetching Data - -DataJoint 2.0 provides explicit methods for different output formats. - -### `to_dicts()` — List of Dictionaries - - -```python -# Get all rows as list of dicts -rows = Subject.to_dicts() -rows[:2] -``` - - -### `to_pandas()` — DataFrame - - -```python -# Get as pandas DataFrame (primary key as index) -df = Subject.to_pandas() -df -``` - - -### `to_arrays()` — NumPy Arrays - - -```python -# Structured array (all columns) -arr = Subject.to_arrays() -arr -``` - - - -```python -# Specific columns as separate arrays -species, weights = Subject.to_arrays('species', 'weight') -print(f"Species: {species}") -print(f"Weights: {weights}") -``` - - -### `keys()` — Primary Keys - - -```python -# Get primary keys for iteration -keys = Session.keys() -keys[:3] -``` - - -### `fetch1()` — Single Row - - -```python -# Fetch one row (raises error if not exactly 1) -row = (Subject & {'subject_id': 'M001'}).fetch1() -row -``` - - - -```python -# Fetch specific attributes from one row -species, weight = (Subject & {'subject_id': 'M001'}).fetch1('species', 'weight') -print(f"{species}: {weight}g") -``` - - -### Ordering and Limiting - - -```python -# Sort by weight descending, get top 2 -Subject.to_dicts(order_by='weight DESC', limit=2) -``` - - - -```python -# Sort by primary key -Subject.to_dicts(order_by='KEY') -``` - - -### Lazy Iteration - -Iterating directly over a table streams rows efficiently: - - -```python -# Stream rows (single database cursor) -for row in Subject: - print(f"{row['subject_id']}: {row['species']}") -``` - - -## Query Composition - -Queries are composable and immutable. Build complex queries step by step: - - -```python -# Build a complex query step by step -male_mice = Subject & "sex = 'M'" & "species LIKE '%musculus%'" -sessions_with_subject = male_mice * Session -alice_sessions = sessions_with_subject & {'experimenter_id': 'alice'} -result = alice_sessions.proj('session_date', 'duration', 'weight') - -result -``` - - - -```python -# Or as a single expression -((Subject & "sex = 'M'" & "species LIKE '%musculus%'") - * Session - & {'experimenter_id': 'alice'} -).proj('session_date', 'duration', 'weight') -``` - - -## Operator Precedence - -Python operator precedence applies: - -1. `*` (join) — highest -2. `+`, `-` (union, anti-restriction) -3. 
`&` (restriction) — lowest - -Use parentheses for clarity: - - -```python -# Without parentheses: join happens first -# Subject * Session & condition means (Subject * Session) & condition - -# With parentheses: explicit order -result1 = (Subject & "sex = 'M'") * Session # Restrict then join -result2 = Subject * (Session & "duration > 40") # Restrict then join - -print(f"Result 1: {len(result1)} rows") -print(f"Result 2: {len(result2)} rows") -``` - - -## Quick Reference - -### Operators - -| Operation | Syntax | Description | -|-----------|--------|-------------| -| Restrict | `A & cond` | Select matching rows | -| Anti-restrict | `A - cond` | Select non-matching rows | -| Top | `A & dj.Top(limit, order_by)` | Limit/order results | -| Project | `A.proj(...)` | Select/compute columns | -| Join | `A * B` | Combine tables | -| Extend | `A.extend(B)` | Add B's attributes, keep all A rows | -| Aggregate | `A.aggr(B, ...)` | Group and summarize | -| Union | `A + B` | Combine entity sets | - -### Fetch Methods - -| Method | Returns | Use Case | -|--------|---------|----------| -| `to_dicts()` | `list[dict]` | JSON, iteration | -| `to_pandas()` | `DataFrame` | Data analysis | -| `to_arrays()` | `np.ndarray` | Numeric computation | -| `to_arrays('a', 'b')` | `tuple[array, ...]` | Specific columns | -| `keys()` | `list[dict]` | Primary keys | -| `fetch1()` | `dict` | Single row | - -See the [Query Algebra Specification](../reference/specs/query-algebra.md) and [Fetch API](../reference/specs/fetch-api.md) for complete details. - -## Next Steps - -- [Computation](05-computation.ipynb) — Building computational pipelines - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/05-computation.ipynb - -# Computation - -This tutorial covers how to build computational pipelines with DataJoint. You'll learn: - -- **Computed tables** — Automatic derivation from other tables -- **Imported tables** — Ingesting data from external files -- **The `make()` method** — Computing and inserting results -- **Part tables** — Storing detailed results -- **Populate patterns** — Running computations efficiently - -DataJoint's auto-populated tables (`Computed` and `Imported`) execute automatically based on their dependencies. 
- - -```python -import datajoint as dj -import numpy as np - -schema = dj.Schema('tutorial_computation') -``` - - -## Manual Tables (Source Data) - -First, let's define the source tables that our computations will depend on: - - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(50) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 - --- - session_date : date - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - stimulus : varchar(50) - response : varchar(50) - correct : bool - reaction_time : float32 # seconds - """ - -@schema -class AnalysisMethod(dj.Lookup): - definition = """ - method_name : varchar(32) - --- - description : varchar(255) - """ - contents = [ - {'method_name': 'basic', 'description': 'Simple accuracy calculation'}, - {'method_name': 'weighted', 'description': 'Reaction-time weighted accuracy'}, - ] -``` - - - -```python -# Insert sample data -import random -random.seed(42) - -Subject.insert([ - {'subject_id': 'M001', 'species': 'Mus musculus'}, - {'subject_id': 'M002', 'species': 'Mus musculus'}, -]) - -sessions = [ - {'subject_id': 'M001', 'session_idx': 1, 'session_date': '2026-01-06'}, - {'subject_id': 'M001', 'session_idx': 2, 'session_date': '2026-01-07'}, - {'subject_id': 'M002', 'session_idx': 1, 'session_date': '2026-01-06'}, -] -Session.insert(sessions) - -# Insert trials for each session -trials = [] -for s in sessions: - for i in range(15): - trials.append({ - 'subject_id': s['subject_id'], - 'session_idx': s['session_idx'], - 'trial_idx': i + 1, - 'stimulus': random.choice(['left', 'right']), - 'response': random.choice(['left', 'right']), - 'correct': random.random() > 0.3, - 'reaction_time': random.uniform(0.2, 0.8) - }) -Session.Trial.insert(trials) - -print(f"Subjects: {len(Subject())}, Sessions: {len(Session())}, " - f"Trials: {len(Session.Trial())}") -``` - - -## Computed Tables - -A `Computed` table derives its data from other DataJoint tables. The `make()` method computes and inserts one entry at a time. - -### Basic Computed Table - - -```python -@schema -class SessionSummary(dj.Computed): - definition = """ - # Summary statistics for each session - -> Session - --- - n_trials : uint16 - n_correct : uint16 - accuracy : float32 - mean_rt : float32 # mean reaction time (seconds) - """ - - def make(self, key): - # Fetch trial data for this session - correct, rt = (Session.Trial & key).to_arrays('correct', 'reaction_time') - - n_trials = len(correct) - n_correct = sum(correct) if n_trials else 0 - - # Insert computed result - self.insert1({ - **key, - 'n_trials': n_trials, - 'n_correct': n_correct, - 'accuracy': n_correct / n_trials if n_trials else 0.0, - 'mean_rt': np.mean(rt) if n_trials else 0.0 - }) -``` - - -### Running Computations with `populate()` - -The `populate()` method automatically finds entries that need computing and calls `make()` for each: - - -```python -# Check what needs computing -print(f"Entries to compute: {len(SessionSummary.key_source - SessionSummary)}") - -# Run the computation -SessionSummary.populate(display_progress=True) - -# View results -SessionSummary() -``` - - -### Key Source - -The `key_source` property defines which entries should be computed. 
By default, it's the join of all parent tables referenced in the primary key: - - -```python -# SessionSummary.key_source is automatically Session -# (the table referenced in the primary key) -print("Key source:") -SessionSummary.key_source -``` - - -## Multiple Dependencies - -Computed tables can depend on multiple parent tables. The `key_source` is the join of all parents: - - -```python -@schema -class SessionAnalysis(dj.Computed): - definition = """ - # Analysis with configurable method - -> Session - -> AnalysisMethod - --- - score : float32 - """ - - def make(self, key): - # Fetch trial data - correct, rt = (Session.Trial & key).to_arrays('correct', 'reaction_time') - - # Apply method-specific analysis - if key['method_name'] == 'basic': - score = sum(correct) / len(correct) if len(correct) else 0.0 - elif key['method_name'] == 'weighted': - # Weight correct trials by inverse reaction time - weights = 1.0 / rt - score = sum(correct * weights) / sum(weights) if len(correct) else 0.0 - else: - score = 0.0 - - self.insert1({**key, 'score': score}) -``` - - - -```python -# Key source is Session * AnalysisMethod (all combinations) -print(f"Key source has {len(SessionAnalysis.key_source)} entries") -print(f" = {len(Session())} sessions x {len(AnalysisMethod())} methods") - -SessionAnalysis.populate(display_progress=True) -SessionAnalysis() -``` - - -## Computed Tables with Part Tables - -Use part tables to store detailed results alongside summary data: - - -```python -@schema -class TrialAnalysis(dj.Computed): - definition = """ - # Per-trial analysis results - -> Session - --- - n_analyzed : uint16 - """ - - class TrialResult(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - rt_percentile : float32 # reaction time percentile within session - is_fast : bool # below median reaction time - """ - - def make(self, key): - # Fetch trial data - trial_data = (Session.Trial & key).to_dicts() - - if not trial_data: - self.insert1({**key, 'n_analyzed': 0}) - return - - # Calculate percentiles - rts = [t['reaction_time'] for t in trial_data] - median_rt = np.median(rts) - - # Insert master entry - self.insert1({**key, 'n_analyzed': len(trial_data)}) - - # Insert part entries - parts = [] - for t in trial_data: - percentile = sum(rt <= t['reaction_time'] for rt in rts) / len(rts) * 100 - parts.append({ - **key, - 'trial_idx': t['trial_idx'], - 'rt_percentile': float(percentile), - 'is_fast': t['reaction_time'] < median_rt - }) - - self.TrialResult.insert(parts) -``` - - - -```python -TrialAnalysis.populate(display_progress=True) - -print("Master table:") -print(TrialAnalysis()) - -print("\nPart table (first session):") -print((TrialAnalysis.TrialResult & {'subject_id': 'M001', 'session_idx': 1})) -``` - - -## Cascading Computations - -Computed tables can depend on other computed tables, creating a pipeline: - - -```python -@schema -class SubjectSummary(dj.Computed): - definition = """ - # Summary across all sessions for a subject - -> Subject - --- - n_sessions : uint16 - total_trials : uint32 - overall_accuracy : float32 - """ - - def make(self, key): - # Fetch from SessionSummary (another computed table) - summaries = (SessionSummary & key).to_dicts() - - n_sessions = len(summaries) - total_trials = sum(s['n_trials'] for s in summaries) - total_correct = sum(s['n_correct'] for s in summaries) - - self.insert1({ - **key, - 'n_sessions': n_sessions, - 'total_trials': total_trials, - 'overall_accuracy': total_correct / total_trials if total_trials else 0.0 - }) -``` - - - -```python 
-# SubjectSummary depends on SessionSummary which is already populated -SubjectSummary.populate(display_progress=True) -SubjectSummary() -``` - - -## View the Pipeline - -Visualize the dependency structure: - - -```python -dj.Diagram(schema) -``` - - -## Recomputation After Changes - -When source data changes, delete the affected computed entries and re-populate: - - -```python -# Add a new session -Session.insert1({'subject_id': 'M001', 'session_idx': 3, 'session_date': '2026-01-08'}) - -# Add trials for the new session -new_trials = [ - {'subject_id': 'M001', 'session_idx': 3, 'trial_idx': i + 1, - 'stimulus': 'left', 'response': 'left', 'correct': True, 'reaction_time': 0.3} - for i in range(20) -] -Session.Trial.insert(new_trials) - -# Re-populate (only computes new entries) -print("Populating new session...") -SessionSummary.populate(display_progress=True) -TrialAnalysis.populate(display_progress=True) - -# SubjectSummary needs to be recomputed for M001 -# Delete old entry first (cascading not needed here since no dependents) -(SubjectSummary & {'subject_id': 'M001'}).delete(prompt=False) -SubjectSummary.populate(display_progress=True) - -print("\nUpdated SubjectSummary:") -SubjectSummary() -``` - - -## Populate Options - -### Restrict to Specific Entries - - -```python -# Populate only for a specific subject -SessionAnalysis.populate(Subject & {'subject_id': 'M001'}) -``` - - -### Limit Number of Computations - - -```python -# Process at most 5 entries -SessionAnalysis.populate(max_calls=5, display_progress=True) -``` - - -### Error Handling - - -```python -# Continue despite errors -result = SessionAnalysis.populate(suppress_errors=True) -print(f"Success: {result.get('success', 0)}, Errors: {result.get('error', 0)}") -``` - - -## Progress Tracking - - -```python -# Check progress -remaining, total = SessionAnalysis.progress() -print(f"SessionAnalysis: {total - remaining}/{total} computed") -``` - - -## Custom Key Source - -Override `key_source` to customize which entries to compute: - - -```python -@schema -class QualityCheck(dj.Computed): - definition = """ - -> Session - --- - passes_qc : bool - """ - - @property - def key_source(self): - # Only process sessions with at least 10 trials - good_sessions = dj.U('subject_id', 'session_idx').aggr( - Session.Trial, n='count(*)' - ) & 'n >= 10' - return Session & good_sessions - - def make(self, key): - # Fetch summary stats - summary = (SessionSummary & key).fetch1() - - # QC: accuracy > 50% and mean RT < 1 second - passes = summary['accuracy'] > 0.5 and summary['mean_rt'] < 1.0 - - self.insert1({**key, 'passes_qc': passes}) -``` - - - -```python -print(f"Key source entries: {len(QualityCheck.key_source)}") -QualityCheck.populate(display_progress=True) -QualityCheck() -``` - - -## Best Practices - -### 1. Keep `make()` Simple and Idempotent - -```python -def make(self, key): - # 1. Fetch source data - data = (SourceTable & key).fetch1() - - # 2. Compute result - result = compute(data) - - # 3. Insert result - self.insert1({**key, **result}) -``` - -### 2. Use Part Tables for Detailed Results - -Store summary in master, details in parts: - -```python -def make(self, key): - self.insert1({**key, 'summary': s}) # Master - self.Detail.insert(details) # Parts -``` - -### 3. Re-populate After Data Changes - -```python -# Delete affected entries (cascades automatically) -(SourceTable & key).delete() - -# Reinsert corrected data -SourceTable.insert1(corrected) - -# Re-populate -ComputedTable.populate() -``` - -### 4. 
Use Lookup Tables for Parameters - -```python -@schema -class Method(dj.Lookup): - definition = "..." - contents = [...] # Pre-defined methods - -@schema -class Analysis(dj.Computed): - definition = """ - -> Session - -> Method # Parameter combinations - --- - result : float64 - """ -``` - -See the [AutoPopulate Specification](../reference/specs/autopopulate.md) for complete details. - -## Quick Reference - -| Method | Description | -|--------|-------------| -| `populate()` | Compute all pending entries | -| `populate(restriction)` | Compute subset of entries | -| `populate(max_calls=N)` | Compute at most N entries | -| `populate(display_progress=True)` | Show progress bar | -| `populate(suppress_errors=True)` | Continue on errors | -| `progress()` | Check completion status | -| `key_source` | Entries that should be computed | - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/basics/06-object-storage.ipynb - -# Object-Augmented Schemas - -This tutorial covers DataJoint's Object-Augmented Schema (OAS) model. You'll learn: - -- **The OAS concept** — Unified relational + object storage -- **Blobs** — Storing arrays and Python objects -- **Object storage** — Scaling to large datasets -- **Staged insert** — Writing directly to object storage (Zarr, HDF5) -- **Attachments** — Preserving file names and formats -- **Codecs** — How data is serialized and deserialized - -In an Object-Augmented Schema, the relational database and object storage operate as a **single integrated system**—not as separate "internal" and "external" components. - - -```python -import datajoint as dj -import numpy as np - -schema = dj.Schema('tutorial_oas') -``` - - -## The Object-Augmented Schema Model - -Scientific data often combines: -- **Structured metadata** — Subjects, sessions, parameters (relational) -- **Large data objects** — Arrays, images, recordings (binary) - -DataJoint's OAS model manages both as a unified system: - -```mermaid -block-beta - columns 1 - block:oas:1 - columns 2 - OAS["Object-Augmented Schema"]:2 - block:db:1 - DB["Relational Database"] - DB1["Metadata"] - DB2["Keys"] - DB3["Relationships"] - end - block:os:1 - OS["Object Storage (S3/File/etc)"] - OS1["Large arrays"] - OS2["Images/videos"] - OS3["Recordings"] - end - end -``` - -From the user's perspective, this is **one schema**—storage location is transparent. - -## Blob Attributes - -Use `` to store arbitrary Python objects: - - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uint16 - --- - metadata : # Dict, stored in database - waveform : # NumPy array, stored in database - """ -``` - - - -```python -# Insert with blob data -Recording.insert1({ - 'recording_id': 1, - 'metadata': {'channels': 32, 'sample_rate': 30000, 'duration': 60.0}, - 'waveform': np.random.randn(32, 30000) # 32 channels x 1 second -}) - -Recording() -``` - - - -```python -# Fetch blob data -data = (Recording & {'recording_id': 1}).fetch1() -print(f"Metadata: {data['metadata']}") -print(f"Waveform shape: {data['waveform'].shape}") -``` - - -### What Can Be Stored in Blobs? - -The `` codec handles: - -- NumPy arrays (any dtype, any shape) -- Python dicts, lists, tuples, sets -- Strings, bytes, integers, floats -- datetime objects and UUIDs -- Nested combinations of the above - -**Note:** Pandas DataFrames should be converted before storage (e.g., `df.to_dict()` or `df.to_records()`). 
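
For instance, a DataFrame can make the round trip through a blob attribute as a list of records. This is a minimal sketch; the DataFrame contents below are purely illustrative:

```python
import pandas as pd

# Illustrative DataFrame (not part of the tutorial data)
df = pd.DataFrame({'trial_idx': [1, 2, 3], 'reaction_time': [0.31, 0.42, 0.28]})

# Convert to a blob-friendly structure before inserting
records = df.to_dict(orient='records')   # list of dicts

# ...pass `records` as the value of a blob attribute in insert1()...

# After fetching the attribute back, reconstruct the DataFrame
df_restored = pd.DataFrame(records)
print(df_restored.equals(df))
```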
- - -```python -@schema -class AnalysisResult(dj.Manual): - definition = """ - result_id : uint16 - --- - arrays : - nested_data : - """ - -# Store complex data structures -arrays = {'x': np.array([1, 2, 3]), 'y': np.array([4, 5, 6])} -nested = {'arrays': [np.array([1, 2]), np.array([3, 4])], 'params': {'a': 1, 'b': 2}} - -AnalysisResult.insert1({ - 'result_id': 1, - 'arrays': arrays, - 'nested_data': nested -}) - -# Fetch back -result = (AnalysisResult & {'result_id': 1}).fetch1() -print(f"Arrays type: {type(result['arrays'])}") -print(f"Arrays keys: {result['arrays'].keys()}") -``` - - -## Object Storage with `@` - -For large datasets, add `@` to route data to object storage. The schema remains unified—only the physical storage location changes. - -### Configure Object Storage - -First, configure a store: - - -```python -import tempfile -import os - -# Create a store for this tutorial -store_path = tempfile.mkdtemp(prefix='dj_store_') - -# Configure a named store for this tutorial -dj.config.stores['tutorial'] = { - 'protocol': 'file', - 'location': store_path -} - -print(f"Store configured at: {store_path}") -``` - - -### Using Object Storage - - -```python -@schema -class LargeRecording(dj.Manual): - definition = """ - recording_id : uint16 - --- - small_data : # In database (small) - large_data : # In object storage (large) - """ -``` - - - -```python -# Insert data - usage is identical regardless of storage -small = np.random.randn(10, 10) -large = np.random.randn(1000, 1000) # ~8 MB array - -LargeRecording.insert1({ - 'recording_id': 1, - 'small_data': small, - 'large_data': large -}) - -LargeRecording() -``` - - - -```python -# Fetch is also identical - storage is transparent -data = (LargeRecording & {'recording_id': 1}).fetch1() -print(f"Small data shape: {data['small_data'].shape}") -print(f"Large data shape: {data['large_data'].shape}") -``` - - - -```python -# Objects are stored in the configured location -for root, dirs, files in os.walk(store_path): - for f in files: - path = os.path.join(root, f) - size = os.path.getsize(path) - print(f"{os.path.relpath(path, store_path)}: {size:,} bytes") -``` - - -### Hash-Addressed Storage - -`` uses hash-addressed storage. Data is identified by a Base32-encoded MD5 hash, enabling automatic deduplication—identical data is stored only once: - - -```python -# Insert the same data twice -shared_data = np.ones((500, 500)) - -LargeRecording.insert([ - {'recording_id': 2, 'small_data': small, 'large_data': shared_data}, - {'recording_id': 3, 'small_data': small, 'large_data': shared_data}, # Same! 
-]) - -print(f"Rows in table: {len(LargeRecording())}") - -# Deduplication: identical data stored once -files = [f for _, _, fs in os.walk(store_path) for f in fs] -print(f"Files in store: {len(files)}") -``` - - -## Schema-Addressed Storage with `` - -While `` uses hash-addressed storage with deduplication, `` uses **schema-addressed** storage where each row has its own dedicated storage path: - -| Aspect | `` | `` | -|--------|-----------|-------------| -| Addressing | By content hash | By primary key | -| Deduplication | Yes | No | -| Deletion | Garbage collected | With row | -| Use case | Arrays, serialized objects | Zarr, HDF5, multi-file outputs | - -Use `` when you need: -- Hierarchical formats like Zarr or HDF5 -- Direct write access during data generation -- Each row to have its own isolated storage location - - -```python -@schema -class ImagingSession(dj.Manual): - definition = """ - subject_id : int32 - session_id : int32 - --- - n_frames : int32 - frame_rate : float32 - frames : # Zarr array stored at path derived from PK - """ -``` - - -### Staged Insert for Direct Object Storage Writes - -For large datasets like multi-GB imaging recordings, copying data from local storage to object storage is inefficient. The `staged_insert1` context manager lets you **write directly to object storage** before finalizing the database insert: - -1. Set primary key values in `staged.rec` -2. Get a storage handle with `staged.store(field, extension)` -3. Write data directly (e.g., with Zarr) -4. On successful exit, metadata is computed and the record is inserted - - -```python -import zarr - -# Simulate acquiring imaging data frame-by-frame -n_frames = 100 -height, width = 512, 512 - -with ImagingSession.staged_insert1 as staged: - # Set primary key values first - staged.rec['subject_id'] = 1 - staged.rec['session_id'] = 1 - - # Get storage handle for the object field - store = staged.store('frames', '.zarr') - - # Create Zarr array directly in object storage - z = zarr.open(store, mode='w', shape=(n_frames, height, width), - chunks=(10, height, width), dtype='uint16') - - # Write frames as they are "acquired" - for i in range(n_frames): - frame = np.random.randint(0, 4096, (height, width), dtype='uint16') - z[i] = frame - - # Set remaining attributes - staged.rec['n_frames'] = n_frames - staged.rec['frame_rate'] = 30.0 - -# Record is now inserted with metadata computed from the Zarr -ImagingSession() -``` - - - -```python -# Fetch returns an ObjectRef for lazy access -ref = (ImagingSession & {'subject_id': 1, 'session_id': 1}).fetch1('frames') -print(f"Type: {type(ref).__name__}") -print(f"Path: {ref.path}") - -# Open as Zarr array (data stays in object storage) -z = zarr.open(ref.fsmap, mode='r') -print(f"Shape: {z.shape}") -print(f"Chunks: {z.chunks}") -print(f"First frame mean: {z[0].mean():.1f}") -``` - - -### Benefits of Staged Insert - -- **No intermediate copies** — Data flows directly to object storage -- **Streaming writes** — Write frame-by-frame as data is acquired -- **Atomic transactions** — If an error occurs, storage is cleaned up automatically -- **Automatic metadata** — File sizes and manifests are computed on finalize - -Use `staged_insert1` when: -- Data is too large to hold in memory -- You're generating data incrementally (e.g., during acquisition) -- You need direct control over storage format (Zarr chunks, HDF5 datasets) - -## Attachments - -Use `` to store files with their original names preserved: - - -```python -@schema -class Document(dj.Manual): - definition = """ - 
doc_id : uint16 - --- - report : - """ -``` - - - -```python -# Create a sample file -sample_file = os.path.join(tempfile.gettempdir(), 'analysis_report.txt') -with open(sample_file, 'w') as f: - f.write('Analysis Results\n') - f.write('================\n') - f.write('Accuracy: 95.2%\n') - -# Insert using file path directly -Document.insert1({ - 'doc_id': 1, - 'report': sample_file # Just pass the path -}) - -Document() -``` - - - -```python -# Fetch returns path to extracted file -doc_path = (Document & {'doc_id': 1}).fetch1('report') -print(f"Type: {type(doc_path)}") -print(f"Path: {doc_path}") - -# Read the content -with open(doc_path, 'r') as f: - print(f"Content:\n{f.read()}") -``` - - -## Codec Summary - -| Codec | Syntax | Description | -|-------|--------|-------------| -| `` | In database | Python objects, arrays | -| `` | Default store | Large objects, hash-addressed | -| `` | Named store | Specific storage tier | -| `` | In database | Files with names | -| `` | Named store | Large files with names | -| `` | Named store | Path-addressed (Zarr, etc.) | -| `` | Named store | References to existing files | - -## Computed Tables with Large Data - -Computed tables commonly produce large results: - - -```python -@schema -class ProcessedRecording(dj.Computed): - definition = """ - -> LargeRecording - --- - filtered : # Result in object storage - mean_value : float64 - """ - - def make(self, key): - # Fetch source data - data = (LargeRecording & key).fetch1('large_data') - - # Process - from scipy.ndimage import gaussian_filter - filtered = gaussian_filter(data, sigma=2) - - self.insert1({ - **key, - 'filtered': filtered, - 'mean_value': float(np.mean(filtered)) - }) -``` - - - -```python -ProcessedRecording.populate(display_progress=True) -ProcessedRecording() -``` - - -## Efficient Data Access - -### Fetch Only What You Need - - -```python -# Fetch only scalar metadata (fast) -meta = (ProcessedRecording & {'recording_id': 1}).fetch1('mean_value') -print(f"Mean value: {meta}") -``` - - - -```python -# Fetch large data only when needed -filtered = (ProcessedRecording & {'recording_id': 1}).fetch1('filtered') -print(f"Filtered shape: {filtered.shape}") -``` - - -### Project Away Large Columns Before Joins - - -```python -# Efficient: project to scalar columns before join -result = LargeRecording.proj('recording_id') * ProcessedRecording.proj('mean_value') -result -``` - - -## Best Practices - -### 1. Choose Storage Based on Size - -```python -# Small objects (< 1 MB): no @ -parameters : - -# Large objects (> 1 MB): use @ -raw_data : -``` - -### 2. Use Named Stores for Different Tiers - -```python -# Fast local storage for active data -working_data : - -# Cold storage for archives -archived_data : -``` - -### 3. Separate Queryable Metadata from Large Data - -```python -@schema -class Experiment(dj.Manual): - definition = """ - exp_id : uint16 - --- - # Queryable metadata - date : date - duration : float32 - n_trials : uint16 - # Large data - raw_data : - """ -``` - -### 4. Use Attachments for Files - -```python -# Preserves filename -video : -config_file : -``` - -## Garbage Collection - -Hash-addressed storage (``, ``, ``) uses deduplication—identical content is stored once. This means deleting a row doesn't automatically delete the stored content, since other rows might reference it. 
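
For example, deleting rows from the `LargeRecording` table above leaves their hash-addressed objects in the store until garbage collection runs. The following optional check is illustrative and reuses the `store_path` configured earlier in this tutorial:

```python
# Optional: delete two rows that reference the same stored array
(LargeRecording & 'recording_id > 1').delete(prompt=False)

# The deduplicated object remains in the store until garbage collection runs
remaining_files = [f for _, _, fs in os.walk(store_path) for f in fs]
print(f"Rows left: {len(LargeRecording())}, files still in store: {len(remaining_files)}")
```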
- -Use garbage collection to clean up orphaned content: - -```python -import datajoint as dj - -# Preview what would be deleted (dry run) -stats = dj.gc.collect(dry_run=True) -print(f"Orphaned items: {stats['orphaned']}") -print(f"Space to reclaim: {stats['orphaned_bytes'] / 1e6:.1f} MB") - -# Actually delete orphaned content -stats = dj.gc.collect() -print(f"Deleted: {stats['deleted']} items") -``` - -### When to Run Garbage Collection - -- **After bulk deletions** — Clean up storage after removing many rows -- **Periodically** — Schedule weekly/monthly cleanup jobs -- **Before archiving** — Reclaim space before backups - -### Key Points - -- GC only affects hash-addressed types (``, ``, ``) -- Schema-addressed types (``, ``) are deleted with their rows -- Always use `dry_run=True` first to preview changes -- GC is safe—it only deletes content with zero references - -See [Clean Up Storage](../how-to/garbage-collection.md) for detailed usage. - -## Quick Reference - -| Pattern | Use Case | -|---------|----------| -| `` | Small Python objects | -| `` | Large arrays with deduplication | -| `` | Large arrays in specific store | -| `` | Files preserving names | -| `` | Schema-addressed data (Zarr, HDF5) | - -## Next Steps - -- [Configure Object Storage](../how-to/configure-storage.md) — Set up S3, MinIO, or filesystem stores -- [Clean Up Storage](../how-to/garbage-collection.md) — Garbage collection for hash-addressed storage -- [Custom Codecs](advanced/custom-codecs.ipynb) — Define domain-specific types -- [Manage Large Data](../how-to/manage-large-data.md) — Performance optimization - - -```python -# Cleanup -schema.drop(prompt=False) -import shutil -shutil.rmtree(store_path, ignore_errors=True) -``` - - ---- -## File: tutorials/domain/allen-ccf/allen-ccf.ipynb - -# Allen Common Coordinate Framework (CCF) - -This tutorial demonstrates how to model the Allen Mouse Brain Common Coordinate Framework in DataJoint. You'll learn to: - -- Model **hierarchical structures** (brain region ontology) -- Use **Part tables** for large-scale voxel data -- Handle **self-referential relationships** (parent regions) -- **Batch insert** large datasets efficiently - -## The Allen CCF - -The CCF is a 3D reference atlas of the mouse brain, providing: -- Coordinate system with voxel resolution (10, 25, 50, or 100 µm) -- Hierarchical ontology of ~1300 brain regions -- Region boundaries for each voxel - -**Reference:** -> Wang Q, Ding SL, Li Y, et al. (2020). The Allen Mouse Brain Common Coordinate Framework: A 3D Reference Atlas. *Cell*, 181(4), 936-953.e20. DOI: [10.1016/j.cell.2020.04.007](https://doi.org/10.1016/j.cell.2020.04.007) - -## Data Sources - -- **Ontology**: [structure_graph.csv](http://api.brain-map.org/api/v2/data/query.csv?criteria=model::Structure,rma::criteria,[ontology_id$eq1],rma::options[order$eq%27structures.graph_order%27][num_rows$eqall]) -- **Volume**: [Allen Institute Archive](http://download.alleninstitute.org/informatics-archive/current-release/mouse_ccf/annotation/) - -> **Note**: This tutorial works with the ontology (small CSV). The full 3D volume requires ~100MB+ download. - -## Setup - - -```python -import datajoint as dj -import numpy as np -import pandas as pd -from pathlib import Path -import urllib.request - -schema = dj.Schema('tutorial_allen_ccf') - -DATA_DIR = Path('./data') -DATA_DIR.mkdir(exist_ok=True) -``` - - -## Download Brain Region Ontology - -The ontology defines the hierarchical structure of brain regions. 
- - -```python -ONTOLOGY_URL = ( - "http://api.brain-map.org/api/v2/data/query.csv?" - "criteria=model::Structure,rma::criteria,[ontology_id$eq1]," - "rma::options[order$eq%27structures.graph_order%27][num_rows$eqall]" -) -ONTOLOGY_FILE = DATA_DIR / 'allen_structure_graph.csv' - -if not ONTOLOGY_FILE.exists(): - print("Downloading Allen brain structure ontology...") - urllib.request.urlretrieve(ONTOLOGY_URL, ONTOLOGY_FILE) - print(f"Downloaded to {ONTOLOGY_FILE}") -else: - print(f"Using cached {ONTOLOGY_FILE}") - -ontology = pd.read_csv(ONTOLOGY_FILE) -print(f"Loaded {len(ontology)} brain regions") -ontology.head() -``` - - -## Schema Design - -### CCF Master Table - -The master table stores atlas metadata. Multiple CCF versions (different resolutions) can coexist. - - -```python -@schema -class CCF(dj.Manual): - definition = """ - # Common Coordinate Framework atlas - ccf_id : int32 - --- - ccf_version : varchar(64) # e.g., 'CCFv3' - ccf_resolution : float32 # voxel resolution in microns - ccf_description : varchar(255) - """ -``` - - -### Brain Region Table - -Each brain region has an ID, name, acronym, and color code for visualization. - - -```python -@schema -class BrainRegion(dj.Imported): - definition = """ - # Brain region from Allen ontology - -> CCF - region_id : int32 # Allen structure ID - --- - acronym : varchar(32) # short name (e.g., 'VISp') - region_name : varchar(255) # full name - color_hex : varchar(6) # hex color code for visualization - structure_order : int32 # order in hierarchy - """ - - def make(self, key): - # Load ontology and insert all regions for this CCF - ontology = pd.read_csv(ONTOLOGY_FILE) - - entries = [ - { - **key, - 'region_id': row['id'], - 'acronym': row['acronym'], - 'region_name': row['safe_name'], - 'color_hex': row['color_hex_triplet'], - 'structure_order': row['graph_order'], - } - for _, row in ontology.iterrows() - ] - - self.insert(entries) - print(f"Inserted {len(entries)} brain regions") -``` - - -### Hierarchical Parent-Child Relationships - -Brain regions form a hierarchy (e.g., Visual Cortex → Primary Visual Area → Layer 1). We model this with a self-referential foreign key. - - -```python -@schema -class RegionParent(dj.Imported): - definition = """ - # Hierarchical parent-child relationships - -> BrainRegion - --- - -> BrainRegion.proj(parent_id='region_id') # parent region - depth : int16 # depth in hierarchy (root=0) - """ - - def make(self, key): - ontology = pd.read_csv(ONTOLOGY_FILE) - - # Build parent mapping - parent_map = dict(zip(ontology['id'], ontology['parent_structure_id'])) - - entries = [] - for _, row in ontology.iterrows(): - parent_id = row['parent_structure_id'] - # Skip root (no parent) or if parent not in ontology - if pd.isna(parent_id): - parent_id = row['id'] # root points to itself - - entries.append({ - **key, - 'region_id': row['id'], - 'parent_id': int(parent_id), - 'depth': row['depth'], - }) - - self.insert(entries) - print(f"Inserted {len(entries)} parent relationships") -``` - - -### Voxel Data (Optional) - -For the full atlas, each voxel maps to a brain region. This is a large table (~10M+ rows for 10µm resolution). - -**Design note:** `CCF` is part of the primary key because a voxel's identity depends on which atlas it belongs to. The coordinate `(x=5000, y=3000, z=4000)` exists in every atlas version (10µm, 25µm, etc.) but represents different physical mappings. Without `ccf_id` in the primary key, you couldn't store voxels from multiple atlas resolutions. 
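
A rough sketch of the ingestion logic that such a voxel table would need (omitted below due to data size) might look like the following. It assumes the `pynrrd` package and a locally downloaded annotation file; the file name and the plane-by-plane batching are illustrative, not part of this tutorial:

```python
# Rough sketch only (not run here): how a make() for the voxel table could
# ingest the Allen annotation volume.
import nrrd  # pip install pynrrd

def make(self, key):
    resolution = (CCF & key).fetch1('ccf_resolution')
    volume, _ = nrrd.read('annotation_25.nrrd')  # 3D array of region IDs
    for i, plane in enumerate(volume):           # batch-insert one plane at a time
        entries = [
            {
                **key,
                'x': int(i * resolution),
                'y': int(j * resolution),
                'z': int(k * resolution),
                'region_id': int(region_id),
            }
            for (j, k), region_id in np.ndenumerate(plane)
            if region_id != 0                    # skip voxels outside the brain
        ]
        if entries:
            self.insert(entries)
```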
- -> **Note**: We define the schema but don't populate it in this tutorial due to data size. - - -```python -@schema -class Voxel(dj.Imported): - definition = """ - # Brain atlas voxels - -> CCF - x : int32 # AP axis (µm) - y : int32 # DV axis (µm) - z : int32 # ML axis (µm) - --- - -> BrainRegion - index(y, z) # for efficient coronal slice queries - """ - - # Note: make() would load NRRD file and insert voxels - # Skipped in this tutorial due to data size -``` - - -## View Schema - - -```python -dj.Diagram(schema) -``` - - -## Populate the Database - - -```python -# Insert CCF metadata -CCF.insert1( - { - 'ccf_id': 1, - 'ccf_version': 'CCFv3', - 'ccf_resolution': 25.0, - 'ccf_description': 'Allen Mouse CCF v3 (25µm resolution)' - }, - skip_duplicates=True -) - -CCF() -``` - - - -```python -# Populate brain regions -BrainRegion.populate(display_progress=True) -``` - - - -```python -# View sample regions -BrainRegion() & 'region_id < 100' -``` - - - -```python -# Populate parent relationships -RegionParent.populate(display_progress=True) -``` - - -## Querying the Hierarchy - -### Find a Region by Acronym - - -```python -# Find primary visual cortex -BrainRegion & 'acronym = "VISp"' -``` - - -### Find Children of a Region - - -```python -# Get VISp region ID -visp = (BrainRegion & 'acronym = "VISp"').fetch1() -visp_id = visp['region_id'] - -# Find all children (direct descendants) -children = BrainRegion * (RegionParent & f'parent_id = {visp_id}' & f'region_id != {visp_id}') -children.proj('acronym', 'region_name') -``` - - -### Find Parent Path (Ancestors) - - -```python -def get_ancestors(region_acronym, ccf_id=1): - """Get the path from a region to the root.""" - region = (BrainRegion & f'acronym = "{region_acronym}"' & f'ccf_id = {ccf_id}').fetch1() - region_id = region['region_id'] - - path = [region['acronym']] - - while True: - parent_id = (RegionParent & {'ccf_id': ccf_id, 'region_id': region_id}).fetch1('parent_id') - if parent_id == region_id: # reached root - break - parent = (BrainRegion & {'ccf_id': ccf_id, 'region_id': parent_id}).fetch1() - path.append(parent['acronym']) - region_id = parent_id - - return ' → '.join(reversed(path)) - -# Show path from VISp layer 1 to root -print("Path from VISp1 to root:") -print(get_ancestors('VISp1')) -``` - - -### Count Regions by Depth - - -```python -# Aggregate regions by depth in hierarchy -import matplotlib.pyplot as plt - -depths = (RegionParent & 'ccf_id = 1').to_arrays('depth') -unique, counts = np.unique(depths, return_counts=True) - -plt.figure(figsize=(10, 4)) -plt.bar(unique, counts) -plt.xlabel('Depth in Hierarchy') -plt.ylabel('Number of Regions') -plt.title('Brain Regions by Hierarchy Depth') -plt.xticks(unique) -``` - - -### Search Regions by Name - - -```python -# Find all visual-related regions -(BrainRegion & 'region_name LIKE "%Visual%"').proj('acronym', 'region_name') -``` - - -## Extending the Schema - -### Recording Locations - -A common use case is tracking where electrodes were placed during recordings. - -**Design choice:** Here we use `recording_id` alone as the primary key, with `BrainRegion` (which includes `ccf_id`) as a dependent attribute. This means each recording has exactly one canonical atlas registration. - -An alternative design would include `ccf_id` in the primary key: -```python -recording_id : int32 --> CCF ---- -... -``` -This would allow the same recording to be registered to multiple atlas versions (e.g., comparing assignments in CCFv2 vs CCFv3). 
Choose based on your use case: - -| Design | Primary Key | Use Case | -|--------|-------------|----------| -| Single registration | `recording_id` | One canonical atlas per lab | -| Multi-atlas | `(recording_id, ccf_id)` | Compare across atlas versions | - - -```python -@schema -class RecordingSite(dj.Manual): - definition = """ - # Recording electrode location - recording_id : int32 - --- - ap : float32 # anterior-posterior (µm from bregma) - dv : float32 # dorsal-ventral (µm from brain surface) - ml : float32 # medial-lateral (µm from midline) - -> BrainRegion # assigned brain region (includes ccf_id) - """ - -# Insert example recording sites -RecordingSite.insert([ - {'recording_id': 1, 'ccf_id': 1, 'ap': -3500, 'dv': 500, 'ml': 2500, - 'region_id': (BrainRegion & 'acronym="VISp"').fetch1('region_id')}, - {'recording_id': 2, 'ccf_id': 1, 'ap': -1800, 'dv': 1200, 'ml': 1500, - 'region_id': (BrainRegion & 'acronym="CA1"').fetch1('region_id')}, -], skip_duplicates=True) - -# View recordings with region info -RecordingSite * BrainRegion.proj('acronym', 'region_name') -``` - - -## Summary - -This tutorial demonstrated DataJoint patterns for atlas and ontology data: - -| Pattern | Example | Purpose | -|---------|---------|--------| -| **Hierarchical data** | `BrainRegion`, `RegionParent` | Model tree structures | -| **Self-referential FK** | `parent_id → region_id` | Parent-child relationships | -| **Batch insert** | `self.insert(entries)` | Efficient bulk loading | -| **Secondary index** | `index(y, z)` | Optimize spatial queries | -| **Linked tables** | `RecordingSite → BrainRegion` | Reference atlas in experiments | - -### Loading Full Atlas Data - -To load the complete 3D volume: - -1. Download NRRD file from [Allen Institute](http://download.alleninstitute.org/informatics-archive/current-release/mouse_ccf/annotation/ccf_2017/) -2. Install `pynrrd`: `pip install pynrrd` -3. Load and insert voxels (see [Element Electrode Localization](https://github.com/datajoint/element-electrode-localization)) - - -```python -# Final schema diagram -dj.Diagram(schema) -``` - - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/domain/calcium-imaging/calcium-imaging.ipynb - -# Calcium Imaging Pipeline - -This tutorial builds a complete calcium imaging analysis pipeline using DataJoint. You'll learn to: - -- **Import** raw imaging data from TIFF files -- **Segment** cells using parameterized detection -- **Extract** fluorescence traces from detected ROIs -- Use **Lookup tables** for analysis parameters -- Use **Part tables** for one-to-many results - -## The Pipeline - -Calcium Imaging Pipeline - -**Legend:** Green = Manual, Gray = Lookup, Blue = Imported, Red = Computed, White = Part - -Each scan produces a TIFF movie. We compute an average frame, segment cells using threshold-based detection, and extract fluorescence traces for each detected ROI. - -## Setup - - -```python -import datajoint as dj -import numpy as np -import matplotlib.pyplot as plt -from pathlib import Path -from skimage import io -from scipy import ndimage - -schema = dj.Schema('tutorial_calcium_imaging') - -# Data directory (relative to this notebook) -DATA_DIR = Path('./data') -``` - - -## Manual Tables: Experiment Metadata - -We start with tables for subjects, sessions, and scan metadata. These are **Manual tables** - data entered by experimenters or recording systems. 
- - -```python -@schema -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - dob : date - sex : enum('M', 'F', 'unknown') - """ - - -@schema -class Session(dj.Manual): - definition = """ - -> Mouse - session_date : date - --- - experimenter : varchar(100) - """ - - -@schema -class Scan(dj.Manual): - definition = """ - -> Session - scan_idx : int16 - --- - depth : float32 # imaging depth (um) - wavelength : float32 # laser wavelength (nm) - laser_power : float32 # laser power (mW) - fps : float32 # frames per second - file_name : varchar(128) # TIFF filename - """ -``` - - -### Insert Sample Data - - -```python -# Insert mouse -Mouse.insert1( - {'mouse_id': 0, 'dob': '2017-03-01', 'sex': 'M'}, - skip_duplicates=True -) - -# Insert session -Session.insert1( - {'mouse_id': 0, 'session_date': '2017-05-15', 'experimenter': 'Alice'}, - skip_duplicates=True -) - -# Insert scans (we have 3 TIFF files) -Scan.insert([ - {'mouse_id': 0, 'session_date': '2017-05-15', 'scan_idx': 1, - 'depth': 150, 'wavelength': 920, 'laser_power': 26, 'fps': 15, - 'file_name': 'example_scan_01.tif'}, - {'mouse_id': 0, 'session_date': '2017-05-15', 'scan_idx': 2, - 'depth': 200, 'wavelength': 920, 'laser_power': 24, 'fps': 15, - 'file_name': 'example_scan_02.tif'}, - {'mouse_id': 0, 'session_date': '2017-05-15', 'scan_idx': 3, - 'depth': 250, 'wavelength': 920, 'laser_power': 25, 'fps': 15, - 'file_name': 'example_scan_03.tif'}, -], skip_duplicates=True) - -Scan() -``` - - -## Imported Table: Average Frame - -An **Imported table** pulls data from external files. Here we load each TIFF movie and compute the average frame across all time points. - -The `make()` method defines how to compute one entry. DataJoint's `populate()` calls it for each pending entry. - - -```python -@schema -class AverageFrame(dj.Imported): - definition = """ - -> Scan - --- - average_frame : # mean fluorescence across frames - """ - - def make(self, key): - # Get filename from Scan table - file_name = (Scan & key).fetch1('file_name') - file_path = DATA_DIR / file_name - - # Load TIFF and compute average - movie = io.imread(file_path) - avg_frame = movie.mean(axis=0) - - # Insert result - self.insert1({**key, 'average_frame': avg_frame}) - print(f"Processed {file_name}: {movie.shape[0]} frames") -``` - - - -```python -# Populate computes all pending entries -AverageFrame.populate(display_progress=True) -``` - - - -```python -# Visualize average frames -fig, axes = plt.subplots(1, 3, figsize=(12, 4)) -for ax, key in zip(axes, AverageFrame.keys()): - avg = (AverageFrame & key).fetch1('average_frame') - ax.imshow(avg, cmap='gray') - ax.set_title(f"Scan {key['scan_idx']}") - ax.axis('off') -plt.tight_layout() -``` - - -## Lookup Table: Segmentation Parameters - -A **Lookup table** stores parameter sets that don't change often. This lets us run the same analysis with different parameters and compare results. 
- -Our cell segmentation has two parameters: -- `threshold`: intensity threshold for detecting bright regions -- `min_size`: minimum blob size (filters out noise) - - -```python -@schema -class SegmentationParam(dj.Lookup): - definition = """ - seg_param_id : int16 - --- - threshold : float32 # intensity threshold - min_size : int32 # minimum blob size (pixels) - """ - - # Pre-populate with parameter sets to try - contents = [ - {'seg_param_id': 1, 'threshold': 50.0, 'min_size': 50}, - {'seg_param_id': 2, 'threshold': 60.0, 'min_size': 50}, - ] - -SegmentationParam() -``` - - -## Computed Table with Part Table: Segmentation - -A **Computed table** derives data from other DataJoint tables. Here, `Segmentation` depends on both `AverageFrame` and `SegmentationParam` - DataJoint will compute all combinations. - -Since each segmentation produces multiple ROIs, we use a **Part table** (`Roi`) to store the individual masks. The master table stores summary info; part tables store detailed results. - - -```python -@schema -class Segmentation(dj.Computed): - definition = """ - -> AverageFrame - -> SegmentationParam - --- - num_rois : int16 # number of detected ROIs - segmented_mask : # labeled mask image - """ - - class Roi(dj.Part): - definition = """ - -> master - roi_idx : int16 - --- - mask : # binary mask for this ROI - center_x : float32 # ROI center x coordinate - center_y : float32 # ROI center y coordinate - """ - - def make(self, key): - # Fetch inputs - avg_frame = (AverageFrame & key).fetch1('average_frame') - threshold, min_size = (SegmentationParam & key).fetch1('threshold', 'min_size') - - # Threshold to get binary mask - binary_mask = avg_frame > threshold - - # Label connected components - labeled, num_labels = ndimage.label(binary_mask) - - # Filter by size and extract ROIs - roi_masks = [] - for i in range(1, num_labels + 1): # 0 is background - roi_mask = (labeled == i) - if roi_mask.sum() >= min_size: - roi_masks.append(roi_mask) - - # Re-label the filtered mask - final_mask = np.zeros_like(labeled) - for i, mask in enumerate(roi_masks, 1): - final_mask[mask] = i - - # Insert master entry - self.insert1({ - **key, - 'num_rois': len(roi_masks), - 'segmented_mask': final_mask - }) - - # Insert part entries (one per ROI) - for roi_idx, mask in enumerate(roi_masks): - # Compute center of mass - cy, cx = ndimage.center_of_mass(mask) - self.Roi.insert1({ - **key, - 'roi_idx': roi_idx, - 'mask': mask, - 'center_x': cx, - 'center_y': cy - }) - - print(f"Scan {key['scan_idx']}, params {key['seg_param_id']}: {len(roi_masks)} ROIs") -``` - - - -```python -# Populate all AverageFrame x SegmentationParam combinations -Segmentation.populate(display_progress=True) -``` - - - -```python -# View results summary -Segmentation() -``` - - - -```python -# View individual ROIs -Segmentation.Roi() -``` - - -### Visualize Segmentation Results - - -```python -# Compare segmentation with different parameters for scan 1 -fig, axes = plt.subplots(1, 3, figsize=(12, 4)) - -key = {'mouse_id': 0, 'session_date': '2017-05-15', 'scan_idx': 1} -avg_frame = (AverageFrame & key).fetch1('average_frame') - -axes[0].imshow(avg_frame, cmap='gray') -axes[0].set_title('Average Frame') -axes[0].axis('off') - -for ax, param_id in zip(axes[1:], [1, 2]): - seg_key = {**key, 'seg_param_id': param_id} - mask, num_rois = (Segmentation & seg_key).fetch1('segmented_mask', 'num_rois') - threshold = (SegmentationParam & {'seg_param_id': param_id}).fetch1('threshold') - - ax.imshow(avg_frame, cmap='gray') - ax.imshow(mask, 
cmap='tab10', alpha=0.5 * (mask > 0)) - ax.set_title(f'Threshold={threshold}: {num_rois} ROIs') - ax.axis('off') - -plt.tight_layout() -``` - - -## Fluorescence Trace Extraction - -Now we extract the fluorescence time series for each ROI. This requires going back to the raw TIFF movie, so we use an **Imported table**. - -The master table (`Fluorescence`) stores shared time axis; the part table (`Trace`) stores each ROI's trace. - - -```python -@schema -class Fluorescence(dj.Imported): - definition = """ - -> Segmentation - --- - timestamps : # time for each frame (seconds) - """ - - class Trace(dj.Part): - definition = """ - -> master - -> Segmentation.Roi - --- - trace : # fluorescence trace (mean within ROI mask) - """ - - def make(self, key): - # Get scan info and load movie - file_name, fps = (Scan & key).fetch1('file_name', 'fps') - movie = io.imread(DATA_DIR / file_name) - n_frames = movie.shape[0] - - # Create time axis - timestamps = np.arange(n_frames) / fps - - # Insert master entry - self.insert1({**key, 'timestamps': timestamps}) - - # Extract trace for each ROI - for roi_key in (Segmentation.Roi & key).keys(): - mask = (Segmentation.Roi & roi_key).fetch1('mask') - - # Compute mean fluorescence within mask for each frame - trace = np.array([frame[mask].mean() for frame in movie]) - - self.Trace.insert1({**roi_key, 'trace': trace}) - - n_rois = len(Segmentation.Roi & key) - print(f"Extracted {n_rois} traces from {file_name}") -``` - - - -```python -Fluorescence.populate(display_progress=True) -``` - - -### Visualize Fluorescence Traces - - -```python -# Plot traces for one segmentation result -key = {'mouse_id': 0, 'session_date': '2017-05-15', 'scan_idx': 1, 'seg_param_id': 2} - -timestamps = (Fluorescence & key).fetch1('timestamps') -traces = (Fluorescence.Trace & key).to_arrays('trace') - -plt.figure(figsize=(12, 4)) -for i, trace in enumerate(traces): - plt.plot(timestamps, trace + i * 20, label=f'ROI {i}') # offset for visibility - -plt.xlabel('Time (s)') -plt.ylabel('Fluorescence (offset)') -plt.title('Fluorescence Traces') -plt.legend() -plt.tight_layout() -``` - - -## Pipeline Diagram - - -```python -dj.Diagram(schema) -``` - - -**Legend:** -- **Green rectangles**: Manual tables (user-entered data) -- **Gray rectangles**: Lookup tables (parameters) -- **Blue ovals**: Imported tables (data from files) -- **Red ovals**: Computed tables (derived from other tables) -- **Plain text**: Part tables (detailed results) - -## Summary - -This pipeline demonstrates key DataJoint patterns for imaging analysis: - -| Concept | Example | Purpose | -|---------|---------|--------| -| **Manual tables** | `Mouse`, `Session`, `Scan` | Store experiment metadata | -| **Imported tables** | `AverageFrame`, `Fluorescence` | Load data from external files | -| **Computed tables** | `Segmentation` | Derive data from other tables | -| **Lookup tables** | `SegmentationParam` | Store analysis parameters | -| **Part tables** | `Roi`, `Trace` | Store one-to-many results | -| **`populate()`** | Auto-compute missing entries | Automatic pipeline execution | - -### Key Benefits - -1. **Parameter tracking**: Different segmentation parameters stored alongside results -2. **Reproducibility**: Re-run `populate()` to recompute after changes -3. **Data integrity**: Foreign keys ensure consistent relationships -4. 
**Provenance**: Clear lineage from raw data to final traces - - -```python -# Cleanup: drop schema for re-running -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/domain/electrophysiology/electrophysiology.ipynb - -# Electrophysiology Pipeline - -This tutorial builds an electrophysiology analysis pipeline using DataJoint. You'll learn to: - -- **Import** neural recordings from data files -- **Compute** activity statistics -- **Detect spikes** using parameterized thresholds -- **Extract waveforms** using Part tables - -## The Pipeline - -Electrophysiology Pipeline - -**Legend:** Green = Manual, Gray = Lookup, Blue = Imported, Red = Computed, White = Part - -Each session records from neurons. We compute statistics, detect spikes with configurable thresholds, and extract spike waveforms. - -## Setup - - -```python -import datajoint as dj -import numpy as np -import matplotlib.pyplot as plt -from pathlib import Path - -schema = dj.Schema('tutorial_electrophysiology') - -# Data directory (relative to this notebook) -DATA_DIR = Path('./data') -``` - - -## Manual Tables: Experiment Metadata - - -```python -@schema -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - dob : date - sex : enum('M', 'F', 'unknown') - """ - - -@schema -class Session(dj.Manual): - definition = """ - -> Mouse - session_date : date - --- - experimenter : varchar(100) - """ -``` - - -### Insert Sample Data - -Our data files follow the naming convention `data_{mouse_id}_{session_date}.npy`. - - -```python -# Insert mice -Mouse.insert([ - {'mouse_id': 0, 'dob': '2017-03-01', 'sex': 'M'}, - {'mouse_id': 5, 'dob': '2016-12-25', 'sex': 'F'}, - {'mouse_id': 100, 'dob': '2017-05-12', 'sex': 'F'}, -], skip_duplicates=True) - -# Insert sessions (matching our data files) -Session.insert([ - {'mouse_id': 0, 'session_date': '2017-05-15', 'experimenter': 'Alice'}, - {'mouse_id': 0, 'session_date': '2017-05-19', 'experimenter': 'Alice'}, - {'mouse_id': 5, 'session_date': '2017-01-05', 'experimenter': 'Bob'}, - {'mouse_id': 100, 'session_date': '2017-05-25', 'experimenter': 'Carol'}, - {'mouse_id': 100, 'session_date': '2017-06-01', 'experimenter': 'Carol'}, -], skip_duplicates=True) - -Session() -``` - - -## Imported Table: Neuron Activity - -Each data file contains recordings from one or more neurons. We import each neuron's activity trace. 
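
As a quick optional check (illustrative, not required by the pipeline), you can confirm that a matching file exists for each session before defining the Imported table. It assumes `session_date` values render as `YYYY-MM-DD`, matching the file names:

```python
# Optional: verify that the expected .npy file exists for every session
for key in Session.keys():
    path = DATA_DIR / f"data_{key['mouse_id']}_{key['session_date']}.npy"
    print(path.name, '(found)' if path.exists() else '(missing)')
```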
- - -```python -@schema -class Neuron(dj.Imported): - definition = """ - -> Session - neuron_id : int16 - --- - activity : # neural activity trace - """ - - def make(self, key): - # Construct filename from key - filename = f"data_{key['mouse_id']}_{key['session_date']}.npy" - filepath = DATA_DIR / filename - - # Load data (shape: n_neurons x n_timepoints) - data = np.load(filepath) - - # Insert one row per neuron - for neuron_id, activity in enumerate(data): - self.insert1({ - **key, - 'neuron_id': neuron_id, - 'activity': activity - }) - - print(f"Imported {len(data)} neuron(s) from {filename}") -``` - - - -```python -Neuron.populate(display_progress=True) -``` - - - -```python -Neuron() -``` - - -### Visualize Neural Activity - - -```python -fig, axes = plt.subplots(2, 3, figsize=(12, 6)) - -for ax, key in zip(axes.ravel(), Neuron.keys()): - activity = (Neuron & key).fetch1('activity') - ax.plot(activity) - ax.set_title(f"Mouse {key['mouse_id']}, {key['session_date']}") - ax.set_xlabel('Time bin') - ax.set_ylabel('Activity') - -# Hide unused subplot -axes[1, 2].axis('off') -plt.tight_layout() -``` - - -## Computed Table: Activity Statistics - -For each neuron, compute basic statistics of the activity trace. - - -```python -@schema -class ActivityStats(dj.Computed): - definition = """ - -> Neuron - --- - mean_activity : float32 - std_activity : float32 - max_activity : float32 - """ - - def make(self, key): - activity = (Neuron & key).fetch1('activity') - - self.insert1({ - **key, - 'mean_activity': activity.mean(), - 'std_activity': activity.std(), - 'max_activity': activity.max() - }) -``` - - - -```python -ActivityStats.populate(display_progress=True) -ActivityStats() -``` - - -## Lookup Table: Spike Detection Parameters - -Spike detection depends on threshold choice. Using a Lookup table, we can run detection with multiple thresholds and compare results. - - -```python -@schema -class SpikeParams(dj.Lookup): - definition = """ - spike_param_id : int16 - --- - threshold : float32 # spike detection threshold - """ - - contents = [ - {'spike_param_id': 1, 'threshold': 0.5}, - {'spike_param_id': 2, 'threshold': 0.9}, - ] - -SpikeParams() -``` - - -## Computed Table with Part Table: Spike Detection - -Detect spikes by finding threshold crossings. 
Store: -- **Master table**: spike count and binary spike array -- **Part table**: waveform for each spike - - -```python -@schema -class Spikes(dj.Computed): - definition = """ - -> Neuron - -> SpikeParams - --- - spike_times : # indices of detected spikes - spike_count : int32 # total number of spikes - """ - - class Waveform(dj.Part): - definition = """ - -> master - spike_idx : int32 - --- - waveform : # activity around spike (±40 samples) - """ - - def make(self, key): - # Fetch inputs - activity = (Neuron & key).fetch1('activity') - threshold = (SpikeParams & key).fetch1('threshold') - - # Detect threshold crossings (rising edge) - above_threshold = (activity > threshold).astype(int) - rising_edge = np.diff(above_threshold) > 0 - spike_times = np.where(rising_edge)[0] + 1 # +1 to get crossing point - - # Insert master entry - self.insert1({ - **key, - 'spike_times': spike_times, - 'spike_count': len(spike_times) - }) - - # Extract and insert waveforms - window = 40 # samples before and after spike - for spike_idx, t in enumerate(spike_times): - # Skip spikes too close to edges - if t < window or t >= len(activity) - window: - continue - - waveform = activity[t - window : t + window] - self.Waveform.insert1({ - **key, - 'spike_idx': spike_idx, - 'waveform': waveform - }) -``` - - - -```python -Spikes.populate(display_progress=True) -``` - - - -```python -# View spike counts for each neuron × parameter combination -Spikes.proj('spike_count') -``` - - -### Compare Detection Thresholds - - -```python -# Pick one neuron to visualize -neuron_key = {'mouse_id': 0, 'session_date': '2017-05-15', 'neuron_id': 0} -activity = (Neuron & neuron_key).fetch1('activity') - -fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True) - -for ax, param_id in zip(axes, [1, 2]): - key = {**neuron_key, 'spike_param_id': param_id} - spike_times, spike_count = (Spikes & key).fetch1('spike_times', 'spike_count') - threshold = (SpikeParams & {'spike_param_id': param_id}).fetch1('threshold') - - ax.plot(activity, 'b-', alpha=0.7, label='Activity') - ax.axhline(threshold, color='r', linestyle='--', label=f'Threshold={threshold}') - ax.scatter(spike_times, activity[spike_times], color='red', s=20, zorder=5) - ax.set_title(f'Threshold={threshold}: {spike_count} spikes detected') - ax.set_ylabel('Activity') - ax.legend(loc='upper right') - -axes[1].set_xlabel('Time bin') -plt.tight_layout() -``` - - -### Average Waveform - - -```python -# Get waveforms for one neuron with threshold=0.5 -key = {'mouse_id': 100, 'session_date': '2017-05-25', 'neuron_id': 0, 'spike_param_id': 1} -waveforms = (Spikes.Waveform & key).to_arrays('waveform') - -if len(waveforms) > 0: - waveform_matrix = np.vstack(waveforms) - - plt.figure(figsize=(8, 4)) - # Plot individual waveforms (light) - for wf in waveform_matrix: - plt.plot(wf, 'b-', alpha=0.2) - # Plot mean waveform (bold) - plt.plot(waveform_matrix.mean(axis=0), 'r-', linewidth=2, label='Mean waveform') - plt.axvline(40, color='k', linestyle='--', alpha=0.5, label='Spike time') - plt.xlabel('Sample (relative to spike)') - plt.ylabel('Activity') - plt.title(f'Spike Waveforms (n={len(waveforms)})') - plt.legend() -else: - print("No waveforms found for this key") -``` - - -## Pipeline Diagram - - -```python -dj.Diagram(schema) -``` - - -## Querying Results - -DataJoint makes it easy to query across the pipeline. 
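
For instance, restrictions, joins, and aggregations all compose across these tables. The aggregation below is illustrative and uses the `.aggr()` operator covered in the basics tutorials:

```python
# Total detected spikes per session for threshold set 1
Session.aggr(Spikes & 'spike_param_id = 1', total_spikes='sum(spike_count)')
```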
- - -```python -# Find neurons with high spike counts (threshold=0.5) -(Spikes & 'spike_param_id = 1' & 'spike_count > 20').proj('spike_count') -``` - - - -```python -# Join with Mouse to see which mice have most spikes -(Mouse * Session * Spikes & 'spike_param_id = 1').proj('spike_count') -``` - - -## Summary - -This pipeline demonstrates key patterns for electrophysiology analysis: - -| Concept | Example | Purpose | -|---------|---------|--------| -| **Imported tables** | `Neuron` | Load data from files | -| **Computed tables** | `ActivityStats`, `Spikes` | Derive results | -| **Lookup tables** | `SpikeParams` | Parameterize analysis | -| **Part tables** | `Waveform` | Store variable-length results | -| **Multi-parent keys** | `Spikes` | Compute all Neuron × Param combinations | - -### Key Benefits - -1. **Parameter comparison**: Different thresholds stored alongside results -2. **Automatic tracking**: `populate()` knows what's computed vs. pending -3. **Cascading**: Delete a parameter set, all derived results cascade -4. **Provenance**: Trace any spike back to its source recording - - -```python -# Cleanup: drop schema for re-running -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/domain/electrophysiology/ephys-with-npy.ipynb - -# Electrophysiology Pipeline with Object Storage - -This tutorial builds an electrophysiology analysis pipeline using DataJoint with the `` codec for efficient array storage. You'll learn to: - -- **Configure** object storage for neural data -- **Import** neural recordings using `` (lazy loading) -- **Compute** activity statistics -- **Detect spikes** using parameterized thresholds -- **Extract waveforms** as stacked arrays -- **Use memory mapping** for efficient random access -- **Access files directly** without database queries - -## The Pipeline - -Electrophysiology Pipeline - -**Legend:** Green = Manual, Gray = Lookup, Blue = Imported, Red = Computed - -Each session records from neurons. We compute statistics, detect spikes with configurable thresholds, and extract spike waveforms. - -## Why `` Instead of ``? - -| Feature | `` | `` | -|---------|----------|-----------| -| **Lazy loading** | Yes - inspect shape/dtype without download | No - always downloads | -| **Memory mapping** | Yes - random access via `mmap_mode` | No | -| **Format** | Portable `.npy` files | DataJoint serialization | -| **Bulk fetch** | Safe - returns references | Downloads everything | -| **Direct access** | Yes - navigable file paths | No - hash-addressed | - -## Setup - -First, configure object storage for the `` codec. - - -```python -import datajoint as dj -import numpy as np -import matplotlib.pyplot as plt -from pathlib import Path -import tempfile - -# Create a temporary directory for object storage -STORE_PATH = tempfile.mkdtemp(prefix='dj_ephys_') - -# Configure object storage with partitioning by mouse_id, session_date, and neuron_id -dj.config.stores['ephys'] = { - 'protocol': 'file', - 'location': STORE_PATH, - 'partition_pattern': '{mouse_id}/{session_date}/{neuron_id}', # Partition by subject, session, and neuron -} - -schema = dj.Schema('tutorial_electrophysiology_npy') - -# Data directory (relative to this notebook) -DATA_DIR = Path('./data') - -print(f"Store configured at: {STORE_PATH}") -print("Partitioning: {mouse_id}/{session_date}/{neuron_id}") -``` - - -### Partitioning by Subject, Session, and Neuron - -We've configured the store with `partition_pattern: '{mouse_id}/{session_date}/{neuron_id}'`. 
This organizes storage by the complete experimental hierarchy—grouping all data for each individual neuron together at the top of the directory structure. - -**Without partitioning:** -``` -{store}/{schema}/{table}/{mouse_id=X}/{session_date=Y}/{neuron_id=Z}/file.npy -``` - -**With partitioning:** -``` -{store}/{mouse_id=X}/{session_date=Y}/{neuron_id=Z}/{schema}/{table}/file.npy -``` - -Partitioning moves the specified primary key attributes to the front of the path, making it easy to: -- **Browse by experimental hierarchy** - Navigate: subject → session → neuron -- **Selective sync** - Copy all data for one neuron: `rsync mouse_id=100/session_date=2017-05-25/neuron_id=0/ backup/` -- **Efficient queries** - Filesystem can quickly locate specific neurons -- **Publication-ready** - Export complete hierarchies to data repositories - -The remaining primary key attributes (like `spike_param_id` in the Spikes table) stay in their normal position after the schema/table path. - -## Manual Tables: Experiment Metadata - - -```python -@schema -class Mouse(dj.Manual): - definition = """ - mouse_id : int32 - --- - dob : date - sex : enum('M', 'F', 'unknown') - """ - - -@schema -class Session(dj.Manual): - definition = """ - -> Mouse - session_date : date - --- - experimenter : varchar(100) - """ -``` - - -### Insert Sample Data - -Our data files follow the naming convention `data_{mouse_id}_{session_date}.npy`. - - -```python -# Insert mice -Mouse.insert([ - {'mouse_id': 0, 'dob': '2017-03-01', 'sex': 'M'}, - {'mouse_id': 5, 'dob': '2016-12-25', 'sex': 'F'}, - {'mouse_id': 100, 'dob': '2017-05-12', 'sex': 'F'}, -], skip_duplicates=True) - -# Insert sessions (matching our data files) -Session.insert([ - {'mouse_id': 0, 'session_date': '2017-05-15', 'experimenter': 'Alice'}, - {'mouse_id': 0, 'session_date': '2017-05-19', 'experimenter': 'Alice'}, - {'mouse_id': 5, 'session_date': '2017-01-05', 'experimenter': 'Bob'}, - {'mouse_id': 100, 'session_date': '2017-05-25', 'experimenter': 'Carol'}, - {'mouse_id': 100, 'session_date': '2017-06-01', 'experimenter': 'Carol'}, -], skip_duplicates=True) - -Session() -``` - - -## Imported Table: Neuron Activity with `` - -Each data file contains recordings from one or more neurons. We import each neuron's activity trace using `` for schema-addressed object storage. - - -```python -@schema -class Neuron(dj.Imported): - definition = """ - -> Session - neuron_id : int16 - --- - activity : # neural activity trace (lazy loading) - """ - - def make(self, key): - # Construct filename from key - filename = f"data_{key['mouse_id']}_{key['session_date']}.npy" - filepath = DATA_DIR / filename - - # Load data (shape: n_neurons x n_timepoints) - data = np.load(filepath) - - # Insert one row per neuron - for neuron_id, activity in enumerate(data): - self.insert1({ - **key, - 'neuron_id': neuron_id, - 'activity': activity - }) - - print(f"Imported {len(data)} neuron(s) from {filename}") -``` - - - -```python -Neuron.populate(display_progress=True) -``` - - - -```python -Neuron() -``` - - -### Lazy Loading with NpyRef - -When fetching `` attributes, you get an `NpyRef` that provides metadata without downloading the array. 
- - -```python -# Fetch returns NpyRef, not the array -key = {'mouse_id': 0, 'session_date': '2017-05-15', 'neuron_id': 0} -ref = (Neuron & key).fetch1('activity') - -print(f"Type: {type(ref).__name__}") -print(f"Shape: {ref.shape} (no download!)") -print(f"Dtype: {ref.dtype}") -print(f"Is loaded: {ref.is_loaded}") -``` - - - -```python -# Explicitly load when ready -activity = ref.load() -print(f"Loaded: {activity.shape}, is_loaded: {ref.is_loaded}") -``` - - -### Memory Mapping for Large Arrays - -For very large arrays, use `mmap_mode` to access data without loading it all into memory. This is especially efficient for local filesystem stores. - - -```python -# Memory-mapped loading - efficient for large arrays -key = {'mouse_id': 0, 'session_date': '2017-05-15', 'neuron_id': 0} -ref = (Neuron & key).fetch1('activity') - -# Load as memory-mapped array (read-only) -mmap_arr = ref.load(mmap_mode='r') - -print(f"Type: {type(mmap_arr).__name__}") -print(f"Shape: {mmap_arr.shape}") - -# Random access only reads the needed portion from disk -slice_data = mmap_arr[100:200] -print(f"Slice [100:200]: {slice_data[:5]}...") -``` - - -### Visualize Neural Activity - -NpyRef works transparently with NumPy functions via `__array__`. - - -```python -fig, axes = plt.subplots(2, 3, figsize=(12, 6)) - -for ax, key in zip(axes.ravel(), Neuron.keys()): - # fetch1 returns NpyRef, but plotting works via __array__ - activity = (Neuron & key).fetch1('activity') - ax.plot(np.asarray(activity)) # Convert to array for plotting - ax.set_title(f"Mouse {key['mouse_id']}, {key['session_date']}") - ax.set_xlabel('Time bin') - ax.set_ylabel('Activity') - -# Hide unused subplot -axes[1, 2].axis('off') -plt.tight_layout() -``` - - -## Computed Table: Activity Statistics - -For each neuron, compute basic statistics of the activity trace. NumPy functions work directly with NpyRef. - - -```python -@schema -class ActivityStats(dj.Computed): - definition = """ - -> Neuron - --- - mean_activity : float32 - std_activity : float32 - max_activity : float32 - """ - - def make(self, key): - # fetch1 returns NpyRef, but np.mean/std/max work via __array__ - activity = (Neuron & key).fetch1('activity') - - self.insert1({ - **key, - 'mean_activity': np.mean(activity), # Auto-loads via __array__ - 'std_activity': np.std(activity), - 'max_activity': np.max(activity) - }) -``` - - - -```python -ActivityStats.populate(display_progress=True) -ActivityStats() -``` - - -## Lookup Table: Spike Detection Parameters - -Spike detection depends on threshold choice. Using a Lookup table, we can run detection with multiple thresholds and compare results. - - -```python -@schema -class SpikeParams(dj.Lookup): - definition = """ - spike_param_id : int16 - --- - threshold : float32 # spike detection threshold - """ - - contents = [ - {'spike_param_id': 1, 'threshold': 0.5}, - {'spike_param_id': 2, 'threshold': 0.9}, - ] - -SpikeParams() -``` - - -## Computed Table: Spike Detection - -Detect spikes by finding threshold crossings. Store spike times and all waveforms as `` arrays. 
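The rising-edge logic used in the `make()` method below is easy to verify on a toy trace first. The following stand-alone sketch uses made-up numbers purely for illustration and is independent of the pipeline tables.

```python
# Stand-alone sketch of rising-edge spike detection on a toy trace.
import numpy as np

toy_trace = np.array([0.0, 0.2, 0.8, 0.9, 0.3, 0.1, 0.7, 0.2])
threshold = 0.5

above = (toy_trace > threshold).astype(int)   # 1 where the trace exceeds threshold
rising = np.diff(above) > 0                   # True at upward crossings only
crossings = np.where(rising)[0] + 1           # +1 gives the first sample above threshold

print(crossings)  # -> [2 6]
```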
- - -```python -@schema -class Spikes(dj.Computed): - definition = """ - -> Neuron - -> SpikeParams - --- - spike_times : # indices of detected spikes - spike_count : uint32 # total number of spikes - waveforms : # all waveforms stacked (n_spikes x window_size) - """ - - def make(self, key): - # Fetch inputs - activity is NpyRef - activity_ref = (Neuron & key).fetch1('activity') - threshold = (SpikeParams & key).fetch1('threshold') - - # Load activity for processing - activity = activity_ref.load() - - # Detect threshold crossings (rising edge) - above_threshold = (activity > threshold).astype(int) - rising_edge = np.diff(above_threshold) > 0 - spike_times = np.where(rising_edge)[0] + 1 # +1 to get crossing point - - # Extract waveforms for all spikes - window = 40 # samples before and after spike - waveforms = [] - for t in spike_times: - # Skip spikes too close to edges - if t < window or t >= len(activity) - window: - continue - waveforms.append(activity[t - window : t + window]) - - # Stack into 2D array (n_spikes x window_size) - waveforms = np.vstack(waveforms) if waveforms else np.empty((0, 2 * window)) - - # Insert entry with all data - self.insert1({ - **key, - 'spike_times': spike_times, - 'spike_count': len(spike_times), - 'waveforms': waveforms, # All waveforms as single array - }) -``` - - - -```python -Spikes.populate(display_progress=True) -``` - - - -```python -# View spike counts for each neuron x parameter combination -Spikes.proj('spike_count') -``` - - -### Compare Detection Thresholds - - -```python -# Pick one neuron to visualize -neuron_key = {'mouse_id': 0, 'session_date': '2017-05-15', 'neuron_id': 0} -activity = (Neuron & neuron_key).fetch1('activity').load() # Explicit load - -fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True) - -for ax, param_id in zip(axes, [1, 2]): - key = {**neuron_key, 'spike_param_id': param_id} - spike_times_ref = (Spikes & key).fetch1('spike_times') - spike_times = spike_times_ref.load() # Load spike times - spike_count = (Spikes & key).fetch1('spike_count') - threshold = (SpikeParams & {'spike_param_id': param_id}).fetch1('threshold') - - ax.plot(activity, 'b-', alpha=0.7, label='Activity') - ax.axhline(threshold, color='r', linestyle='--', label=f'Threshold={threshold}') - ax.scatter(spike_times, activity[spike_times], color='red', s=20, zorder=5) - ax.set_title(f'Threshold={threshold}: {spike_count} spikes detected') - ax.set_ylabel('Activity') - ax.legend(loc='upper right') - -axes[1].set_xlabel('Time bin') -plt.tight_layout() -``` - - -### Average Waveform - - -```python -# Get waveforms for one neuron with threshold=0.5 -key = {'mouse_id': 100, 'session_date': '2017-05-25', 'neuron_id': 0, 'spike_param_id': 1} - -# Fetch waveforms NpyRef directly from Spikes table -waveforms_ref = (Spikes & key).fetch1('waveforms') - -# Load the waveform matrix -waveform_matrix = waveforms_ref.load() - -if len(waveform_matrix) > 0: - plt.figure(figsize=(8, 4)) - # Plot individual waveforms (light) - for wf in waveform_matrix: - plt.plot(wf, 'b-', alpha=0.2) - # Plot mean waveform (bold) - plt.plot(waveform_matrix.mean(axis=0), 'r-', linewidth=2, label='Mean waveform') - plt.axvline(40, color='k', linestyle='--', alpha=0.5, label='Spike time') - plt.xlabel('Sample (relative to spike)') - plt.ylabel('Activity') - plt.title(f'Spike Waveforms (n={len(waveform_matrix)})') - plt.legend() -else: - print("No waveforms found for this key") -``` - - -### Bulk Fetch with Lazy Loading - -Fetching many rows returns NpyRefs - inspect metadata before 
downloading. - - -```python -# Fetch all neurons - returns NpyRefs, NOT arrays -all_neurons = Neuron.to_dicts() -print(f"Fetched {len(all_neurons)} neurons\n") - -# Inspect metadata without downloading -for neuron in all_neurons: - ref = neuron['activity'] - print(f"Mouse {neuron['mouse_id']}, {neuron['session_date']}: " - f"shape={ref.shape}, loaded={ref.is_loaded}") -``` - - -## Pipeline Diagram - - -```python -dj.Diagram(schema) -``` - - -## Querying Results - -DataJoint makes it easy to query across the pipeline. - - -```python -# Find neurons with high spike counts (threshold=0.5) -(Spikes & 'spike_param_id = 1' & 'spike_count > 20').proj('spike_count') -``` - - - -```python -# Join with Mouse to see which mice have most spikes -(Mouse * Session * Spikes & 'spike_param_id = 1').proj('spike_count') -``` - - -## Summary - -This pipeline demonstrates key patterns for electrophysiology analysis with object storage: - -| Concept | Example | Purpose | -|---------|---------|--------| -| **Object storage** | `` | Store arrays in file/S3/MinIO | -| **Lazy loading** | `NpyRef` | Inspect shape/dtype without download | -| **Memory mapping** | `ref.load(mmap_mode='r')` | Random access to large arrays | -| **Imported tables** | `Neuron` | Load data from files | -| **Computed tables** | `ActivityStats`, `Spikes` | Derive results | -| **Lookup tables** | `SpikeParams` | Parameterize analysis | -| **Array attributes** | `waveforms` | Store multi-spike data as single array | - -### Key Benefits of `` - -1. **Lazy loading**: Inspect array metadata without downloading -2. **Memory mapping**: Random access to large arrays via `mmap_mode` -3. **Safe bulk fetch**: Fetching 1000 rows doesn't download 1000 arrays -4. **NumPy integration**: `np.mean(ref)` auto-downloads via `__array__` -5. **Portable format**: `.npy` files readable by NumPy, MATLAB, etc. -6. **Schema-addressed**: Files organized by schema/table/key -7. **Direct access**: Navigate and load files without database queries - -## Direct File Access: Navigating the Store - -A key advantage of `` is **schema-addressed storage**: files are organized in a predictable directory structure that mirrors your database schema. This means you can navigate and access data files directly—without querying the database. - -### Store Directory Structure with Partitioning - -This tutorial uses `partition_pattern: '{mouse_id}/{session_date}/{neuron_id}'` to organize files by the complete experimental hierarchy: - -``` -{store}/{mouse_id=X}/{session_date=Y}/{neuron_id=Z}/{schema}/{table}/{remaining_key}/{file}.npy -``` - -**Without partitioning**, the structure would be: -``` -{store}/{schema}/{table}/{mouse_id=X}/{session_date=Y}/{neuron_id=Z}/{remaining_key}/{file}.npy -``` - -**Partitioning moves the experimental hierarchy to the top** of the path, creating a browsable structure that matches how you think about your data: -1. Navigate to a subject (mouse_id=100) -2. Navigate to a session (session_date=2017-05-25) -3. Navigate to a neuron (neuron_id=0) -4. 
See all data for that neuron organized by table - -This structure enables: -- **Direct file access** for external tools (MATLAB, Julia, shell scripts) -- **Browsable data** organized by subject/session/neuron -- **Selective backup/sync** - Copy entire subjects, sessions, or individual neurons -- **Debugging** by inspecting raw files in the experimental hierarchy - - -```python -# Explore the store directory structure -from pathlib import Path - -print(f"Store location: {STORE_PATH}\n") -print("Directory structure (one subject shown in full):") - -def print_tree(directory, prefix="", max_depth=7, current_depth=0, limit_items=True): - """Print directory tree with limited depth.""" - if current_depth >= max_depth: - return - - try: - entries = sorted(Path(directory).iterdir()) - except PermissionError: - return - - dirs = [e for e in entries if e.is_dir()] - files = [e for e in entries if e.is_file()] - - # At depth 0 (root), only show first subject in detail - if current_depth == 0 and limit_items: - dirs = dirs[:1] # Only show first mouse_id - - # Show directories - for i, d in enumerate(dirs): - is_last_dir = (i == len(dirs) - 1) and len(files) == 0 - connector = "└── " if is_last_dir else "├── " - print(f"{prefix}{connector}{d.name}/") - - extension = " " if is_last_dir else "│ " - print_tree(d, prefix + extension, max_depth, current_depth + 1, limit_items=False) - - if current_depth == 0 and limit_items and len(sorted(Path(directory).iterdir(), key=lambda x: x.is_file())) > 1: - total_mice = len([e for e in Path(directory).iterdir() if e.is_dir()]) - if total_mice > 1: - print(f"{prefix}└── ... and {total_mice - 1} more subjects (mouse_id=5, mouse_id=100)") - - # Show files - for i, f in enumerate(files): - is_last = i == len(files) - 1 - connector = "└── " if is_last else "├── " - size_kb = f.stat().st_size / 1024 - print(f"{prefix}{connector}{f.name} ({size_kb:.1f} KB)") - -print_tree(STORE_PATH) -``` - - - -```python -# Get the actual path from an NpyRef -key = {'mouse_id': 0, 'session_date': '2017-05-15', 'neuron_id': 0} -ref = (Neuron & key).fetch1('activity') - -print(f"NpyRef path (relative): {ref.path}") -print(f"Full path: {Path(STORE_PATH) / ref.path}\n") - -# Load directly with NumPy - bypass the database! -direct_path = Path(STORE_PATH) / ref.path -activity_direct = np.load(direct_path) -print(f"Loaded array: shape={activity_direct.shape}, dtype={activity_direct.dtype}") -print(f"First 5 values: {activity_direct[:5]}") -``` - - - -```python -# Find all .npy files for a specific mouse using filesystem tools -# This works without any database query! - -store_path = Path(STORE_PATH) -all_npy_files = list(store_path.rglob("*.npy")) - -print(f"All .npy files in store ({len(all_npy_files)} total):\n") -for f in sorted(all_npy_files)[:8]: - rel_path = f.relative_to(STORE_PATH) - size_kb = f.stat().st_size / 1024 - print(f" {rel_path} ({size_kb:.1f} KB)") -if len(all_npy_files) > 8: - print(f" ... 
and {len(all_npy_files) - 8} more files") -``` - - -### Use Cases for Direct Access - -**External tools**: Load data in MATLAB, Julia, or R without DataJoint: -```matlab -% MATLAB - use the path from NpyRef or navigate the store -activity = readNPY('store/schema/table/key_hash/activity.npy'); -``` - -**Shell scripting**: Process files with command-line tools: -```bash -# List all .npy files in store -find $STORE -name "*.npy" - -# Backup the entire store -rsync -av $STORE/ backup/ -``` - -**Disaster recovery**: If the database is lost, the store contains all array data in standard `.npy` format. The path structure and JSON metadata in the database can help reconstruct mappings. - -### Publishing to Data Repositories - -Many scientific data repositories accept **structured file-folder hierarchies**—exactly what `` provides. The schema-addressed storage format makes your data publication-ready: - -| Repository | Accepted Formats | Schema-Addressed Benefit | -|------------|-----------------|-------------------------| -| [DANDI](https://dandiarchive.org) | NWB, folders | Export subject/session hierarchy | -| [OpenNeuro](https://openneuro.org) | BIDS folders | Map to BIDS-like structure | -| [Figshare](https://figshare.com) | Any files/folders | Upload store directly | -| [Zenodo](https://zenodo.org) | Any files/folders | Archive with DOI | -| [OSF](https://osf.io) | Any files/folders | Version-controlled sharing | - -**Export for publication**: -```python -# Export one subject's data for publication -subject_dir = Path(STORE_PATH) / "schema" / "table" / "objects" / "mouse_id=100" - -# Copy to publication directory -import shutil -shutil.copytree(subject_dir, "publication_data/mouse_100/") - -# The resulting structure is self-documenting: -# publication_data/ -# mouse_100/ -# session_date=2017-05-25/ -# neuron_id=0/ -# activity_xyz.npy -# spike_times_abc.npy -# waveforms_def.npy -``` - -**Key advantages for publishing**: - -1. **Self-documenting paths**: Primary key values are in folder names—no lookup table needed -2. **Standard format**: `.npy` files are readable by NumPy, MATLAB, Julia, R, and most analysis tools -3. **Selective export**: Copy only specific subjects, sessions, or tables -4. **Reproducibility**: Published data has the same structure as your working pipeline -5. **Metadata preservation**: Path encodes experimental metadata (subject, session, parameters) - - -```python -# Cleanup: drop schema and remove temporary store -schema.drop(prompt=False) -import shutil -shutil.rmtree(STORE_PATH, ignore_errors=True) -``` - - ---- -## File: tutorials/examples/blob-detection.ipynb - -# Blob Detection Pipeline - -This tutorial introduces DataJoint through a real image analysis pipeline that detects bright blobs in astronomical and biological images. By the end, you'll understand: - -- **Schemas** — Namespaces that group related tables -- **Table types** — Manual, Lookup, and Computed tables -- **Dependencies** — How tables relate through foreign keys -- **Computation** — Automatic population of derived data -- **Master-Part** — Atomic insertion of hierarchical results - -## The Problem - -We have images and want to detect bright spots (blobs) in them. Different detection parameters work better for different images, so we need to: - -1. Store our images -2. Define parameter sets to try -3. Run detection for each image × parameter combination -4. Store and visualize results -5. 
Select the best parameters for each image - -This is a **computational workflow** — a series of steps where each step depends on previous results. DataJoint makes these workflows reproducible and manageable. - -## Setup - -First, let's import our tools and create a schema (database namespace) for this project. - - -```python -import datajoint as dj -import matplotlib.pyplot as plt -from skimage import data -from skimage.feature import blob_doh -from skimage.color import rgb2gray - -# Create a schema - this is our database namespace -schema = dj.Schema('tutorial_blobs') -``` - - -## Manual Tables: Storing Raw Data - -A **Manual table** stores data that users enter directly — it's the starting point of your pipeline. Here we define an `Image` table to store our sample images. - -The `definition` string specifies: -- **Primary key** (above `---`): attributes that uniquely identify each row -- **Secondary attributes** (below `---`): additional data for each row - - -```python -@schema -class Image(dj.Manual): - definition = """ - # Images for blob detection - image_id : uint8 - --- - image_name : varchar(100) - image : # serialized numpy array - """ -``` - - -Now let's insert two sample images from scikit-image: - - -```python -# Insert sample images -Image.insert([ - {'image_id': 1, 'image_name': 'Hubble Deep Field', - 'image': rgb2gray(data.hubble_deep_field())}, - {'image_id': 2, 'image_name': 'Human Mitosis', - 'image': data.human_mitosis() / 255.0}, -], skip_duplicates=True) - -Image() -``` - - - -```python -# Visualize the images -fig, axes = plt.subplots(1, 2, figsize=(10, 5)) -for ax, row in zip(axes, Image()): - ax.imshow(row['image'], cmap='gray_r') - ax.set_title(row['image_name']) - ax.axis('off') -plt.tight_layout() -``` - - -## Lookup Tables: Parameter Sets - -A **Lookup table** stores reference data that doesn't change often — things like experimental protocols, parameter configurations, or categorical options. - -For blob detection, we'll try different parameter combinations to find what works best for each image type. - - -```python -@schema -class DetectionParams(dj.Lookup): - definition = """ - # Blob detection parameter sets - params_id : uint8 - --- - min_sigma : float32 # minimum blob size - max_sigma : float32 # maximum blob size - threshold : float32 # detection sensitivity - """ - - # Pre-populate with parameter sets to try - contents = [ - {'params_id': 1, 'min_sigma': 2.0, 'max_sigma': 6.0, 'threshold': 0.001}, - {'params_id': 2, 'min_sigma': 3.0, 'max_sigma': 8.0, 'threshold': 0.002}, - {'params_id': 3, 'min_sigma': 4.0, 'max_sigma': 20.0, 'threshold': 0.01}, - ] - -DetectionParams() -``` - - -## Computed Tables: Automatic Processing - -A **Computed table** automatically derives data from other tables. You define: - -1. **Dependencies** (using `->`) — which tables provide input -2. **`make()` method** — how to compute results for one input combination - -DataJoint then handles: -- Determining what needs to be computed -- Running computations (optionally in parallel) -- Tracking what's done vs. pending - -### Master-Part Structure - -Our detection produces multiple blobs per image. We use a **master-part** structure: -- **Master** (`Detection`): One row per job, stores summary (blob count) -- **Part** (`Detection.Blob`): One row per blob, stores details (x, y, radius) - -Both are inserted atomically — if anything fails, the whole transaction rolls back. 
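Once `Detection` is defined and populated in the cells below, the master-part pairing can be sanity-checked with a quick aggregation: the `num_blobs` summary stored on each master row should equal the number of its `Blob` part rows. This is a sketch using the aggregation pattern covered later in these tutorials; run it after `Detection.populate()`.

```python
# Sketch (run after Detection.populate() below): compare the stored summary
# (num_blobs) with the actual count of Blob part rows for each detection.
Detection.aggr(Detection.Blob, 'num_blobs', n_part_rows='COUNT(*)')
# The num_blobs and n_part_rows columns should agree row by row.
```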
- - -```python -@schema -class Detection(dj.Computed): - definition = """ - # Blob detection results - -> Image # depends on Image - -> DetectionParams # depends on DetectionParams - --- - num_blobs : uint16 # number of blobs detected - """ - - class Blob(dj.Part): - definition = """ - # Individual detected blobs - -> master - blob_idx : uint16 - --- - x : float32 # x coordinate - y : float32 # y coordinate - radius : float32 # blob radius - """ - - def make(self, key): - # Fetch the image and parameters - img = (Image & key).fetch1('image') - params = (DetectionParams & key).fetch1() - - # Run blob detection - blobs = blob_doh( - img, - min_sigma=params['min_sigma'], - max_sigma=params['max_sigma'], - threshold=params['threshold'] - ) - - # Insert master row - self.insert1({**key, 'num_blobs': len(blobs)}) - - # Insert part rows (all blobs for this detection) - self.Blob.insert([ - {**key, 'blob_idx': i, 'x': x, 'y': y, 'radius': r} - for i, (x, y, r) in enumerate(blobs) - ]) -``` - - -## Viewing the Schema - -DataJoint can visualize the relationships between tables: - - -```python -dj.Diagram(schema) -``` - - -The diagram shows: -- **Green** = Manual tables (user-entered data) -- **Gray** = Lookup tables (reference data) -- **Red** = Computed tables (derived data) -- **Edges** = Dependencies (foreign keys), always flow top-to-bottom - -## Running the Pipeline - -Call `populate()` to run all pending computations. DataJoint automatically determines what needs to be computed: every combination of `Image` × `DetectionParams` that doesn't already have a `Detection` result. - - -```python -# Run all pending computations -Detection.populate(display_progress=True) -``` - - - -```python -# View results summary -Detection() -``` - - -We computed 6 results: 2 images × 3 parameter sets. Each shows how many blobs were detected. - -## Visualizing Results - -Let's see how different parameters affect detection: - - -```python -fig, axes = plt.subplots(2, 3, figsize=(12, 8)) - -for ax, key in zip(axes.ravel(), - Detection.keys(order_by='image_id, params_id')): - # Get image and detection info in one fetch - name, img, num_blobs = (Detection * Image & key).fetch1( - 'image_name', 'image', 'num_blobs') - - ax.imshow(img, cmap='gray_r') - - # Get all blob coordinates in one query - x, y, r = (Detection.Blob & key).to_arrays('x', 'y', 'radius') - for xi, yi, ri in zip(x, y, r): - circle = plt.Circle((yi, xi), ri * 1.2, - color='red', fill=False, alpha=0.6) - ax.add_patch(circle) - - ax.set_title(f"{name}\nParams {key['params_id']}: {num_blobs} blobs", - fontsize=10) - ax.axis('off') - -plt.tight_layout() -``` - - -## Querying Results - -DataJoint's query language makes it easy to explore results: - - -```python -# Find detections with fewer than 300 blobs -Detection & 'num_blobs < 300' -``` - - - -```python -# Join to see image names with blob counts -(Image * Detection).proj('image_name', 'num_blobs') -``` - - -## Storing Selections - -After reviewing the results, we can record which parameter set works best for each image. 
This is another Manual table that references our computed results: - - -```python -@schema -class SelectedDetection(dj.Manual): - definition = """ - # Best detection for each image - -> Image - --- - -> Detection - """ - -# Select params 3 for Hubble (fewer, larger blobs) -# Select params 1 for Mitosis (many small spots) -SelectedDetection.insert([ - {'image_id': 1, 'params_id': 3}, - {'image_id': 2, 'params_id': 1}, -], skip_duplicates=True) - -SelectedDetection() -``` - - - -```python -# View the final schema with selections -dj.Diagram(schema) -``` - - -## Key Concepts Recap - -| Concept | What It Does | Example | -|---------|--------------|--------| -| **Schema** | Groups related tables | `schema = dj.Schema('tutorial_blobs')` | -| **Manual Table** | Stores user-entered data | `Image`, `SelectedDetection` | -| **Lookup Table** | Stores reference/config data | `DetectionParams` | -| **Computed Table** | Derives data automatically | `Detection` | -| **Part Table** | Stores detailed results with master | `Detection.Blob` | -| **Foreign Key** (`->`) | Creates dependency | `-> Image` | -| **`populate()`** | Runs pending computations | `Detection.populate()` | -| **Restriction** (`&`) | Filters rows | `Detection & 'num_blobs < 300'` | -| **Join** (`*`) | Combines tables | `Image * Detection` | - -## Next Steps - -- [Schema Design](02-schema-design.ipynb) — Learn table types and relationships in depth -- [Queries](04-queries.ipynb) — Master DataJoint's query operators -- [Computation](05-computation.ipynb) — Build complex computational workflows - - -```python -# Cleanup: drop the schema for re-running the tutorial -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/examples/fractal-pipeline.ipynb - -# Fractal Image Pipeline - -This tutorial demonstrates **computed tables** by building an image processing pipeline for Julia fractals. - -You'll learn: -- **Manual tables**: Parameters you define (experimental configurations) -- **Lookup tables**: Fixed reference data (processing methods) -- **Computed tables**: Automatically generated results via `populate()` -- **Many-to-many pipelines**: Processing every combination of inputs × methods - - -```python -import datajoint as dj -import numpy as np -from matplotlib import pyplot as plt - -schema = dj.Schema('tutorial_fractal') -``` - - -## Julia Set Generator - -Julia sets are fractals generated by iterating $f(z) = z^2 + c$ for each point in the complex plane. Points that don't escape to infinity form intricate patterns. - - -```python -def julia(c, size=256, center=(0.0, 0.0), zoom=1.0, iters=256): - """Generate a Julia set image.""" - x, y = np.meshgrid( - np.linspace(-1, 1, size) / zoom + center[0], - np.linspace(-1, 1, size) / zoom + center[1] - ) - z = x + 1j * y - img = np.zeros(z.shape) - mask = np.ones(z.shape, dtype=bool) - for _ in range(iters): - z[mask] = z[mask] ** 2 + c - mask = np.abs(z) < 2 - img += mask - return img -``` - - - -```python -# Example fractal -plt.imshow(julia(-0.4 + 0.6j), cmap='magma') -plt.axis('off'); -``` - - -## Pipeline Architecture - -We'll build a pipeline with four tables: - -- **JuliaSpec** (Manual): Parameters we define for fractal generation -- **JuliaImage** (Computed): Generated from specs -- **DenoiseMethod** (Lookup): Fixed set of denoising algorithms -- **Denoised** (Computed): Each image × each method - -After defining all tables, we'll visualize the schema with `dj.Diagram(schema)`. 
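Because `Denoised` depends on both `JuliaImage` and `DenoiseMethod`, its pending work is the set of image × method combinations that have no result yet. Once the tables below are defined, that set can be inspected directly. The expression here is a sketch of the idea; `populate()` tracks what remains to be computed on its own, but this antijoin conveys the same thing.

```python
# Sketch (run after the tables below are defined): image × method combinations
# that are not yet present in Denoised.
pending = (JuliaImage.proj() * DenoiseMethod.proj()) - Denoised
pending
```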
- - -```python -@schema -class JuliaSpec(dj.Manual): - """Parameters for generating Julia fractals.""" - definition = """ - spec_id : uint8 - --- - c_real : float64 # Real part of c - c_imag : float64 # Imaginary part of c - noise_level = 50 : float64 - """ -``` - - - -```python -@schema -class JuliaImage(dj.Computed): - """Generated fractal images with noise.""" - definition = """ - -> JuliaSpec - --- - image : # Generated fractal image - """ - - def make(self, key): - spec = (JuliaSpec & key).fetch1() - img = julia(spec['c_real'] + 1j * spec['c_imag']) - img += np.random.randn(*img.shape) * spec['noise_level'] - self.insert1({**key, 'image': img.astype(np.float32)}) -``` - - - -```python -from skimage import filters, restoration -from skimage.morphology import disk - -@schema -class DenoiseMethod(dj.Lookup): - """Image denoising algorithms.""" - definition = """ - method_id : uint8 - --- - method_name : varchar(20) - params : - """ - contents = [ - [0, 'gaussian', {'sigma': 1.8}], - [1, 'median', {'radius': 3}], - [2, 'tv', {'weight': 20.0}], - ] -``` - - - -```python -@schema -class Denoised(dj.Computed): - """Denoised images: each image × each method.""" - definition = """ - -> JuliaImage - -> DenoiseMethod - --- - denoised : - """ - - def make(self, key): - img = (JuliaImage & key).fetch1('image') - method, params = (DenoiseMethod & key).fetch1('method_name', 'params') - - if method == 'gaussian': - result = filters.gaussian(img, **params) - elif method == 'median': - result = filters.median(img, disk(params['radius'])) - elif method == 'tv': - result = restoration.denoise_tv_chambolle(img, **params) - else: - raise ValueError(f"Unknown method: {method}") - - self.insert1({**key, 'denoised': result.astype(np.float32)}) -``` - - - -```python -dj.Diagram(schema) -``` - - -## Running the Pipeline - -1. Insert specs into Manual table -2. 
Call `populate()` on Computed tables - - -```python -# Define fractal parameters -JuliaSpec.insert([ - {'spec_id': 0, 'c_real': -0.4, 'c_imag': 0.6}, - {'spec_id': 1, 'c_real': -0.74543, 'c_imag': 0.11301}, - {'spec_id': 2, 'c_real': -0.1, 'c_imag': 0.651}, - {'spec_id': 3, 'c_real': -0.835, 'c_imag': -0.2321}, -]) -JuliaSpec() -``` - - - -```python -# Generate all fractal images -JuliaImage.populate(display_progress=True) -``` - - - -```python -# View generated images -fig, axes = plt.subplots(1, 4, figsize=(12, 3)) -for ax, row in zip(axes, JuliaImage()): - ax.imshow(row['image'], cmap='magma') - ax.set_title(f"spec_id={row['spec_id']}") - ax.axis('off') -plt.tight_layout() -``` - - - -```python -# Apply all denoising methods to all images -Denoised.populate(display_progress=True) -``` - - - -```python -# 4 images × 3 methods = 12 results -print(f"JuliaImage: {len(JuliaImage())} rows") -print(f"DenoiseMethod: {len(DenoiseMethod())} rows") -print(f"Denoised: {len(Denoised())} rows") -``` - - - -```python -# Compare denoising methods on one image -spec_id = 0 -original = (JuliaImage & {'spec_id': spec_id}).fetch1('image') - -fig, axes = plt.subplots(1, 4, figsize=(14, 3.5)) -axes[0].imshow(original, cmap='magma') -axes[0].set_title('Original (noisy)') - -for ax, method_id in zip(axes[1:], [0, 1, 2]): - result = (Denoised & {'spec_id': spec_id, 'method_id': method_id}).fetch1('denoised') - method_name = (DenoiseMethod & {'method_id': method_id}).fetch1('method_name') - ax.imshow(result, cmap='magma') - ax.set_title(method_name) - -for ax in axes: - ax.axis('off') -plt.tight_layout() -``` - - -## Key Points - -| Table Type | Populated By | Use For | -|------------|-------------|--------| -| **Manual** | `insert()` | Experimental parameters, user inputs | -| **Lookup** | `contents` attribute | Fixed reference data, method catalogs | -| **Computed** | `populate()` | Derived results, processed outputs | - -The pipeline automatically: -- Tracks dependencies (can't process an image that doesn't exist) -- Skips already-computed results (idempotent) -- Computes all combinations when multiple tables converge - - -```python -# Add a new spec — populate only computes what's missing -JuliaSpec.insert1({'spec_id': 4, 'c_real': -0.7, 'c_imag': 0.27}) - -JuliaImage.populate(display_progress=True) # Only spec_id=4 -Denoised.populate(display_progress=True) # Only spec_id=4 × 3 methods -``` - - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/examples/hotel-reservations.ipynb - -# Hotel Reservation System - -This example demonstrates how DataJoint's schema design enforces business rules automatically. You'll learn: - -- **Workflow dependencies** — Tables that enforce operational sequences -- **Business rule enforcement** — Using referential integrity as validation -- **Temporal data** — Room availability and pricing by date -- **Error handling** — Converting database errors to domain exceptions - -## The Problem - -A hotel needs to enforce these business rules: - -1. Rooms have types (Deluxe, Suite) with varying prices by date -2. Guests can only reserve rooms that are available -3. Each room can have at most one reservation per night -4. Guests must have a reservation before checking in -5. Guests must check in before checking out - -Traditional approaches validate these rules in application code. With DataJoint, the **schema itself enforces them** through foreign keys and unique constraints. 
- - -```python -import datajoint as dj -import datetime -import random - -# Clean start -schema = dj.Schema('tutorial_hotel') -schema.drop(prompt=False) -schema = dj.Schema('tutorial_hotel') -``` - - -## Schema Design - -The schema forms a workflow: `Room → RoomAvailable → Reservation → CheckIn → CheckOut` - -Each arrow represents a foreign key dependency. You cannot insert a child record without a valid parent. - - -```python -@schema -class Room(dj.Lookup): - definition = """ - # Hotel rooms - room : uint8 # room number - --- - room_type : enum('Deluxe', 'Suite') - """ - contents = [ - {'room': i, 'room_type': 'Suite' if i % 5 == 0 else 'Deluxe'} - for i in range(1, 21) # 20 rooms - ] -``` - - - -```python -@schema -class RoomAvailable(dj.Manual): - definition = """ - # Room availability and pricing by date - -> Room - date : date - --- - price : decimal(6, 2) # price per night - """ -``` - - - -```python -@schema -class Guest(dj.Manual): - definition = """ - # Hotel guests - guest_id : uint32 # auto-assigned guest ID - --- - guest_name : varchar(60) - index(guest_name) - """ -``` - - - -```python -@schema -class Reservation(dj.Manual): - definition = """ - # Room reservations (one per room per night) - -> RoomAvailable - --- - -> Guest - credit_card : varchar(80) # encrypted card info - """ -``` - - - -```python -@schema -class CheckIn(dj.Manual): - definition = """ - # Check-in records (requires reservation) - -> Reservation - """ -``` - - - -```python -@schema -class CheckOut(dj.Manual): - definition = """ - # Check-out records (requires check-in) - -> CheckIn - """ -``` - - - -```python -dj.Diagram(schema) -``` - - -## How the Schema Enforces Rules - -| Business Rule | Schema Enforcement | -|---------------|--------------------| -| Room must exist | `Reservation -> RoomAvailable -> Room` | -| Room must be available on date | `Reservation -> RoomAvailable` | -| One reservation per room/night | `RoomAvailable` is primary key of `Reservation` | -| Must reserve before check-in | `CheckIn -> Reservation` | -| Must check-in before check-out | `CheckOut -> CheckIn` | - -The database **rejects invalid operations** — no application code needed. - -## Populate Room Availability - -Make rooms available for the next 30 days with random pricing: - - -```python -random.seed(42) -start_date = datetime.date.today() -days = 30 - -for day in range(days): - date = start_date + datetime.timedelta(days=day) - # Weekend prices are higher - is_weekend = date.weekday() >= 5 - base_price = 200 if is_weekend else 150 - - RoomAvailable.insert( - { - 'room': room['room'], - 'date': date, - 'price': base_price + random.randint(-30, 50) - } - for room in Room.to_dicts() - ) - -print(f"Created {len(RoomAvailable())} room-night records") -RoomAvailable() & {'room': 1} -``` - - -## Business Operations - -These functions wrap database operations and convert constraint violations into meaningful domain errors: - - -```python -# Domain-specific exceptions -class HotelError(Exception): - pass - -class RoomNotAvailable(HotelError): - pass - -class RoomAlreadyReserved(HotelError): - pass - -class NoReservation(HotelError): - pass - -class NotCheckedIn(HotelError): - pass - -class AlreadyProcessed(HotelError): - pass -``` - - - -```python -def reserve_room(room, date, guest_name, credit_card): - """ - Make a reservation. Creates guest record if needed. 
- - Raises - ------ - RoomNotAvailable - If room is not available on that date - RoomAlreadyReserved - If room is already reserved for that date - """ - # Find or create guest - guests = list((Guest & {'guest_name': guest_name}).keys()) - if guests: - guest_key = guests[0] - else: - guest_key = {'guest_id': random.randint(1, 2**31)} - Guest.insert1({**guest_key, 'guest_name': guest_name}) - - try: - Reservation.insert1({ - 'room': room, - 'date': date, - **guest_key, - 'credit_card': credit_card - }) - except dj.errors.DuplicateError: - raise RoomAlreadyReserved( - f"Room {room} already reserved for {date}") from None - except dj.errors.IntegrityError: - raise RoomNotAvailable( - f"Room {room} not available on {date}") from None -``` - - - -```python -def check_in(room, date): - """ - Check in a guest. Requires existing reservation. - - Raises - ------ - NoReservation - If no reservation exists for this room/date - AlreadyProcessed - If guest already checked in - """ - try: - CheckIn.insert1({'room': room, 'date': date}) - except dj.errors.DuplicateError: - raise AlreadyProcessed( - f"Room {room} already checked in for {date}") from None - except dj.errors.IntegrityError: - raise NoReservation( - f"No reservation for room {room} on {date}") from None -``` - - - -```python -def check_out(room, date): - """ - Check out a guest. Requires prior check-in. - - Raises - ------ - NotCheckedIn - If guest hasn't checked in - AlreadyProcessed - If guest already checked out - """ - try: - CheckOut.insert1({'room': room, 'date': date}) - except dj.errors.DuplicateError: - raise AlreadyProcessed( - f"Room {room} already checked out for {date}") from None - except dj.errors.IntegrityError: - raise NotCheckedIn( - f"Room {room} not checked in for {date}") from None -``` - - -## Demo: Business Rule Enforcement - -Let's see the schema enforce our business rules: - - -```python -# Successful reservation -tomorrow = start_date + datetime.timedelta(days=1) -reserve_room(1, tomorrow, 'Alice Smith', '4111-1111-1111-1111') -print(f"Reserved room 1 for {tomorrow}") - -Reservation() -``` - - - -```python -# Try to double-book the same room — fails! -try: - reserve_room(1, tomorrow, 'Bob Jones', '5555-5555-5555-5555') -except RoomAlreadyReserved as e: - print(f"Blocked: {e}") -``` - - - -```python -# Try to reserve unavailable date — fails! -far_future = start_date + datetime.timedelta(days=365) -try: - reserve_room(1, far_future, 'Carol White', '6666-6666-6666-6666') -except RoomNotAvailable as e: - print(f"Blocked: {e}") -``` - - - -```python -# Try to check in without reservation — fails! -try: - check_in(2, tomorrow) # Room 2 has no reservation -except NoReservation as e: - print(f"Blocked: {e}") -``` - - - -```python -# Successful check-in (has reservation) -check_in(1, tomorrow) -print(f"Checked in room 1 for {tomorrow}") - -CheckIn() -``` - - - -```python -# Try to check out without checking in — fails! 
-# First make a reservation for room 3 -reserve_room(3, tomorrow, 'David Brown', '7777-7777-7777-7777') - -try: - check_out(3, tomorrow) # Reserved but not checked in -except NotCheckedIn as e: - print(f"Blocked: {e}") -``` - - - -```python -# Successful check-out (was checked in) -check_out(1, tomorrow) -print(f"Checked out room 1 for {tomorrow}") - -CheckOut() -``` - - -## Useful Queries - -The workflow structure enables powerful queries: - - -```python -# Available rooms (not reserved) for tomorrow -available = (RoomAvailable & {'date': tomorrow}) - Reservation -print(f"Available rooms for {tomorrow}: {len(available)}") -available -``` - - - -```python -# Guests currently checked in (checked in but not out) -currently_in = (CheckIn - CheckOut) * Reservation * Guest -currently_in.proj('guest_name', 'room', 'date') -``` - - - -```python -# Reservations without check-in (no-shows or upcoming) -not_checked_in = Reservation - CheckIn -(not_checked_in * Guest).proj('guest_name', 'room', 'date') -``` - - - -```python -# Revenue by room type using aggr -dj.U('room_type').aggr( - Room * RoomAvailable * Reservation, - total_revenue='SUM(price)', - reservations='COUNT(*)' -) -``` - - -## Key Concepts - -| Concept | How It's Used | -|---------|---------------| -| **Workflow Dependencies** | `CheckOut -> CheckIn -> Reservation -> RoomAvailable` | -| **Unique Constraints** | One reservation per room/night (primary key) | -| **Referential Integrity** | Can't reserve unavailable room, can't check in without reservation | -| **Error Translation** | Database exceptions → domain-specific errors | - -The schema **is** the business logic. Application code just translates errors. - -## Next Steps - -- [University Database](university.ipynb) — Academic records with many-to-many relationships -- [Languages & Proficiency](languages.ipynb) — International standards and lookup tables -- [Data Entry](../basics/03-data-entry.ipynb) — Insert patterns and transactions - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/examples/languages.ipynb - -# Languages and Proficiency - -This example demonstrates many-to-many relationships using an association table with international standards. You'll learn: - -- **Many-to-many relationships** — People speak multiple languages; languages have multiple speakers -- **Lookup tables** — Standardized reference data (ISO language codes, CEFR levels) -- **Association tables** — Linking entities with additional attributes -- **Complex queries** — Aggregations, filtering, and joins - -## International Standards - -This example uses two widely-adopted standards: - -- **ISO 639-1** — Two-letter language codes (`en`, `es`, `ja`) -- **CEFR** — Common European Framework of Reference for language proficiency (A1–C2) - -Using international standards ensures data consistency and enables integration with external systems. - - -```python -import datajoint as dj -import numpy as np -from faker import Faker - -dj.config['display.limit'] = 8 - -# Clean start -schema = dj.Schema('tutorial_languages') -schema.drop(prompt=False) -schema = dj.Schema('tutorial_languages') -``` - - -## Lookup Tables - -Lookup tables store standardized reference data that rarely changes. The `contents` attribute pre-populates them when the schema is created. 
- - -```python -@schema -class Language(dj.Lookup): - definition = """ - # ISO 639-1 language codes - lang_code : char(2) # two-letter code (en, es, ja) - --- - language : varchar(30) # full name - native_name : varchar(50) # name in native script - """ - contents = [ - ('ar', 'Arabic', 'العربية'), - ('de', 'German', 'Deutsch'), - ('en', 'English', 'English'), - ('es', 'Spanish', 'Español'), - ('fr', 'French', 'Français'), - ('hi', 'Hindi', 'हिन्दी'), - ('ja', 'Japanese', '日本語'), - ('ko', 'Korean', '한국어'), - ('pt', 'Portuguese', 'Português'), - ('ru', 'Russian', 'Русский'), - ('zh', 'Chinese', '中文'), - ] -``` - - - -```python -@schema -class CEFRLevel(dj.Lookup): - definition = """ - # CEFR proficiency levels - cefr_level : char(2) # A1, A2, B1, B2, C1, C2 - --- - level_name : varchar(20) # descriptive name - category : enum('Basic', 'Independent', 'Proficient') - description : varchar(100) # can-do summary - """ - contents = [ - ('A1', 'Beginner', 'Basic', - 'Can use familiar everyday expressions'), - ('A2', 'Elementary', 'Basic', - 'Can communicate in simple routine tasks'), - ('B1', 'Intermediate', 'Independent', - 'Can deal with most travel situations'), - ('B2', 'Upper Intermediate', 'Independent', - 'Can interact with fluency and spontaneity'), - ('C1', 'Advanced', 'Proficient', - 'Can express ideas fluently for professional use'), - ('C2', 'Mastery', 'Proficient', - 'Can understand virtually everything'), - ] -``` - - - -```python -print("Languages:") -print(Language()) -print("\nCEFR Levels:") -print(CEFRLevel()) -``` - - -## Entity and Association Tables - -- **Person** — The main entity -- **Proficiency** — Association table linking Person, Language, and CEFRLevel - -The association table's primary key includes both Person and Language, creating the many-to-many relationship. 
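One consequence of this key design: each person can record at most one CEFR level per language, since a second `Proficiency` row for the same (person, language) pair violates the primary key. The sketch below illustrates this; it assumes the tables defined and populated in the following cells, and the specific key values are arbitrary examples.

```python
# Sketch (run after Proficiency is populated below): the composite primary key
# (person_id, lang_code) permits only one proficiency level per person and language.
try:
    Proficiency.insert1({'person_id': 0, 'lang_code': 'en', 'cefr_level': 'B1'})
    Proficiency.insert1({'person_id': 0, 'lang_code': 'en', 'cefr_level': 'C1'})
except dj.errors.DuplicateError as e:
    print(f"Rejected duplicate proficiency: {e}")
```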
- - -```python -@schema -class Person(dj.Manual): - definition = """ - # People with language skills - person_id : int32 # unique identifier - --- - name : varchar(60) - date_of_birth : date - """ -``` - - - -```python -@schema -class Proficiency(dj.Manual): - definition = """ - # Language proficiency (many-to-many: person <-> language) - -> Person - -> Language - --- - -> CEFRLevel - """ -``` - - - -```python -dj.Diagram(schema) -``` - - -**Reading the diagram:** -- **Gray tables** (Language, CEFRLevel) are Lookup tables -- **Green table** (Person) is Manual -- **Solid lines** indicate foreign keys in the primary key (many-to-many) -- **Dashed line** indicates foreign key in secondary attributes (reference) - -## Populate Sample Data - - -```python -np.random.seed(42) -fake = Faker() -fake.seed_instance(42) - -# Generate 200 people -n_people = 200 -Person.insert( - { - 'person_id': i, - 'name': fake.name(), - 'date_of_birth': fake.date_of_birth( - minimum_age=18, maximum_age=70) - } - for i in range(n_people) -) - -print(f"Created {len(Person())} people") -Person() -``` - - - -```python -# Assign random language proficiencies -lang_keys = list(Language.keys()) -cefr_keys = list(CEFRLevel.keys()) - -# More people at intermediate levels than extremes -cefr_weights = [0.08, 0.12, 0.20, 0.25, 0.20, 0.15] -avg_languages = 2.5 - -for person_key in Person.keys(): - n_langs = np.random.poisson(avg_languages) - if n_langs > 0: - selected_langs = np.random.choice( - len(lang_keys), min(n_langs, len(lang_keys)), replace=False) - Proficiency.insert( - { - **person_key, - **lang_keys[i], - **np.random.choice(cefr_keys, p=cefr_weights) - } - for i in selected_langs - ) - -print(f"Created {len(Proficiency())} proficiency records") -Proficiency() -``` - - -## Query Examples - -### Finding Speakers - - -```python -# Proficient English speakers (C1 or C2) -proficient_english = ( - Person.proj('name') & - (Proficiency & {'lang_code': 'en'} & 'cefr_level >= "C1"') -) -print(f"Proficient English speakers: {len(proficient_english)}") -proficient_english -``` - - - -```python -# People who speak BOTH English AND Spanish -bilingual = ( - Person.proj('name') & - (Proficiency & {'lang_code': 'en'}) & - (Proficiency & {'lang_code': 'es'}) -) -print(f"English + Spanish speakers: {len(bilingual)}") -bilingual -``` - - - -```python -# People who speak English OR Spanish -either = ( - Person.proj('name') & - (Proficiency & 'lang_code in ("en", "es")') -) -print(f"English or Spanish speakers: {len(either)}") -either -``` - - -### Aggregations - - -```python -# People who speak 4+ languages -polyglots = Person.aggr( - Proficiency, - 'name', - n_languages='COUNT(lang_code)', - languages='GROUP_CONCAT(lang_code)' -) & 'n_languages >= 4' - -print(f"Polyglots (4+ languages): {len(polyglots)}") -polyglots -``` - - - -```python -# Top 5 polyglots -top_polyglots = Person.aggr( - Proficiency, - 'name', - n_languages='COUNT(lang_code)' -) & dj.Top(5, order_by='n_languages DESC') - -top_polyglots -``` - - - -```python -# Number of speakers per language -speakers_per_lang = Language.aggr( - Proficiency, - 'language', - n_speakers='COUNT(person_id)' -) -speakers_per_lang -``` - - - -```python -# CEFR level distribution for English -english_levels = CEFRLevel.aggr( - Proficiency & {'lang_code': 'en'}, - 'level_name', - n_speakers='COUNT(person_id)' -) -english_levels -``` - - -### Joining Tables - - -```python -# Full profile: person + language + proficiency details -full_profile = ( - Person * Proficiency * Language * CEFRLevel 
-).proj('name', 'language', 'level_name', 'category') - -# Show profile for person_id=0 -full_profile & {'person_id': 0} -``` - - - -```python -# Find people with C1+ proficiency in multiple languages -advanced_polyglots = Person.aggr( - Proficiency & 'cefr_level >= "C1"', - 'name', - n_advanced='COUNT(*)' -) & 'n_advanced >= 2' - -print(f"Advanced in 2+ languages: {len(advanced_polyglots)}") -advanced_polyglots -``` - - -## Key Concepts - -| Pattern | Implementation | -|---------|----------------| -| **Many-to-many** | `Proficiency` links `Person` and `Language` | -| **Lookup tables** | `Language` and `CEFRLevel` with `contents` | -| **Association data** | `cefr_level` stored in the association table | -| **Standards** | ISO 639-1 codes, CEFR levels | - -### Benefits of Lookup Tables - -1. **Data consistency** — Only valid codes can be used -2. **Rich metadata** — Full names, descriptions stored once -3. **Easy updates** — Change "Español" to "Spanish" in one place -4. **Self-documenting** — `Language()` shows all valid options - -## Next Steps - -- [University Database](university.ipynb) — Academic records -- [Hotel Reservations](hotel-reservations.ipynb) — Workflow dependencies -- [Queries Tutorial](../basics/04-queries.ipynb) — Query operators in depth - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/examples/university.ipynb - -# University Database - -This tutorial builds a complete university registration system to demonstrate: - -- **Schema design** with realistic relationships -- **Data population** using Faker for synthetic data -- **Rich query patterns** from simple to complex - -University databases are classic examples because everyone understands students, courses, enrollments, and grades. The domain naturally demonstrates: - -- One-to-many relationships (department → courses) -- Many-to-many relationships (students ↔ courses via enrollments) -- Workflow dependencies (enrollment requires both student and section to exist) - - -```python -import datajoint as dj -import numpy as np -from datetime import date - -schema = dj.Schema('tutorial_university') -``` - - -## Schema Design - -Our university schema models: - -| Table | Purpose | -|-------|--------| -| `Student` | Student records with contact info | -| `Department` | Academic departments | -| `StudentMajor` | Student-declared majors | -| `Course` | Course catalog | -| `Term` | Academic terms (Spring/Summer/Fall) | -| `Section` | Course offerings in specific terms | -| `Enroll` | Student enrollments in sections | -| `LetterGrade` | Grade scale (lookup) | -| `Grade` | Assigned grades | - - -```python -@schema -class Student(dj.Manual): - definition = """ - student_id : uint32 # university-wide ID - --- - first_name : varchar(40) - last_name : varchar(40) - sex : enum('F', 'M', 'U') - date_of_birth : date - home_city : varchar(60) - home_state : char(2) # US state code - """ -``` - - - -```python -@schema -class Department(dj.Manual): - definition = """ - dept : varchar(6) # e.g. BIOL, CS, MATH - --- - dept_name : varchar(200) - """ -``` - - - -```python -@schema -class StudentMajor(dj.Manual): - definition = """ - -> Student - --- - -> Department - declare_date : date - """ -``` - - - -```python -@schema -class Course(dj.Manual): - definition = """ - -> Department - course : uint32 # course number, e.g. 
1010 - --- - course_name : varchar(200) - credits : decimal(3,1) - """ -``` - - - -```python -@schema -class Term(dj.Manual): - definition = """ - term_year : year - term : enum('Spring', 'Summer', 'Fall') - """ -``` - - - -```python -@schema -class Section(dj.Manual): - definition = """ - -> Course - -> Term - section : char(1) - --- - auditorium : varchar(12) - """ -``` - - - -```python -@schema -class Enroll(dj.Manual): - definition = """ - -> Student - -> Section - """ -``` - - - -```python -@schema -class LetterGrade(dj.Lookup): - definition = """ - grade : char(2) - --- - points : decimal(3,2) - """ - contents = [ - ['A', 4.00], ['A-', 3.67], - ['B+', 3.33], ['B', 3.00], ['B-', 2.67], - ['C+', 2.33], ['C', 2.00], ['C-', 1.67], - ['D+', 1.33], ['D', 1.00], - ['F', 0.00] - ] -``` - - - -```python -@schema -class Grade(dj.Manual): - definition = """ - -> Enroll - --- - -> LetterGrade - """ -``` - - - -```python -dj.Diagram(schema) -``` - - -## Populate with Synthetic Data - -We use [Faker](https://faker.readthedocs.io/) to generate realistic student data. - - -```python -import faker -import random - -fake = faker.Faker() -faker.Faker.seed(42) -random.seed(42) -``` - - - -```python -def generate_students(n=500): - """Generate n student records.""" - fake_name = {'F': fake.name_female, 'M': fake.name_male} - for student_id in range(1000, 1000 + n): - sex = random.choice(['F', 'M']) - name = fake_name[sex]().split()[:2] - yield { - 'student_id': student_id, - 'first_name': name[0], - 'last_name': name[-1], - 'sex': sex, - 'date_of_birth': fake.date_between( - start_date='-35y', end_date='-17y'), - 'home_city': fake.city(), - 'home_state': fake.state_abbr() - } - -Student.insert(generate_students(500)) -print(f"Inserted {len(Student())} students") -``` - - - -```python -# Departments -Department.insert([ - {'dept': 'CS', 'dept_name': 'Computer Science'}, - {'dept': 'BIOL', 'dept_name': 'Life Sciences'}, - {'dept': 'PHYS', 'dept_name': 'Physics'}, - {'dept': 'MATH', 'dept_name': 'Mathematics'}, -]) - -# Assign majors to ~75% of students -students = Student.keys() -depts = Department.keys() -StudentMajor.insert( - { - **s, **random.choice(depts), - 'declare_date': fake.date_between(start_date='-4y') - } - for s in students if random.random() < 0.75 -) -print(f"{len(StudentMajor())} students declared majors") -``` - - - -```python -# Course catalog -Course.insert([ - ['BIOL', 1010, 'Biology in the 21st Century', 3], - ['BIOL', 2020, 'Principles of Cell Biology', 3], - ['BIOL', 2325, 'Human Anatomy', 4], - ['BIOL', 2420, 'Human Physiology', 4], - ['PHYS', 2210, 'Physics for Scientists I', 4], - ['PHYS', 2220, 'Physics for Scientists II', 4], - ['PHYS', 2060, 'Quantum Mechanics', 3], - ['MATH', 1210, 'Calculus I', 4], - ['MATH', 1220, 'Calculus II', 4], - ['MATH', 2270, 'Linear Algebra', 4], - ['MATH', 2280, 'Differential Equations', 4], - ['CS', 1410, 'Intro to Object-Oriented Programming', 4], - ['CS', 2420, 'Data Structures & Algorithms', 4], - ['CS', 3500, 'Software Practice', 4], - ['CS', 3810, 'Computer Organization', 4], -]) -print(f"{len(Course())} courses in catalog") -``` - - - -```python -# Academic terms 2020-2024 -Term.insert( - {'term_year': year, 'term': term} - for year in range(2020, 2025) - for term in ['Spring', 'Summer', 'Fall'] -) - -# Create sections for each course-term with 1-3 sections -for course in Course.keys(): - for term in Term.keys(): - for sec in 'abc'[:random.randint(1, 3)]: - if random.random() < 0.7: # Not every course every term - Section.insert1({ - 
**course, **term, - 'section': sec, - 'auditorium': f"{random.choice('ABCDEF')}" - f"{random.randint(100, 400)}" - }, skip_duplicates=True) - -print(f"{len(Section())} sections created") -``` - - - -```python -# Enroll students in courses -terms = Term.keys() -for student in Student.keys(): - # Each student enrolls over 2-6 random terms - student_terms = random.sample(terms, k=random.randint(2, 6)) - for term in student_terms: - # Take 2-4 courses per term - available = (Section & term).keys() - if available: - n_courses = min(random.randint(2, 4), len(available)) - for section in random.sample(available, k=n_courses): - Enroll.insert1( - {**student, **section}, skip_duplicates=True) - -print(f"{len(Enroll())} enrollments") -``` - - - -```python -# Assign grades to ~90% of enrollments (some incomplete) -grades = LetterGrade.to_arrays('grade') -# Weight toward B/C range -weights = [5, 8, 10, 15, 12, 10, 15, 10, 5, 5, 5] - -for enroll in Enroll.keys(): - if random.random() < 0.9: - Grade.insert1({**enroll, 'grade': random.choices(grades, weights=weights)[0]}) - -print(f"{len(Grade())} grades assigned") -``` - - -## Querying Data - -DataJoint queries are composable expressions. Displaying a query shows a preview; use `fetch()` to retrieve data. - - -```python -dj.config['display.limit'] = 8 # Limit preview rows -``` - - -### Restriction (`&` and `-`) - -Filter rows using `&` (keep matching) or `-` (remove matching). - - -```python -# Students from California -Student & {'home_state': 'CA'} -``` - - - -```python -# Female students NOT from California -(Student & {'sex': 'F'}) - {'home_state': 'CA'} -``` - - - -```python -# SQL-style string conditions -Student & 'home_state IN ("CA", "TX", "NY")' -``` - - - -```python -# OR conditions using a list -Student & [{'home_state': 'CA'}, {'home_state': 'TX'}] -``` - - -### Subqueries in Restrictions - -Use another query as a restriction condition. - - -```python -# Students majoring in Computer Science -Student & (StudentMajor & {'dept': 'CS'}) -``` - - - -```python -# Students who have NOT taken any Math courses -Student - (Enroll & {'dept': 'MATH'}) -``` - - - -```python -# Students with ungraded enrollments (enrolled but no grade yet) -Student & (Enroll - Grade) -``` - - - -```python -# All-A students: have grades AND no non-A grades -all_a = (Student & Grade) - (Grade - {'grade': 'A'}) -all_a -``` - - -### Projection (`.proj()`) - -Select, rename, or compute attributes. - - -```python -# Select specific attributes -Student.proj('first_name', 'last_name') -``` - - - -```python -# Computed attribute: full name -Student.proj(full_name="CONCAT(first_name, ' ', last_name)") -``` - - - -```python -# Calculate age in years -Student.proj('first_name', 'last_name', - age='TIMESTAMPDIFF(YEAR, date_of_birth, CURDATE())') -``` - - - -```python -# Keep all attributes plus computed ones with ... -Student.proj(..., age='TIMESTAMPDIFF(YEAR, date_of_birth, CURDATE())') -``` - - - -```python -# Exclude specific attributes with - -Student.proj(..., '-date_of_birth') -``` - - - -```python -# Rename attribute -Student.proj('first_name', family_name='last_name') -``` - - -### Universal Set (`dj.U()`) - -The universal set `dj.U()` extracts unique values of specified attributes. 
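In addition to restricting queries (as in the examples below), `dj.U()` can serve as the grouping set for aggregation, letting you group by arbitrary attributes rather than by another table's primary key. A brief sketch, assuming the standard `dj.U(...).aggr(...)` behavior (the aggregation operator itself is introduced later in this tutorial):

```python
# Count students per home state (group by an arbitrary attribute)
dj.U('home_state').aggr(Student, n='COUNT(*)')

# Aggregate over the entire table (no grouping attributes)
dj.U().aggr(Student, n_students='COUNT(*)')
```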
- - -```python -# All unique first names -dj.U('first_name') & Student -``` - - - -```python -# All unique home states of enrolled students -dj.U('home_state') & (Student & Enroll) -``` - - - -```python -# Birth years of students in CS courses -dj.U('birth_year') & ( - Student.proj(birth_year='YEAR(date_of_birth)') & (Enroll & {'dept': 'CS'}) -) -``` - - -### Join (`*`) - -Combine tables on matching attributes. - - -```python -# Students with their declared majors -Student.proj('first_name', 'last_name') * StudentMajor -``` - - - -```python -# Courses with department names -Course * Department.proj('dept_name') -``` - - - -```python -# Left join: all students, including those without majors (NULL for unmatched) -Student.proj('first_name', 'last_name').join(StudentMajor, left=True) -``` - - - -```python -# Multi-table join: grades with student names and course info -(Student.proj('first_name', 'last_name') - * Grade - * Course.proj('course_name', 'credits')) -``` - - -### Aggregation (`.aggr()`) - -Group rows and compute aggregate statistics. - - -```python -# Number of students per department -Department.aggr(StudentMajor, n_students='COUNT(*)') -``` - - - -```python -# Breakdown by sex per department -Department.aggr( - StudentMajor * Student, - n_female='SUM(sex="F")', - n_male='SUM(sex="M")' -) -``` - - - -```python -# Enrollment counts per course (with course name) -Course.aggr(Enroll, ..., n_enrolled='COUNT(*)') -``` - - - -```python -# Average grade points per course -Course.aggr( - Grade * LetterGrade, - 'course_name', - avg_gpa='AVG(points)', - n_grades='COUNT(*)' -) -``` - - -### Complex Queries - -Combine operators to answer complex questions. - - -```python -# Student GPA: weighted average of grade points by credits -student_gpa = Student.aggr( - Grade * LetterGrade * Course, - 'first_name', 'last_name', - total_credits='SUM(credits)', - gpa='SUM(points * credits) / SUM(credits)' -) -student_gpa -``` - - - -```python -# Top 5 students by GPA (with at least 12 credits) -student_gpa & 'total_credits >= 12' & dj.Top(5, order_by='gpa DESC') -``` - - - -```python -# Students who have taken courses in ALL departments -# (i.e., no department exists where they haven't enrolled) -all_depts = Student - ( - Student.proj() * Department - Enroll.proj('student_id', 'dept') -) -all_depts.proj('first_name', 'last_name') -``` - - - -```python -# Most popular courses (by enrollment) per department -course_enrollment = Course.aggr(Enroll, ..., n='COUNT(*)') - -# For each department, find the max enrollment -max_per_dept = Department.aggr(course_enrollment, max_n='MAX(n)') - -# Join to find courses matching the max -course_enrollment * max_per_dept & 'n = max_n' -``` - - - -```python -# Grade distribution: count of each grade across all courses -LetterGrade.aggr(Grade, ..., count='COUNT(*)') & 'count > 0' -``` - - -### Fetching Results - -Use the fetch methods to retrieve data into Python: -- `to_dicts()` — list of dictionaries -- `to_arrays()` — numpy arrays -- `to_pandas()` — pandas DataFrame -- `fetch1()` — single row (query must return exactly one row) - - -```python -# Fetch as numpy recarray -data = (Student & {'home_state': 'CA'}).to_arrays() -print(f"Type: {type(data).__name__}, shape: {data.shape}") -data[:3] -``` - - - -```python -# Fetch as list of dicts -(Student & {'home_state': 'CA'}).to_dicts(limit=3) -``` - - - -```python -# Fetch specific attributes as arrays -first_names, last_names = (Student & {'home_state': 'CA'}).to_arrays('first_name', 'last_name') -list(zip(first_names, 
last_names))[:5] -``` - - - -```python -# Fetch single row with fetch1 -student = (Student & {'student_id': 1000}).fetch1() -print(f"{student['first_name']} {student['last_name']} from {student['home_city']}, {student['home_state']}") -``` - - - -```python -# Fetch as pandas DataFrame -(student_gpa & 'total_credits >= 12').to_pandas().sort_values('gpa', ascending=False).head(10) -``` - - -## Summary - -This tutorial demonstrated: - -| Operation | Syntax | Purpose | -|-----------|--------|--------| -| Restriction | `A & cond` | Keep matching rows | -| Anti-restriction | `A - cond` | Remove matching rows | -| Projection | `A.proj(...)` | Select/compute attributes | -| Join | `A * B` | Combine tables | -| Left join | `A.join(B, left=True)` | Keep all rows from A | -| Aggregation | `A.aggr(B, ...)` | Group and aggregate | -| Universal | `dj.U('attr') & A` | Unique values | -| Top | `A & dj.Top(n, order_by=...)` | Limit/order results | -| Fetch keys | `A.keys()` | Primary key dicts | -| Fetch arrays | `A.to_arrays(...)` | Numpy arrays | -| Fetch dicts | `A.to_dicts()` | List of dicts | -| Fetch pandas | `A.to_pandas()` | DataFrame | -| Fetch one | `A.fetch1()` | Single row dict | - - -```python -# Cleanup -schema.drop(prompt=False) -``` - - ---- -## File: tutorials/index.md - -# Tutorials - -Learn DataJoint by building real pipelines. - -These tutorials guide you through building data pipelines step by step. Each tutorial -is a Jupyter notebook that you can run interactively. Start with the basics and -progress to domain-specific and advanced topics. - -## Quick Start - -Install DataJoint: - -```bash -pip install datajoint -``` - -Configure database credentials in your project (see [Configuration](../reference/configuration.md)): - -```bash -# Create datajoint.json for non-sensitive settings -echo '{"database": {"host": "localhost", "port": 3306}}' > datajoint.json - -# Create secrets directory for credentials -mkdir -p .secrets -echo "root" > .secrets/database.user -echo "password" > .secrets/database.password -``` - -Define and populate a simple pipeline: - -```python -import datajoint as dj - -schema = dj.Schema('my_pipeline') - -@schema -class Subject(dj.Manual): - definition = """ - subject_id : uint16 - --- - name : varchar(100) - date_of_birth : date - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint8 - --- - session_date : date - """ - -@schema -class SessionAnalysis(dj.Computed): - definition = """ - -> Session - --- - result : float64 - """ - - def make(self, key): - # Compute result for this session - self.insert1({**key, 'result': 42.0}) - -# Insert data -Subject.insert1({'subject_id': 1, 'name': 'M001', 'date_of_birth': '2026-01-15'}) -Session.insert1({'subject_id': 1, 'session_idx': 1, 'session_date': '2026-01-06'}) - -# Run computations -SessionAnalysis.populate() -``` - -Continue learning with the structured tutorials below. - -## Learning Paths - -Choose your learning path based on your goals: - -### 🌱 New to DataJoint - -**Goal:** Understand core concepts and build your first pipeline - -**Path:** - -1. [First Pipeline](basics/01-first-pipeline.ipynb) — 30 min — Tables, queries, four core operations -2. [Schema Design](basics/02-schema-design.ipynb) — 45 min — Primary keys, relationships, table tiers -3. [Data Entry](basics/03-data-entry.ipynb) — 30 min — Inserting and managing data -4. [Queries](basics/04-queries.ipynb) — 45 min — Operators, restrictions, projections -5. 
Try an example: [University Database](examples/university.ipynb) — Complete pipeline with realistic data - -**Next:** Read [Relational Workflow Model](../explanation/relational-workflow-model.md) to understand the conceptual foundation. - ---- - -### 🚀 Building Production Pipelines - -**Goal:** Create automated, scalable data processing workflows - -**Prerequisites:** Complete basics above or have equivalent experience - -**Path:** - -1. [Computation](basics/05-computation.ipynb) — Automated processing with Imported/Computed tables -2. [Object Storage](basics/06-object-storage.ipynb) — Handle large data (arrays, files, images) -3. [Distributed Computing](advanced/distributed.ipynb) — Multi-worker parallel execution -4. Practice: [Fractal Pipeline](examples/fractal-pipeline.ipynb) or [Blob Detection](examples/blob-detection.ipynb) - -**Next:** - -- [Run Computations](../how-to/run-computations.md) — populate() usage patterns -- [Distributed Computing](../how-to/distributed-computing.md) — Cluster deployment -- [Handle Errors](../how-to/handle-errors.md) — Job management and recovery - ---- - -### 🧪 Domain-Specific Applications - -**Goal:** Build scientific data pipelines for your field - -**Prerequisites:** Complete basics, understand computation model - -**Production Software: [DataJoint Elements](https://datajoint.com/docs/elements/)** - -Standard pipelines for neurophysiology experiments, actively used in many labs worldwide. These are not tutorials—they are production-ready modular pipelines for calcium imaging, electrophysiology, array ephys, optogenetics, and more. - -**Learning tutorials (neuroscience):** - -- [Calcium Imaging](domain/calcium-imaging/calcium-imaging.ipynb) — Import movies, segment cells, extract traces -- [Electrophysiology](domain/electrophysiology/electrophysiology.ipynb) — Import recordings, spike detection, waveforms -- [Allen CCF](domain/allen-ccf/allen-ccf.ipynb) — Hierarchical brain atlas ontology - -**General patterns:** - -- [Hotel Reservations](examples/hotel-reservations.ipynb) — Booking systems with resource management -- [Languages & Proficiency](examples/languages.ipynb) — Many-to-many relationships - ---- - -### 🔧 Extending DataJoint - -**Goal:** Customize DataJoint for specialized needs - -**Prerequisites:** Proficient with basics and production pipelines - -**Path:** - -1. [Custom Codecs](advanced/custom-codecs.ipynb) — Create domain-specific data types -2. [JSON Data Type](advanced/json-type.ipynb) — Semi-structured data patterns -3. [SQL Comparison](advanced/sql-comparison.ipynb) — Understand DataJoint's query algebra - -**Next:** - -- [Codec API](../reference/specs/codec-api.md) — Complete codec specification -- [Create Custom Codec](../how-to/create-custom-codec.md) — Step-by-step codec development - ---- - -## Basics - -Core concepts for getting started with DataJoint: - -1. [First Pipeline](basics/01-first-pipeline.ipynb) — Tables, queries, and the four core operations -2. [Schema Design](basics/02-schema-design.ipynb) — Primary keys, relationships, and table tiers -3. [Data Entry](basics/03-data-entry.ipynb) — Inserting and managing data -4. [Queries](basics/04-queries.ipynb) — Operators and fetching results -5. [Computation](basics/05-computation.ipynb) — Imported and Computed tables -6. 
[Object Storage](basics/06-object-storage.ipynb) — Blobs, attachments, and external stores - -## Examples - -Complete pipelines demonstrating DataJoint patterns: - -- [University Database](examples/university.ipynb) — Academic records with students, courses, and grades -- [Hotel Reservations](examples/hotel-reservations.ipynb) — Booking system with rooms, guests, and reservations -- [Languages & Proficiency](examples/languages.ipynb) — Language skills tracking with many-to-many relationships -- [Fractal Pipeline](examples/fractal-pipeline.ipynb) — Iterative computation and parameter sweeps -- [Blob Detection](examples/blob-detection.ipynb) — Image processing with automated computation - -## Domain Tutorials - -Real-world scientific pipelines: - -- [Calcium Imaging](domain/calcium-imaging/calcium-imaging.ipynb) — Import TIFF movies, segment cells, extract fluorescence traces -- [Electrophysiology](domain/electrophysiology/electrophysiology.ipynb) — Import recordings, detect spikes, extract waveforms -- [Allen CCF](domain/allen-ccf/allen-ccf.ipynb) — Brain atlas with hierarchical region ontology - -## Advanced Topics - -Extending DataJoint for specialized use cases: - -- [SQL Comparison](advanced/sql-comparison.ipynb) — DataJoint for SQL users -- [JSON Data Type](advanced/json-type.ipynb) — Semi-structured data in tables -- [Distributed Computing](advanced/distributed.ipynb) — Multi-process and cluster workflows -- [Custom Codecs](advanced/custom-codecs.ipynb) — Extending the type system - -## Running the Tutorials - -```bash -# Clone the repository -git clone https://github.com/datajoint/datajoint-docs.git -cd datajoint-docs - -# Start the tutorial environment -docker compose up -d - -# Launch Jupyter -jupyter lab src/tutorials/ -``` - -All tutorials use a local MySQL database that resets between sessions. - - -============================================================ -# How-To Guides -============================================================ - - ---- -## File: how-to/alter-tables.md - -# Alter Tables - -Modify existing table structures for schema evolution. - -## Basic Alter - -Sync table definition with code: - -```python -# Update definition in code, then: -MyTable.alter() -``` - -This compares the current code definition with the database and generates `ALTER TABLE` statements. 
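For example, a cautious workflow combines `describe()` and `alter()` (both covered in more detail below) to review the stored definition before and after the change:

```python
# Review the definition currently stored in the database
print(MyTable.describe())

# After editing the class definition in code, synchronize the table.
# alter() displays the generated changes and asks for confirmation.
MyTable.alter()

# Confirm that the database now matches the updated definition
print(MyTable.describe())
```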
- -## What Can Be Altered - -| Change | Supported | -|--------|-----------| -| Add columns | Yes | -| Drop columns | Yes | -| Modify column types | Yes | -| Rename columns | Yes | -| Change defaults | Yes | -| Update table comment | Yes | -| **Modify primary key** | **No** | -| **Add/remove foreign keys** | **No** | -| **Modify indexes** | **No** | - -## Add a Column - -```python -# Original -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(32) - """ - -# Updated - add column -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(32) - weight = null : float32 # New column - """ - -# Apply change -Subject.alter() -``` - -## Drop a Column - -Remove from definition and alter: - -```python -# Column 'old_field' removed from definition -Subject.alter() -``` - -## Modify Column Type - -```python -# Change varchar(32) to varchar(100) -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(100) # Was varchar(32) - """ - -Subject.alter() -``` - -## Rename a Column - -DataJoint tracks renames via comment metadata: - -```python -# Original: species -# Renamed to: species_name -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species_name : varchar(32) # Renamed from 'species' - """ - -Subject.alter() -``` - -## Skip Confirmation - -```python -# Apply without prompting -Subject.alter(prompt=False) -``` - -## View Pending Changes - -Check what would change without applying: - -```python -# Show current definition -print(Subject.describe()) - -# Compare with code definition -# (alter() shows diff before prompting) -``` - -## Unsupported Changes - -### Primary Key Changes - -Cannot modify primary key attributes: - -```python -# This will raise NotImplementedError -@schema -class Subject(dj.Manual): - definition = """ - new_id : uuid # Changed primary key - --- - species : varchar(32) - """ - -Subject.alter() # Error! -``` - -**Workaround**: Create new table, migrate data, drop old table. - -### Foreign Key Changes - -Cannot add or remove foreign key references: - -```python -# Cannot add new FK via alter() -definition = """ -subject_id : varchar(16) ---- --> NewReference # Cannot add via alter -species : varchar(32) -""" -``` - -**Workaround**: Drop dependent tables, recreate with new structure. - -### Index Changes - -Cannot modify indexes via alter: - -```python -# Cannot add/remove indexes via alter() -definition = """ -subject_id : varchar(16) ---- -index(species) # Cannot add via alter -species : varchar(32) -""" -``` - -## Migration Pattern - -For unsupported changes, use this pattern: - -```python -# 1. Create new table with desired structure -@schema -class SubjectNew(dj.Manual): - definition = """ - subject_id : uuid # New primary key type - --- - species : varchar(32) - """ - -# 2. Migrate data -for row in Subject().to_dicts(): - SubjectNew.insert1({ - 'subject_id': uuid.uuid4(), # Generate new keys - 'species': row['species'] - }) - -# 3. Update dependent tables -# 4. Drop old table -# 5. 
Rename new table (if needed, via SQL) -``` - -## Add Job Metadata Columns - -For tables created before enabling job metadata: - -```python -from datajoint.migrate import add_job_metadata_columns - -# Dry run -add_job_metadata_columns(ProcessedData, dry_run=True) - -# Apply -add_job_metadata_columns(ProcessedData, dry_run=False) -``` - -## Best Practices - -### Plan Schema Carefully - -Primary keys and foreign keys cannot be changed easily. Design carefully upfront. - -### Use Migrations for Production - -For production systems, use versioned migration scripts: - -```python -# migrations/001_add_weight_column.py -def upgrade(): - Subject.alter(prompt=False) - -def downgrade(): - # Reverse the change - pass -``` - -### Test in Development First - -Always test schema changes on a copy: - -```python -# Clone schema for testing -test_schema = dj.Schema('test_' + schema.database) -``` - -## See Also - -- [Define Tables](define-tables.md) — Table definition syntax -- [Migrate to v2.0](migrate-to-v20.md) — Version migration - - ---- -## File: how-to/backup-restore.md - -# Backup and Restore - -Protect your data with proper backup strategies. - -> **Tip:** [DataJoint.com](https://datajoint.com) provides automatic backups with point-in-time recovery as part of the managed service. - -## Overview - -A complete DataJoint backup includes: -1. **Database** — Table structures and relational data -2. **Object storage** — Large objects stored externally - -## Database Backup - -### Using mysqldump - -```bash -# Backup single schema -mysqldump -h host -u user -p database_name > backup.sql - -# Backup multiple schemas -mysqldump -h host -u user -p --databases schema1 schema2 > backup.sql - -# Backup all schemas -mysqldump -h host -u user -p --all-databases > backup.sql -``` - -### Include Routines and Triggers - -```bash -mysqldump -h host -u user -p \ - --routines \ - --triggers \ - database_name > backup.sql -``` - -### Compressed Backup - -```bash -mysqldump -h host -u user -p database_name | gzip > backup.sql.gz -``` - -## Database Restore - -```bash -# From SQL file -mysql -h host -u user -p database_name < backup.sql - -# From compressed file -gunzip < backup.sql.gz | mysql -h host -u user -p database_name -``` - -## Object Storage Backup - -### Filesystem Store - -```bash -# Sync to backup location -rsync -av /data/datajoint-store/ /backup/datajoint-store/ - -# With compression -tar -czvf store-backup.tar.gz /data/datajoint-store/ -``` - -### S3/MinIO Store - -```bash -# Using AWS CLI -aws s3 sync s3://source-bucket s3://backup-bucket - -# Using MinIO client -mc mirror source/bucket backup/bucket -``` - -## Backup Script Example - -```bash -#!/bin/bash -# backup-datajoint.sh - -DATE=$(date +%Y%m%d_%H%M%S) -BACKUP_DIR=/backups/datajoint - -# Backup database -mysqldump -h $DJ_HOST -u $DJ_USER -p$DJ_PASS \ - --databases my_schema \ - | gzip > $BACKUP_DIR/db_$DATE.sql.gz - -# Backup object storage -rsync -av /data/store/ $BACKUP_DIR/store_$DATE/ - -# Cleanup old backups (keep 7 days) -find $BACKUP_DIR -mtime +7 -delete - -echo "Backup completed: $DATE" -``` - -## Point-in-Time Recovery - -### Enable Binary Logging - -In MySQL configuration: - -```ini -[mysqld] -log-bin = mysql-bin -binlog-format = ROW -expire_logs_days = 7 -``` - -### Restore to Point in Time - -```bash -# Restore base backup -mysql -h host -u user -p < backup.sql - -# Apply binary logs up to specific time -mysqlbinlog --stop-datetime="2024-01-15 14:30:00" \ - mysql-bin.000001 mysql-bin.000002 \ - | mysql -h host -u user -p -``` - -## 
Schema-Level Export - -Export schema structure without data: - -```bash -# Structure only -mysqldump -h host -u user -p --no-data database_name > schema.sql -``` - -## Table-Level Backup - -Backup specific tables: - -```bash -mysqldump -h host -u user -p database_name table1 table2 > tables.sql -``` - -## DataJoint-Specific Considerations - -### Foreign Key Order - -When restoring, tables must be created in dependency order. mysqldump handles this automatically, but manual restoration may require: - -```bash -# Disable FK checks during restore -mysql -h host -u user -p -e "SET FOREIGN_KEY_CHECKS=0; SOURCE backup.sql; SET FOREIGN_KEY_CHECKS=1;" -``` - -### Jobs Tables - -Jobs tables (`~~table_name`) are recreated automatically. You can exclude them: - -```bash -# Exclude jobs tables from backup -mysqldump -h host -u user -p database_name \ - --ignore-table=database_name.~~table1 \ - --ignore-table=database_name.~~table2 \ - > backup.sql -``` - -### Blob Data - -Blobs stored internally (in database) are included in mysqldump. External objects need separate backup. - -## Verification - -### Verify Database Backup - -```bash -# Check backup file -gunzip -c backup.sql.gz | head -100 - -# Restore to test database -mysql -h host -u user -p test_restore < backup.sql -``` - -### Verify Object Storage - -```python -import datajoint as dj - -# Check external objects are accessible -for key in MyTable().keys(): - try: - (MyTable & key).fetch1('blob_column') - except Exception as e: - print(f"Missing: {key} - {e}") -``` - -## Disaster Recovery Plan - -1. **Regular backups**: Daily database, continuous object sync -2. **Offsite copies**: Replicate to different location/cloud -3. **Test restores**: Monthly restore verification -4. **Document procedures**: Written runbooks for recovery -5. **Monitor backups**: Alert on backup failures - -## See Also - -- [Configure Object Storage](configure-storage.md) — Storage setup -- [Manage Large Data](manage-large-data.md) — Object storage patterns - - ---- -## File: how-to/choose-storage-type.md - -# Choose a Storage Type - -Select the right storage codec for your data based on size, access patterns, and lifecycle requirements. - -## Quick Decision Tree - -``` -Start: What type of data are you storing? - -├─ Small data (typically < 1-10 MB per row)? -│ ├─ Python objects (dicts, arrays)? → Use (in-table) -│ └─ Files with filename? → Use (in-table) -│ -├─ Externally managed files? -│ └─ YES → Use (reference only) -│ └─ NO → Continue... -│ -├─ Need browsable storage or access by external tools? -│ └─ YES → Use or (schema-addressed) -│ └─ NO → Continue... -│ -├─ Need streaming or partial reads? -│ └─ YES → Use (schema-addressed, Zarr/HDF5) -│ └─ NO → Continue... -│ -├─ NumPy arrays that benefit from lazy loading? -│ └─ YES → Use (optimized NumPy storage) -│ └─ NO → Continue... -│ -├─ Python objects (dicts, arrays)? 
-│ └─ YES → Use (hash-addressed) -│ └─ NO → Use (files with filename preserved) -``` - -## Storage Types Overview - -| Codec | Location | Addressing | Python Objects | Dedup | Best For | -|-------|----------|------------|----------------|-------|----------| -| `` | In-table (database) | Row-based | ✅ Yes | No | Small Python objects (typically < 1-10 MB) | -| `` | In-table (database) | Row-based | ❌ No (file path) | No | Small files with filename preserved | -| `` | Object store | Content hash | ✅ Yes | Yes | Large Python objects (with dedup) | -| `` | Object store | Content hash | ❌ No (file path) | Yes | Large files with filename preserved | -| `` | Object store | Schema + key | ✅ Yes (arrays) | No | NumPy arrays (lazy load, navigable) | -| `` | Object store | Schema + key | ❌ No (you manage format) | No | Zarr, HDF5 (browsable, streaming) | -| `` | Object store | User path | ❌ No (you manage format) | No | External file references | - -## Key Usability: Python Object Convenience - -**Major advantage of ``, ``, and ``:** You work with Python objects directly. No manual serialization, file handling, or IO management. - -```python -# and : Insert Python objects, get Python objects back -@schema -class Analysis(dj.Computed): - definition = """ - -> Experiment - --- - results : # Any Python object: dicts, lists, arrays - """ - - def make(self, key): - # Insert nested Python structures directly - results = { - 'accuracy': 0.95, - 'confusion_matrix': np.array([[10, 2], [1, 15]]), - 'metadata': {'method': 'SVM', 'params': [1, 2, 3]} - } - self.insert1({**key, 'results': results}) - -# Fetch: Get Python object back (no manual unpickling) -data = (Analysis & key).fetch1('results') -print(data['accuracy']) # 0.95 -print(data['confusion_matrix']) # numpy array -``` - -```python -# : Insert array-like objects, get array-like objects back -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid - --- - traces : # NumPy arrays (no manual .npy files) - """ - -# Insert: Just pass the array -Recording.insert1({'recording_id': uuid.uuid4(), 'traces': np.random.randn(1000, 32)}) - -# Fetch: Get array-like object (NpyRef with lazy loading) -ref = (Recording & key).fetch1('traces') -print(ref.shape) # (1000, 32) - metadata without download -subset = ref[:100, :] # Lazy slicing -``` - -**Contrast with `` and ``:** You manage the format (Zarr, HDF5, etc.) and handle file IO yourself. More flexible, but requires format knowledge. - -## Detailed Decision Criteria - -### Size and Storage Location - -**Technical Limits:** -- **MySQL**: In-table blobs up to 4 GiB (`LONGBLOB`) -- **PostgreSQL**: In-table blobs unlimited (`BYTEA`) -- **Object stores**: Effectively unlimited (S3, file systems, etc.) - -**Practical Guidance:** - -The choice between in-table (``) and object storage (``, ``, ``) is a complex decision involving: - -- **Accessibility**: How fast do you need to access the data? 
-- **Cost**: Database storage vs object storage pricing -- **Performance**: Query speed, backup time, replication overhead - -**General recommendations:** - -Try to keep in-table blobs under ~1-10 MB, but this depends on your specific use case: - -```python -@schema -class Experiment(dj.Manual): - definition = """ - experiment_id : uuid - --- - metadata : # Small: config, parameters (< 1 MB) - thumbnail : # Medium: preview images (< 10 MB) - raw_data : # Large: raw recordings (> 10 MB) - """ -``` - -**When to use in-table storage (``):** -- Fast access needed (no external fetch) -- Data frequently queried alongside other columns -- Transactional consistency critical -- Automatic backup with database important -- No object storage configuration available - -**When to use object storage (``, ``, ``):** -- Data larger than ~10 MB -- Infrequent access patterns -- Need deduplication (hash-addressed types) -- Need browsable structure (schema-addressed types) -- Want to separate hot data (DB) from cold data (object store) - -**Examples by size:** -- **< 1 MB**: Configuration JSON, metadata, small parameter arrays → `` -- **1-10 MB**: Thumbnails, processed features, small waveforms → `` or `` depending on access pattern -- **10-100 MB**: Neural recordings, images, PDFs → `` or `` -- **> 100 MB**: Zarr arrays, HDF5 datasets, large videos → `` or `` - -### Access Pattern Guidelines - -**Full Access Every Time** - -Use `` (hash-addressed): - -```python -class ProcessedImage(dj.Computed): - definition = """ - -> RawImage - --- - processed : # Always load full image - """ -``` - -**Typical pattern:** -```python -# Fetch always gets full data -img = (ProcessedImage & key).fetch1('processed') -``` - ---- - -**Streaming / Partial Reads** - -Use `` (schema-addressed): - -```python -class ScanVolume(dj.Manual): - definition = """ - scan_id : uuid - --- - volume : # Stream chunks as needed - """ -``` - -**Typical pattern:** -```python -# Get reference without downloading -ref = (ScanVolume & key).fetch1('volume') - -# Stream specific chunks -import zarr -z = zarr.open(ref.fsmap, mode='r') -slice_data = z[100:200, :, :] # Fetch only this slice -``` - ---- - -**NumPy Arrays with Lazy Loading** - -Use `` (optimized for NumPy): - -```python -class NeuralActivity(dj.Computed): - definition = """ - -> Recording - --- - traces : # NumPy array, lazy load - """ -``` - -**Typical pattern:** -```python -# Returns NpyRef (lazy) -ref = (NeuralActivity & key).fetch1('traces') - -# Access like NumPy array (loads on demand) -subset = ref[:100, :] # Efficient slicing -shape = ref.shape # Metadata without loading -``` - -**Why `` over `` for arrays:** -- Lazy loading (doesn't load until accessed) -- Efficient slicing (can fetch subsets) -- Preserves shape/dtype metadata -- Native NumPy serialization - -### Lifecycle and Management - -**DataJoint-Managed (Integrated)** - -Use ``, ``, or ``: - -```python -class ManagedData(dj.Manual): - definition = """ - data_id : uuid - --- - content : # DataJoint manages lifecycle - """ -``` - -**DataJoint provides:** -- ✅ Automatic cleanup (garbage collection) -- ✅ Transactional integrity (atomic with database) -- ✅ Referential integrity (cascading deletes) -- ✅ Content deduplication (for ``, ``) - -**User manages:** -- ❌ File paths (DataJoint decides) -- ❌ Cleanup (automatic) -- ❌ Integrity (enforced) - ---- - -**User-Managed (References)** - -Use ``: - -```python -class ExternalData(dj.Manual): - definition = """ - data_id : uuid - --- - raw_file : # User manages file - """ -``` - -**User 
provides:** -- ✅ File paths (you control organization) -- ✅ File lifecycle (you create/delete) -- ✅ Existing files (reference external data) - -**DataJoint provides:** -- ✅ Path validation (file exists on insert) -- ✅ ObjectRef for lazy access -- ❌ No garbage collection -- ❌ No transaction safety for files -- ❌ No deduplication - -**Use when:** -- Files managed by external systems -- Referencing existing data archives -- Custom file organization required -- Large instrument output directories - -## Storage Type Comparison - -### In-Table: `` - -**Storage:** Database column (LONGBLOB) - -**Syntax:** -```python -small_data : -``` - -**Characteristics:** -- ✅ Fast access (in database) -- ✅ Transactional consistency -- ✅ Automatic backup -- ✅ No store configuration needed -- ✅ **Python object convenience**: Insert/fetch dicts, lists, arrays directly (no manual IO) -- ✅ Automatic serialization + gzip compression -- ✅ Technical limit: 4 GiB (MySQL), unlimited (PostgreSQL) -- ❌ Practical limit: Keep under ~1-10 MB for performance -- ❌ No deduplication -- ❌ Database bloat for large data - -**Best for:** -- Configuration JSON (dicts/lists) -- Small arrays/matrices -- Thumbnails -- Nested data structures - ---- - -### In-Table: `` - -**Storage:** Database column (LONGBLOB) - -**Syntax:** -```python -config_file : -``` - -**Characteristics:** -- ✅ Fast access (in database) -- ✅ Transactional consistency -- ✅ Automatic backup -- ✅ No store configuration needed -- ✅ **Filename preserved**: Original filename stored with content -- ✅ Automatic gzip compression -- ✅ Technical limit: 4 GiB (MySQL), unlimited (PostgreSQL) -- ❌ Practical limit: Keep under ~1-10 MB for performance -- ❌ No deduplication -- ❌ Returns file path (extracts to download directory), not Python object - -**Best for:** -- Small configuration files -- Document attachments (< 10 MB) -- Files where original filename matters -- When you need the file extracted to disk - -**Difference from ``:** -- ``: Stores Python objects (dicts, arrays) → returns Python object -- ``: Stores files with filename → returns local file path - ---- - -### Hash-Addressed: `` or `` - -**Storage:** Object store at `{store}/_hash/{schema}/{hash}` - -**Syntax:** -```python -data : # Default store -data : # Named store -file : # File attachments -``` - -**Characteristics (both):** -- ✅ Content deduplication (identical data stored once) -- ✅ Automatic gzip compression -- ✅ Garbage collection -- ✅ Transaction safety -- ✅ Referential integrity -- ✅ Moderate to large files (1 MB - 100 GB) -- ❌ Full download on fetch (no streaming) -- ❌ Storage path not browsable (hash-based) - -**`` specific:** -- ✅ **Python object convenience**: Insert/fetch dicts, lists, arrays directly (no manual IO) -- Returns: Python objects - -**`` specific:** -- ✅ **Filename preserved**: Original filename stored with content -- Returns: Local file path (extracts to download directory) - -**Best for ``:** -- Large Python objects (NumPy arrays, dicts) -- Processed results (nested structures) -- Any Python data with duplicates - -**Best for ``:** -- PDF/document files -- Images, videos -- Files where original filename/format matters - -**Key difference:** -- ``: Python objects in, Python objects out (no file handling) -- ``: Files in, file paths out (preserves filename) - ---- - -### Schema-Addressed: `` or `` - -**Storage:** Object store at `{store}/_schema/{schema}/{table}/{key}/{field}.{token}.ext` - -**Syntax:** -```python -array : # NumPy arrays -dataset : # Zarr, HDF5, custom -``` - 
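Because the storage path is derived from the schema, table, and primary key (the layout shown under **Storage:** above), the stored object can be opened by any Zarr- or HDF5-aware tool without going through DataJoint. The sketch below is illustrative only: it assumes a filesystem store at `/data/my-project` with the default `_schema` prefix, and the random token in the filename is assigned at insert time.

```python
import zarr

# Hypothetical path following the schema-addressed layout described above
path = (
    "/data/my-project/_schema/experiment/ScanVolume/"
    "scan_id=123e4567-e89b-12d3-a456-426614174000/volume.x8f2a9b1.zarr"
)
z = zarr.open(path, mode="r")  # readable by any Zarr-aware tool, not just DataJoint
print(z.shape, z.dtype)
```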
-**Characteristics:** -- ✅ Streaming access (no full download) -- ✅ Partial reads (fetch chunks) -- ✅ Browsable paths (organized by key) -- ✅ Accessible by external tools (not just DataJoint) -- ✅ Very large files (100 MB - TB+) -- ✅ Multi-file datasets (e.g., Zarr directory structures) -- ❌ No deduplication -- ❌ One file per field per row - -**Key advantages:** -- **Schema-addressed storage is browsable** - can be navigated and accessed by external tools (Zarr viewers, HDF5 utilities, direct filesystem access), not just through DataJoint -- **`` provides array convenience** - insert/fetch array-like objects directly (no manual .npy file handling) -- **`` provides flexibility** - you manage the format (Zarr, HDF5, custom), DataJoint provides storage and references - -**Best for:** -- ``: NumPy arrays with lazy loading (no manual IO) -- ``: Zarr arrays, HDF5 datasets, custom formats (you manage format) -- Large video files -- Multi-file experimental outputs -- Data that needs to be accessed by non-DataJoint tools - -**Difference `` vs ``:** -- ``: Insert/fetch array-like objects (like `` but lazy) - no manual .npy handling -- ``: You manage format and IO (Zarr, HDF5, custom) - more flexible but requires format knowledge - ---- - -### Filepath References: `` - -**Storage:** User-managed paths in object store - -**Syntax:** -```python -raw_data : # User-managed file -``` - -**Characteristics:** -- ✅ Reference existing files -- ✅ User controls paths -- ✅ External system compatibility -- ✅ Custom organization -- ❌ No lifecycle management -- ❌ No garbage collection -- ❌ No transaction safety -- ❌ No deduplication -- ❌ Must avoid `_hash/` and `_schema/` prefixes - -**Best for:** -- Large instrument data directories -- Externally managed archives -- Legacy data integration -- Custom file organization requirements - -## Common Scenarios - -### Scenario 1: Image Processing Pipeline - -```python -@schema -class RawImage(dj.Manual): - """Imported from microscope""" - definition = """ - image_id : uuid - --- - raw_file : # Reference microscope output - """ - -@schema -class CalibratedImage(dj.Computed): - """Calibrated, moderate size""" - definition = """ - -> RawImage - --- - calibrated : # 5 MB processed image - """ - -@schema -class Thumbnail(dj.Computed): - """Preview for dashboard""" - definition = """ - -> CalibratedImage - --- - preview : # 100 KB thumbnail, in-table - """ -``` - -**Rationale:** -- ``: Reference existing microscope files (large, externally managed) -- ``: Processed images (moderate size, deduplicated if reprocessed) -- ``: Thumbnails (tiny, fast access for UI) - ---- - -### Scenario 2: Electrophysiology Recording - -```python -@schema -class RecordingSession(dj.Manual): - """Recording metadata""" - definition = """ - session_id : uuid - --- - config : # 50 KB parameters, in-table - """ - -@schema -class ContinuousData(dj.Imported): - """Raw voltage traces""" - definition = """ - -> RecordingSession - --- - raw_voltage : # 10 GB Zarr array, streaming - """ - -@schema -class SpikeWaveforms(dj.Computed): - """Extracted spike shapes""" - definition = """ - -> ContinuousData - unit_id : uint32 - --- - waveforms : # 20 MB array, lazy load - """ - -@schema -class UnitStats(dj.Computed): - """Summary statistics""" - definition = """ - -> SpikeWaveforms - --- - stats : # 10 KB stats dict, in-table - """ -``` - -**Rationale:** -- ``: Config and stats (small metadata, fast access) -- ``: Raw voltage (huge, stream for spike detection) -- ``: Waveforms (moderate arrays, load for clustering) 
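To make this concrete, here is a rough sketch of how `SpikeWaveforms.make()` might tie these choices together: stream the raw voltage through its object reference and insert each unit's waveforms as a plain NumPy array. The `detect_spikes()` helper and the exact chunking strategy are hypothetical; the fetch and insert calls follow the codec patterns shown earlier in this guide.

```python
import numpy as np
import zarr

def make(self, key):
    """Sketch of SpikeWaveforms.make() for the scenario above."""
    # Schema-addressed raw voltage: the reference exposes an fsspec mapping,
    # so the detector reads only the chunks it touches instead of all 10 GB.
    ref = (ContinuousData & key).fetch1('raw_voltage')
    voltage = zarr.open(ref.fsmap, mode='r')

    # detect_spikes() is a hypothetical helper returning {unit_id: waveform array}
    for unit_id, waveforms in detect_spikes(voltage).items():
        self.insert1({**key, 'unit_id': unit_id, 'waveforms': np.asarray(waveforms)})
```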
- ---- - -### Scenario 3: Calcium Imaging Analysis - -```python -@schema -class Movie(dj.Manual): - """Raw calcium imaging movie""" - definition = """ - movie_id : uuid - --- - frames : # 2 GB TIFF stack, streaming - """ - -@schema -class SegmentedCells(dj.Computed): - """Cell masks""" - definition = """ - -> Movie - --- - masks : # 50 MB mask array, lazy load - """ - -@schema -class FluorescenceTraces(dj.Computed): - """Extracted time series""" - definition = """ - -> SegmentedCells - cell_id : uint32 - --- - trace : # 500 KB per cell, deduplicated - """ - -@schema -class TraceSummary(dj.Computed): - """Event detection results""" - definition = """ - -> FluorescenceTraces - --- - events : # 5 KB event times, in-table - """ -``` - -**Rationale:** -- ``: Movies (huge, stream for segmentation) -- ``: Masks (moderate, load for trace extraction) -- ``: Traces (per-cell, many rows, deduplication helps) -- ``: Event summaries (tiny, fast query results) - -## Configuration Examples - -### Single Store (Development) - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/my-project" - } - } -} -``` - -All `@` codecs use this store: -- `` → `/data/my-project/_hash/{schema}/{hash}` -- `` → `/data/my-project/_schema/{schema}/{table}/{key}/` - ---- - -### Multiple Stores (Production) - -```json -{ - "stores": { - "default": "main", - "filepath_default": "acquisition", - "main": { - "protocol": "file", - "location": "/data/processed" - }, - "acquisition": { - "protocol": "file", - "location": "/mnt/microscope" - }, - "archive": { - "protocol": "s3", - "bucket": "long-term-storage", - "location": "lab-data/archive" - } - } -} -``` - -Usage in table definitions: -```python -raw : # Uses filepath_default (acquisition) -processed : # Uses default (main) -backup : # Uses named store (archive) -``` - -## Performance Considerations - -### Read Performance - -| Codec | Random Access | Streaming | Latency | -|-------|---------------|-----------|---------| -| `` | ⚡ Excellent | N/A | <1ms | -| `` | ✅ Good | ❌ No | ~100ms | -| `` | ✅ Good (lazy) | ✅ Yes | ~100ms + chunk time | -| `` | ✅ Excellent | ✅ Yes | ~100ms + chunk time | -| `` | ✅ Good | ✅ Yes | ~100ms + network | - -### Write Performance - -| Codec | Insert Speed | Transaction Safe | Deduplication | -|-------|--------------|------------------|---------------| -| `` | ⚡ Fastest | ✅ Yes | ❌ No | -| `` | ✅ Fast | ✅ Yes | ✅ Yes | -| `` | ✅ Fast | ✅ Yes | ❌ No | -| `` | ✅ Fast | ✅ Yes | ❌ No | -| `` | ⚡ Fastest | ⚠️ Path only | ❌ No | - -### Storage Efficiency - -| Codec | Deduplication | Compression | Overhead | -|-------|---------------|-------------|----------| -| `` | ❌ No | ✅ gzip (automatic) | Low | -| `` | ✅ Yes | ✅ gzip (automatic) | Medium | -| `` | ❌ No | ⚠️ Format-specific | Low | -| `` | ❌ No | ⚠️ Format-specific | Low | -| `` | ❌ No | User-managed | Minimal | - -## Migration Between Storage Types - -### In-Table → Object Store - -```python -# Add new column with object storage -@schema -class MyTable(dj.Manual): - definition = """ - id : int - --- - data_old : # Legacy in-table - data_new : # New object storage - """ - -# Migrate data -for key in MyTable.fetch('KEY'): - old_data = (MyTable & key).fetch1('data_old') - (MyTable & key).update1({**key, 'data_new': old_data}) - -# After verification, drop old column via alter() -``` - -### Hash-Addressed → Schema-Addressed - -```python -# For large files that need streaming -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid - 
--- - data_blob : # Old: full download - data_stream : # New: streaming access - """ - -# Convert and store as Zarr -import zarr -for key in Recording.fetch('KEY'): - data = (Recording & key).fetch1('data_blob') - - # Create Zarr array - ref = (Recording & key).create_object_ref('data_stream', '.zarr') - z = zarr.open(ref.fsmap, mode='w', shape=data.shape, dtype=data.dtype) - z[:] = data - - # Update row - (Recording & key).update1({**key, 'data_stream': ref}) -``` - -## Troubleshooting - -### "DataJointError: Store not configured" - -**Problem:** Using `@` without store configuration - -**Solution:** -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/storage" - } - } -} -``` - -### "ValueError: Path conflicts with reserved section" - -**Problem:** `` path uses `_hash/` or `_schema/` - -**Solution:** Use different path: -```python -# Bad -table.insert1({'id': 1, 'file': '_hash/mydata.bin'}) # Error! - -# Good -table.insert1({'id': 1, 'file': 'raw/mydata.bin'}) # OK -``` - -### Data not deduplicated - -**Problem:** Using `` or `` expecting deduplication - -**Solution:** Use `` for deduplication: -```python -# No deduplication -data : - -# With deduplication -data : -``` - -### Out of memory loading large array - -**Problem:** Using `` for huge files - -**Solution:** Use `` or `` for streaming: -```python -# Bad: loads 10 GB into memory -large_data : - -# Good: streaming access -large_data : -``` - -## See Also - -- [Use Object Storage](use-object-storage.md) — How to use codecs in practice -- [Configure Object Storage](configure-storage.md) — Store configuration -- [Type System](../explanation/type-system.md) — Complete type system overview -- [Type System Specification](../reference/specs/type-system.md) — Technical details -- [NPY Codec Specification](../reference/specs/npy-codec.md) — NumPy array storage - - ---- -## File: how-to/configure-database.md - -# Configure Database Connection - -Set up your DataJoint database connection. - -> **Tip:** [DataJoint.com](https://datajoint.com) handles database configuration automatically with fully managed infrastructure and support. - -## Configuration Structure - -DataJoint separates configuration into two parts: - -1. **`datajoint.json`** — Non-sensitive settings (checked into version control) -2. **`.secrets/` directory** — Credentials and secrets (never committed) - -## Project Configuration (`datajoint.json`) - -Create `datajoint.json` in your project root for non-sensitive settings: - -```json -{ - "database.host": "db.example.com", - "database.port": 3306, - "database.use_tls": true, - "safemode": true -} -``` - -This file should be committed to version control. - -## Secrets Directory (`.secrets/`) - -Store credentials in `.secrets/datajoint.json`: - -```json -{ - "database.user": "myuser", - "database.password": "mypassword" -} -``` - -**Important:** Add `.secrets/` to your `.gitignore`: - -```gitignore -.secrets/ -``` - -## Environment Variables - -For CI/CD and production, use environment variables: - -```bash -export DJ_HOST=db.example.com -export DJ_USER=myuser -export DJ_PASS=mypassword -``` - -Environment variables take precedence over config files. 
- -## Configuration Settings - -| Setting | Environment | Default | Description | -|---------|-------------|---------|-------------| -| `database.host` | `DJ_HOST` | `localhost` | Database server hostname | -| `database.port` | `DJ_PORT` | `3306` | Database server port | -| `database.user` | `DJ_USER` | — | Database username | -| `database.password` | `DJ_PASS` | — | Database password | -| `database.use_tls` | `DJ_TLS` | `True` | Use TLS encryption | -| `database.reconnect` | — | `True` | Auto-reconnect on timeout | -| `safemode` | — | `True` | Prompt before destructive operations | - -## Test Connection - -```python -import datajoint as dj - -# Connects using configured credentials -conn = dj.conn() -print(f"Connected to {conn.host}") -``` - -## Programmatic Configuration - -For scripts, you can set configuration programmatically: - -```python -import datajoint as dj - -dj.config['database.host'] = 'localhost' -# Credentials from environment or secrets file -``` - -## Temporary Override - -```python -with dj.config.override(database={'host': 'test-server'}): - # Uses test-server for this block only - conn = dj.conn() -``` - -## Configuration Precedence - -1. Programmatic settings (highest priority) -2. Environment variables -3. `.secrets/datajoint.json` -4. `datajoint.json` -5. Default values (lowest priority) - -## TLS Configuration - -For production, always use TLS: - -```json -{ - "database.use_tls": true -} -``` - -For local development without TLS: - -```json -{ - "database.use_tls": false -} -``` - -## Connection Lifecycle - -### Persistent Connection (Default) - -DataJoint uses a persistent singleton connection by default: - -```python -import datajoint as dj - -# First call establishes connection -conn = dj.conn() - -# Subsequent calls return the same connection -conn2 = dj.conn() # Same as conn - -# Reset to create a new connection -conn3 = dj.conn(reset=True) # New connection -``` - -This is ideal for interactive sessions and notebooks. - -### Context Manager (Explicit Cleanup) - -For serverless environments (AWS Lambda, Cloud Functions) or when you need explicit connection lifecycle control, use the context manager: - -```python -import datajoint as dj - -with dj.Connection(host, user, password) as conn: - schema = dj.Schema('my_schema', connection=conn) - MyTable().insert(data) -# Connection automatically closed when exiting the block -``` - -The connection closes automatically even if an exception occurs: - -```python -try: - with dj.Connection(**creds) as conn: - schema = dj.Schema('my_schema', connection=conn) - MyTable().insert(data) - raise SomeError() -except SomeError: - pass -# Connection is still closed properly -``` - -### Manual Close - -You can also close a connection explicitly: - -```python -conn = dj.conn() -# ... do work ... -conn.close() -``` - - - ---- -## File: how-to/configure-storage.md - -# Configure Object Stores - -Set up S3, MinIO, or filesystem storage for DataJoint's Object-Augmented Schema (OAS). - -> **Tip:** [DataJoint.com](https://datajoint.com) provides pre-configured object stores integrated with your database—no setup required. - -## Overview - -DataJoint's Object-Augmented Schema (OAS) integrates relational tables with object storage as a single coherent system. Large data objects (arrays, files, Zarr datasets) are stored in file systems or cloud storage while maintaining full referential integrity with the relational database. - -**Storage models:** - -- **Hash-addressed** and **schema-addressed** storage are **integrated** into the OAS. 
DataJoint manages paths, lifecycle, integrity, garbage collection, transaction safety, and deduplication. -- **Filepath** storage stores only path strings. DataJoint provides no lifecycle management, garbage collection, transaction safety, or deduplication. Users control file creation, organization, and lifecycle. - -Storage is configured per-project using named stores. Each store can be used for: - -- **Hash-addressed storage** (``, ``) — content-addressed with deduplication using `_hash/` section -- **Schema-addressed storage** (``, ``) — key-based paths with streaming access using `_schema/` section -- **Filepath storage** (``) — user-managed paths anywhere in the store **except** `_hash/` and `_schema/` (reserved for DataJoint) - -Multiple stores can be configured for different data types or storage tiers. One store is designated as the default. - -## Configuration Methods - -DataJoint loads configuration in priority order: - -1. **Environment variables** (highest priority) -2. **Secrets directory** (`.secrets/`) -3. **Config file** (`datajoint.json`) -4. **Defaults** (lowest priority) - -## Single Store Configuration - -### File System Store - -For local or network-mounted storage: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/my-project/production" - } - } -} -``` - -Paths will be: - -- Hash: `/data/my-project/production/_hash/{schema}/{hash}` -- Schema: `/data/my-project/production/_schema/{schema}/{table}/{key}/` - -### S3 Store - -For Amazon S3 or S3-compatible storage: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-bucket", - "location": "my-project/production", - "secure": true - } - } -} -``` - -Store credentials separately in `.secrets/`: - -``` -.secrets/ -├── stores.main.access_key -└── stores.main.secret_key -``` - -Paths will be: - -- Hash: `s3://my-bucket/my-project/production/_hash/{schema}/{hash}` -- Schema: `s3://my-bucket/my-project/production/_schema/{schema}/{table}/{key}/` - -### MinIO Store - -MinIO uses the S3 protocol with a custom endpoint: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "s3", - "endpoint": "minio.example.com:9000", - "bucket": "datajoint", - "location": "lab-data", - "secure": false - } - } -} -``` - -## Multiple Stores Configuration - -Define multiple stores for different data types or storage tiers: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/my-project/main", - "partition_pattern": "subject_id/session_date" - }, - "raw": { - "protocol": "file", - "location": "/data/my-project/raw", - "subfolding": [2, 2] - }, - "archive": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive-bucket", - "location": "my-project/long-term" - } - } -} -``` - -Store credentials in `.secrets/`: - -``` -.secrets/ -├── stores.archive.access_key -└── stores.archive.secret_key -``` - -Use named stores in table definitions: - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid - --- - raw_data : # Hash: _hash/{schema}/{hash} - zarr_scan : # Schema: _schema/{schema}/{table}/{key}/ - summary : # Uses default store (main) - old_data : # Archive store, hash-addressed - """ -``` - -Notice that `` and `` both use the "raw" store, just different `_hash` and `_schema` sections. 
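As a rough sketch of how these attributes behave at fetch time (following the codec patterns described in the storage how-to guides): hash-addressed attributes are downloaded and deserialized into Python objects, while schema-addressed attributes come back as lazy references that stream on demand.

```python
import zarr

# key: primary key dict of an existing Recording row (illustrative)
row = (Recording & key).fetch1()

raw = row['raw_data']        # hash-addressed: full object downloaded and deserialized
summary = row['summary']     # default store, same behavior

scan_ref = row['zarr_scan']  # schema-addressed: lazy reference, nothing downloaded yet
scan = zarr.open(scan_ref.fsmap, mode='r')  # stream chunks as needed
```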
- -**Example paths with partitioning:** - -For a Recording with `subject_id=042`, `session_date=2024-01-15` in the main store: -``` -/data/my-project/main/_schema/subject_id=042/session_date=2024-01-15/experiment/Recording/recording_id=uuid-value/zarr_scan.x8f2a9b1.zarr -``` - -Without those attributes, it follows normal structure: -``` -/data/my-project/main/_schema/experiment/Recording/recording_id=uuid-value/zarr_scan.x8f2a9b1.zarr -``` - -## Verify Configuration - -```python -import datajoint as dj - -# Check default store -spec = dj.config.get_store_spec() # Uses stores.default -print(spec) - -# Check named store -spec = dj.config.get_store_spec("archive") -print(spec) - -# List all configured stores -print(dj.config.stores.keys()) -``` - -## Configuration Options - -| Option | Required | Description | -|--------|----------|-------------| -| `stores.default` | Yes | Name of the default store | -| `stores..protocol` | Yes | `file`, `s3`, `gcs`, or `azure` | -| `stores..location` | Yes | Base path or prefix (includes project context) | -| `stores..bucket` | S3/GCS | Bucket name | -| `stores..endpoint` | S3 | S3 endpoint URL | -| `stores..secure` | No | Use HTTPS (default: true) | -| `stores..access_key` | S3 | Access key ID (store in `.secrets/`) | -| `stores..secret_key` | S3 | Secret access key (store in `.secrets/`) | -| `stores..subfolding` | No | Hash-addressed hierarchy: `[2, 2]` for 2-level nesting (default: no subfolding) | -| `stores..partition_pattern` | No | Schema-addressed path partitioning: `"subject_id/session_date"` (default: no partitioning) | -| `stores..token_length` | No | Random token length for schema-addressed filenames (default: `8`) | - -## Subfolding (Hash-Addressed Storage Only) - -Hash-addressed storage (``, ``) stores content using a Base32-encoded hash as the filename. By default, all files are stored in a flat directory structure: - -``` -_hash/{schema}/abcdefghijklmnopqrstuvwxyz -``` - -Some filesystems perform poorly with large directories (thousands of files). Subfolding creates a directory hierarchy to distribute files: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/store", - "project_name": "my_project", - "subfolding": [2, 2] - } - } -} -``` - -With `[2, 2]` subfolding, hash-addressed paths become: - -``` -_hash/{schema}/ab/cd/abcdefghijklmnopqrstuvwxyz -``` - -Schema-addressed storage (``, ``) does not use subfolding—it uses key-based paths: - -``` -{location}/_schema/{partition}/{schema}/{table}/{key}/{field_name}.{token}.{ext} -``` - -### Filesystem Recommendations - -| Filesystem | Subfolding Needed | Notes | -|------------|-------------------|-------| -| ext3 | Yes | Limited directory indexing | -| FAT32/exFAT | Yes | Linear directory scans | -| NFS | Yes | Network latency amplifies directory lookups | -| CIFS/SMB | Yes | Windows network shares | -| ext4 | No | HTree indexing handles large directories | -| XFS | No | B+ tree directories scale well | -| ZFS | No | Efficient directory handling | -| Btrfs | No | B-tree based | -| S3/MinIO | No | Object storage uses hash-based lookups | -| GCS | No | Object storage | -| Azure Blob | No | Object storage | - -**Recommendation:** Use `[2, 2]` for network-mounted filesystems and legacy systems. -Modern local filesystems and cloud object storage work well without subfolding. - -## URL Representation - -DataJoint uses consistent URL representation for all storage backends internally. 
This means: - -- Local filesystem paths are represented as `file://` URLs -- S3 paths use `s3://bucket/path` -- GCS paths use `gs://bucket/path` -- Azure paths use `az://container/path` - -You can use either format when specifying paths: - -```python -# Both are equivalent for local files -"/data/myfile.dat" -"file:///data/myfile.dat" -``` - -This unified approach enables: - -- **Consistent internal handling** across all storage types -- **Seamless switching** between local and cloud storage -- **Integration with fsspec** for streaming access - -## Customizing Storage Sections - -Each store is divided into sections for different storage types. By default, DataJoint uses `_hash/` for hash-addressed storage and `_schema/` for schema-addressed storage. You can customize the path prefix for each section using the `*_prefix` configuration parameters to map DataJoint to existing storage layouts: - -```json -{ - "stores": { - "legacy": { - "protocol": "file", - "location": "/data/existing_storage", - "hash_prefix": "content_addressed", - "schema_prefix": "structured_data", - "filepath_prefix": "raw_files" - } - } -} -``` - -**Requirements:** - -- Sections must be mutually exclusive (path prefixes cannot nest) -- The `hash_prefix` and `schema_prefix` sections are reserved for DataJoint-managed storage -- The `filepath_prefix` is optional (`null` = unrestricted, or set a required prefix) - -**Example with hierarchical layout:** - -```json -{ - "stores": { - "organized": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "neuroscience-data", - "location": "lab-project-2024", - "hash_prefix": "managed/blobs", // Path prefix for hash section - "schema_prefix": "managed/arrays", // Path prefix for schema section - "filepath_prefix": "imported" // Path prefix for filepath section - } - } -} -``` - -Storage section paths become: - -- Hash: `s3://neuroscience-data/lab-project-2024/managed/blobs/{schema}/{hash}` -- Schema: `s3://neuroscience-data/lab-project-2024/managed/arrays/{schema}/{table}/{key}/` -- Filepath: `s3://neuroscience-data/lab-project-2024/imported/{user_path}` - -## Reserved Sections and Filepath Storage - -DataJoint reserves sections within each store for managed storage. These sections are defined by prefix configuration parameters: - -- **Hash-addressed section** (configured via `hash_prefix`, default: `_hash/`) — Content-addressed storage for `` and `` with deduplication -- **Schema-addressed section** (configured via `schema_prefix`, default: `_schema/`) — Key-based storage for `` and `` with streaming access - -### User-Managed Filepath Storage - -The `` codec stores paths to files that you manage. DataJoint does not manage lifecycle (no garbage collection), integrity (no transaction safety), or deduplication for filepath storage. You can reference existing files or create new ones—DataJoint simply stores the path string. Files can be anywhere in the store **except** the reserved sections: - -```python -@schema -class RawData(dj.Manual): - definition = """ - session_id : int - --- - recording : # User-managed file path - """ - -# Valid paths (user-managed) -table.insert1({'session_id': 1, 'recording': 'subject01/session001/data.bin'}) # Existing or new file -table.insert1({'session_id': 2, 'recording': 'raw/experiment_2024/data.nwb'}) # Existing or new file - -# Invalid paths (reserved for DataJoint - will raise ValueError) -# These use the default prefixes (_hash and _schema) -table.insert1({'session_id': 3, 'recording': '_hash/abc123...'}) # Error! 
-table.insert1({'session_id': 4, 'recording': '_schema/myschema/...'}) # Error! - -# If you configured custom prefixes like "content_addressed", those would also be blocked -# table.insert1({'session_id': 5, 'recording': 'content_addressed/file.dat'}) # Error! -``` - -**Key characteristics of ``:** - -- Stores path string only (DataJoint does not manage the files) -- No lifecycle management: no garbage collection, no transaction safety, no deduplication -- User controls file creation, organization, and deletion -- Can reference existing files or create new ones -- Returns ObjectRef for lazy access on fetch -- Validates file exists on insert -- Cannot use reserved sections (configured by `hash_prefix` and `schema_prefix`) -- Can be restricted to specific prefix using `filepath_prefix` configuration - -## See Also - -- [Use Object Storage](use-object-storage.md) — When and how to use object storage -- [Manage Large Data](manage-large-data.md) — Working with blobs and objects - - ---- -## File: how-to/create-custom-codec.md - -# Create Custom Codecs - -Define domain-specific types for seamless storage and retrieval. - -## Overview - -Codecs transform Python objects for storage. Create custom codecs for: - -- Domain-specific data types (graphs, images, alignments) -- Specialized serialization formats -- Integration with external libraries - -## Basic Codec Structure - -```python -import datajoint as dj - -class GraphCodec(dj.Codec): - """Store NetworkX graphs.""" - - name = "graph" # Used as in definitions - - def get_dtype(self, is_store: bool) -> str: - return "" # Delegate to blob for serialization - - def encode(self, value, *, key=None, store_name=None): - import networkx as nx - assert isinstance(value, nx.Graph) - return list(value.edges) - - def decode(self, stored, *, key=None): - import networkx as nx - return nx.Graph(stored) -``` - -## Use in Table Definition - -```python -@schema -class Connectivity(dj.Manual): - definition = """ - conn_id : int - --- - network : # Uses GraphCodec - network_large : # External storage - """ -``` - -## Required Methods - -### `get_dtype(is_store)` - -Return the storage type: - -- `is_store=False`: Inline storage (in database column) -- `is_store=True`: Object store (with `@` modifier) - -```python -def get_dtype(self, is_store: bool) -> str: - if is_store: - return "" # Hash-addressed storage - return "bytes" # Inline database blob -``` - -Common return values: - -- `"bytes"` — Binary in database -- `"json"` — JSON in database -- `""` — Chain to blob codec (hash-addressed when `@`) -- `""` — Hash-addressed storage - -### `encode(value, *, key=None, store_name=None)` - -Convert Python object to storable format: - -```python -def encode(self, value, *, key=None, store_name=None): - # value: Python object to store - # key: Primary key dict (for path construction) - # store_name: Target store name - return serialized_representation -``` - -### `decode(stored, *, key=None)` - -Reconstruct Python object: - -```python -def decode(self, stored, *, key=None): - # stored: Data from storage - # key: Primary key dict - return python_object -``` - -## Optional: Validation - -Override `validate()` for type checking: - -```python -def validate(self, value): - import networkx as nx - if not isinstance(value, nx.Graph): - raise TypeError(f"Expected nx.Graph, got {type(value).__name__}") -``` - -## Codec Chaining - -Codecs can delegate to other codecs: - -```python -class ImageCodec(dj.Codec): - name = "image" - - def get_dtype(self, is_store: bool) -> str: - return 
"" # Chain to blob codec - - def encode(self, value, *, key=None, store_name=None): - # Convert PIL Image to numpy array - # Blob codec handles numpy serialization - return np.array(value) - - def decode(self, stored, *, key=None): - from PIL import Image - return Image.fromarray(stored) -``` - -## Store-Only Codecs - -Some codecs require object storage (@ modifier): - -```python -class ZarrCodec(dj.Codec): - name = "zarr" - - def get_dtype(self, is_store: bool) -> str: - if not is_store: - raise DataJointError(" requires @ (store only)") - return "" # Schema-addressed storage - - def encode(self, path, *, key=None, store_name=None): - return path # Path to zarr directory - - def decode(self, stored, *, key=None): - return stored # Returns ObjectRef for lazy access -``` - -For custom file formats, consider inheriting from `SchemaCodec`: - -```python -class ParquetCodec(dj.SchemaCodec): - """Store DataFrames as Parquet files.""" - name = "parquet" - - # get_dtype inherited: requires @, returns "json" - - def encode(self, df, *, key=None, store_name=None): - schema, table, field, pk = self._extract_context(key) - path, _ = self._build_path(schema, table, field, pk, ext=".parquet") - backend = self._get_backend(store_name) - # ... upload parquet file - return {"path": path, "store": store_name, "shape": list(df.shape)} - - def decode(self, stored, *, key=None): - return ParquetRef(stored, self._get_backend(stored.get("store"))) -``` - -## Auto-Registration - -Codecs register automatically when defined: - -```python -class MyCodec(dj.Codec): - name = "mytype" # Registers as - ... - -# Now usable in table definitions: -# my_attr : -``` - -Skip registration for abstract bases: - -```python -class BaseCodec(dj.Codec, register=False): - # Abstract base, not registered - pass -``` - -## Complete Example - -```python -import datajoint as dj -import SimpleITK as sitk -import numpy as np - -class MedicalImageCodec(dj.Codec): - """Store SimpleITK medical images with metadata.""" - - name = "medimage" - - def get_dtype(self, is_store: bool) -> str: - return "" if is_store else "" - - def encode(self, image, *, key=None, store_name=None): - return { - 'array': sitk.GetArrayFromImage(image), - 'spacing': image.GetSpacing(), - 'origin': image.GetOrigin(), - 'direction': image.GetDirection(), - } - - def decode(self, stored, *, key=None): - image = sitk.GetImageFromArray(stored['array']) - image.SetSpacing(stored['spacing']) - image.SetOrigin(stored['origin']) - image.SetDirection(stored['direction']) - return image - - def validate(self, value): - if not isinstance(value, sitk.Image): - raise TypeError(f"Expected sitk.Image, got {type(value).__name__}") - - -@schema -class Scan(dj.Manual): - definition = """ - scan_id : uuid - --- - ct_image : # CT scan with metadata - """ -``` - -## See Also - -- [Use Object Storage](use-object-storage.md) — Storage patterns -- [Manage Large Data](manage-large-data.md) — Working with large objects - - ---- -## File: how-to/define-tables.md - -# Define Tables - -Create DataJoint table classes with proper definitions. - -## Create a Schema - -```python -import datajoint as dj - -schema = dj.Schema('my_schema') # Creates schema in database if it doesn't exist -``` - -The `Schema` object connects to the database and creates the schema (database) if it doesn't already exist. 
- -## Basic Table Structure - -```python -@schema -class MyTable(dj.Manual): - definition = """ - # Table comment (optional) - primary_attr : type # attribute comment - --- - secondary_attr : type # attribute comment - optional_attr = null : type - """ -``` - -## Table Types - -| Type | Base Class | Purpose | -|------|------------|---------| -| Manual | `dj.Manual` | User-entered data | -| Lookup | `dj.Lookup` | Reference data with `contents` | -| Imported | `dj.Imported` | Data from external sources | -| Computed | `dj.Computed` | Derived data | -| Part | `dj.Part` | Child of master table | - -## Primary Key (Above `---`) - -```python -definition = """ -subject_id : varchar(16) # Subject identifier -session_idx : uint16 # Session number ---- -... -""" -``` - -Primary key attributes: - -- Cannot be NULL -- Must be unique together -- Cannot be changed after insertion - -## Secondary Attributes (Below `---`) - -```python -definition = """ -... ---- -session_date : date # Required attribute -notes = '' : varchar(1000) # Optional with default -score = null : float32 # Nullable attribute -""" -``` - -## Default Values and Nullable Attributes - -Default values are specified with `= value` before the type: - -```python -definition = """ -subject_id : varchar(16) ---- -weight = null : float32 # Nullable (default is NULL) -notes = '' : varchar(1000) # Default empty string -is_active = 1 : bool # Default true -created = CURRENT_TIMESTAMP : timestamp -""" -``` - -**Key rules:** - -- The **only** way to make an attribute nullable is `= null` -- Attributes without defaults are required (NOT NULL) -- Primary key attributes cannot be nullable -- Primary key attributes cannot have static defaults - -**Timestamp defaults:** - -Primary keys can use time-dependent defaults like `CURRENT_TIMESTAMP`: - -```python -definition = """ -created_at = CURRENT_TIMESTAMP : timestamp(6) # Microsecond precision ---- -data : -""" -``` - -Timestamp precision options: - -- `timestamp` or `datetime` — Second precision -- `timestamp(3)` or `datetime(3)` — Millisecond precision -- `timestamp(6)` or `datetime(6)` — Microsecond precision - -## Auto-Increment (Not Recommended) - -DataJoint core types do not support `AUTO_INCREMENT`. This is intentional—explicit key values enforce entity integrity and prevent silent creation of duplicate records. - -Use `uuid` or natural keys instead: - -```python -definition = """ -recording_id : uuid # Globally unique, client-generated ---- -... -""" -``` - -If you must use auto-increment, native MySQL types allow it (with a warning): - -```python -definition = """ -record_id : int unsigned auto_increment # Native type ---- -... -""" -``` - -See [Design Primary Keys](design-primary-keys.md) for detailed guidance on key selection and why DataJoint avoids auto-increment. 
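
As a brief sketch of the recommended alternative (the `Recording` table and its attributes are hypothetical), a client-generated `uuid` key is known before insertion, so the same key can be reused immediately for restrictions or related inserts:

```python
import uuid

rec_key = {'recording_id': uuid.uuid4()}   # key exists before the insert
Recording.insert1({**rec_key, 'sampling_rate': 30000.0})

# the same key can be used right away, e.g. to restrict a query
print((Recording & rec_key).fetch1())
```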
- -## Core DataJoint Types - -| Type | Description | -|------|-------------| -| `bool` | Boolean (true/false) | -| `int8`, `int16`, `int32`, `int64` | Signed integers | -| `uint8`, `uint16`, `uint32`, `uint64` | Unsigned integers | -| `float32`, `float64` | Floating point | -| `decimal(m,n)` | Fixed precision decimal | -| `varchar(n)` | Variable-length string | -| `char(n)` | Fixed-length string | -| `date` | Date (YYYY-MM-DD) | -| `datetime` | Date and time | -| `datetime(3)` | With millisecond precision | -| `datetime(6)` | With microsecond precision | -| `uuid` | UUID type | -| `enum('a', 'b', 'c')` | Enumerated values | -| `json` | JSON data | -| `bytes` | Raw binary data | - -## Built-in Codecs - -Codecs serialize Python objects to database storage. Use angle brackets for codec types: - -| Codec | Description | -|-------|-------------| -| `` | Serialized Python objects (NumPy arrays, etc.) stored in database | -| `` | Serialized objects in object storage | -| `` | File attachments in database | -| `` | File attachments in object storage | -| `` | Files/folders via ObjectRef (path-addressed, supports Zarr/HDF5) | - -Example: - -```python -definition = """ -recording_id : uuid ---- -neural_data : # NumPy array in 'raw' store -config_file : # Attached file in database -parameters : json # JSON data (core type, no brackets) -""" -``` - -## Native Database Types - -You can also use native MySQL/MariaDB types directly when needed: - -```python -definition = """ -record_id : int unsigned # Native MySQL type -data : mediumblob # For larger binary data -description : text # Unlimited text -""" -``` - -Native types are flagged with a warning at declaration time but are allowed. Core DataJoint types (like `int32`, `float64`) are portable and recommended for most use cases. Native database types provide access to database-specific features when needed. - -## Foreign Keys - -```python -@schema -class Session(dj.Manual): - definition = """ - -> Subject # References Subject table - session_idx : uint16 - --- - session_date : date - """ -``` - -The `->` inherits primary key attributes from the referenced table. 
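
To make the dependency concrete (attribute values are illustrative, and `Subject` is assumed to be defined as in the earlier examples), inserting into `Session` supplies the inherited `subject_id` along with its own key attribute, and the referenced `Subject` row must already exist:

```python
Subject.insert1({'subject_id': 'M001', 'species': 'Mus musculus'})

# OK: subject 'M001' exists
Session.insert1({'subject_id': 'M001', 'session_idx': 1, 'session_date': '2026-01-20'})

# Raises an integrity error: no subject 'M999' has been inserted
# Session.insert1({'subject_id': 'M999', 'session_idx': 1, 'session_date': '2026-01-20'})
```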
- -## Lookup Tables with Contents - -```python -@schema -class TaskType(dj.Lookup): - definition = """ - task_type : varchar(32) - --- - description : varchar(200) - """ - contents = [ - {'task_type': 'detection', 'description': 'Detect target stimulus'}, - {'task_type': 'discrimination', 'description': 'Distinguish between stimuli'}, - ] -``` - -## Part Tables - -```python -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 - --- - session_date : date - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - outcome : enum('hit', 'miss') - reaction_time : float32 - """ -``` - -## Computed Tables - -```python -@schema -class SessionStats(dj.Computed): - definition = """ - -> Session - --- - n_trials : uint32 - hit_rate : float32 - """ - - def make(self, key): - trials = (Session.Trial & key).to_dicts() - self.insert1({ - **key, - 'n_trials': len(trials), - 'hit_rate': sum(t['outcome'] == 'hit' for t in trials) / len(trials) - }) -``` - -## Indexes - -Declare indexes at the end of the definition, after all attributes: - -```python -definition = """ -subject_id : varchar(16) -session_idx : uint16 ---- -session_date : date -experimenter : varchar(50) -index (session_date) # Index for faster queries -index (experimenter) # Another index -unique index (external_id) # Unique constraint -""" -``` - -## Declaring Tables - -Tables are declared in the database when the `@schema` decorator applies to the class: - -```python -@schema # Table is declared here -class Session(dj.Manual): - definition = """ - session_id : uint16 - --- - session_date : date - """ -``` - -The decorator reads the `definition` string, parses it, and creates the corresponding table in the database if it doesn't exist. - -## Dropping Tables and Schemas - -During prototyping (before data are populated), you can drop and recreate tables: - -```python -# Drop a single table -Session.drop() - -# Drop entire schema (all tables) -schema.drop() -``` - -**Warning:** These operations permanently delete data. Use only during development. - -## View Table Definition - -```python -# Show SQL definition -print(Session().describe()) - -# Show heading -print(Session().heading) -``` - - ---- -## File: how-to/delete-data.md - -# Delete Data - -Remove data safely with proper cascade handling. - -## Basic Delete - -Delete rows matching a restriction: - -```python -# Delete specific subject -(Subject & {'subject_id': 'M001'}).delete() - -# Delete with condition -(Session & 'session_date < "2024-01-01"').delete() -``` - -## Cascade Behavior - -Deleting a row automatically cascades to all dependent tables: - -```python -# Deletes subject AND all their sessions AND all trials -(Subject & {'subject_id': 'M001'}).delete() -``` - -This maintains referential integrity—no orphaned records remain. - -## Confirmation Prompt - -The `prompt` parameter controls confirmation behavior: - -```python -# Uses dj.config['safemode'] setting (default behavior) -(Subject & key).delete() - -# Explicitly skip confirmation -(Subject & key).delete(prompt=False) - -# Explicitly require confirmation -(Subject & key).delete(prompt=True) -``` - -When prompted, you'll see what will be deleted: - -``` -About to delete: - 1 rows from `lab`.`subject` - 5 rows from `lab`.`session` - 127 rows from `lab`.`trial` - -Proceed? 
[yes, No]: -``` - -## Safe Mode Configuration - -Control the default prompting behavior: - -```python -import datajoint as dj - -# Check current setting -print(dj.config['safemode']) - -# Disable prompts globally (use with caution) -dj.config['safemode'] = False - -# Re-enable prompts -dj.config['safemode'] = True -``` - -Or temporarily override: - -```python -with dj.config.override(safemode=False): - (Subject & restriction).delete() -``` - -## Transaction Handling - -Deletes are atomic—all cascading deletes succeed or none do: - -```python -# All-or-nothing delete (default) -(Subject & restriction).delete(transaction=True) -``` - -Within an existing transaction: - -```python -with dj.conn().transaction: - (Table1 & key1).delete(transaction=False) - (Table2 & key2).delete(transaction=False) - Table3.insert(rows) -``` - -## Part Tables - -Part tables cannot be deleted directly by default (master-part integrity): - -```python -# This raises an error -Session.Trial.delete() # DataJointError - -# Delete from master instead (cascades to parts) -(Session & key).delete() -``` - -Use `part_integrity` to control this behavior: - -```python -# Allow direct deletion (breaks master-part integrity) -(Session.Trial & key).delete(part_integrity="ignore") - -# Delete parts AND cascade up to delete master -(Session.Trial & key).delete(part_integrity="cascade") -``` - -| Policy | Behavior | -|--------|----------| -| `"enforce"` | (default) Error if parts deleted without masters | -| `"ignore"` | Allow deleting parts without masters | -| `"cascade"` | Also delete masters when parts are deleted | - -## Quick Delete - -Delete without cascade (fails if dependent rows exist): - -```python -# Only works if no dependent tables have matching rows -(Subject & key).delete_quick() -``` - -## Delete Patterns - -### By Primary Key - -```python -(Session & {'subject_id': 'M001', 'session_idx': 1}).delete() -``` - -### By Condition - -```python -(Trial & 'outcome = "miss"').delete() -``` - -### By Join - -```python -# Delete trials from sessions before 2024 -old_sessions = Session & 'session_date < "2024-01-01"' -(Trial & old_sessions).delete() -``` - -### All Rows - -```python -# Delete everything in table (and dependents) -MyTable.delete() -``` - -## The Recomputation Pattern - -When source data needs correction, use **delete → insert → populate**: - -```python -key = {'subject_id': 'M001', 'session_idx': 1} - -# 1. Delete cascades to computed tables -(Session & key).delete(prompt=False) - -# 2. Reinsert with corrected data -with dj.conn().transaction: - Session.insert1({**key, 'session_date': '2024-01-08', 'duration': 40.0}) - Session.Trial.insert(corrected_trials) - -# 3. Recompute derived data -ProcessedData.populate() -``` - -This ensures all derived data remains consistent with source data. - -## Return Value - -`delete()` returns the count of deleted rows from the primary table: - -```python -count = (Subject & restriction).delete(prompt=False) -print(f"Deleted {count} subjects") -``` - -## See Also - -- [Model Relationships](model-relationships.ipynb) — Foreign key patterns -- [Insert Data](insert-data.md) — Adding data to tables -- [Run Computations](run-computations.md) — Recomputing after changes - - ---- -## File: how-to/design-primary-keys.md - -# Design Primary Keys - -Choose effective primary keys for your tables. 
- -## Primary Key Principles - -Primary key attributes: - -- Uniquely identify each entity -- Cannot be NULL -- Cannot be changed after insertion -- Are inherited by dependent tables via foreign keys - -## Natural Keys - -Use meaningful identifiers when they exist: - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) # Lab-assigned ID like 'M001' - --- - species : varchar(32) - """ -``` - -**Good candidates:** -- Lab-assigned IDs -- Standard identifiers (NCBI accession, DOI) -- Meaningful codes with enforced uniqueness - -## Composite Keys - -Combine attributes when a single attribute isn't unique: - -```python -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 # Session number within subject - --- - session_date : date - """ -``` - -The primary key is `(subject_id, session_idx)`. - -## Surrogate Keys - -Use UUIDs when natural keys don't exist: - -```python -@schema -class Experiment(dj.Manual): - definition = """ - experiment_id : uuid - --- - description : varchar(500) - """ -``` - -Generate UUIDs: - -```python -import uuid - -Experiment.insert1({ - 'experiment_id': uuid.uuid4(), - 'description': 'Pilot study' -}) -``` - -## Why DataJoint Avoids Auto-Increment - -DataJoint discourages `auto_increment` for primary keys: - -1. **Encourages lazy design** — Users treat it as "row number" rather than thinking about what uniquely identifies the entity in their domain. - -2. **Incompatible with composite keys** — DataJoint schemas routinely use composite keys like `(subject_id, session_idx, trial_idx)`. MySQL allows only one auto_increment column per table, and it must be first in the key. - -3. **Breaks reproducibility** — Auto_increment values depend on insertion order. Rebuilding a pipeline produces different IDs. - -4. **No client-server handshake** — The client discovers the ID only *after* insertion, complicating error handling and concurrent access. - -5. **Meaningless foreign keys** — Downstream tables inherit opaque integers rather than traceable lineage. - -**Instead, use:** -- Natural keys that identify entities in your domain -- UUIDs when no natural identifier exists -- Composite keys combining foreign keys with sequence numbers - -## Foreign Keys in Primary Key - -Foreign keys above the `---` become part of the primary key: - -```python -@schema -class Trial(dj.Manual): - definition = """ - -> Session # In primary key - trial_idx : uint16 # In primary key - --- - -> Stimulus # NOT in primary key - outcome : enum('hit', 'miss') - """ -``` - -## Key Design Guidelines - -### Keep Keys Small - -Prefer `uint16` over `int64` when the range allows: - -```python -# Good: Appropriate size -session_idx : uint16 # Max 65,535 sessions per subject - -# Avoid: Unnecessarily large -session_idx : int64 # Wastes space, slower joins -``` - -### Use Fixed-Width for Joins - -Fixed-width types join faster: - -```python -# Good: Fixed width -subject_id : char(8) - -# Acceptable: Variable width -subject_id : varchar(16) -``` - -### Avoid Dates as Primary Keys - -Dates alone rarely guarantee uniqueness: - -```python -# Bad: Date might not be unique -session_date : date ---- -... 
- -# Good: Add a sequence number --> Subject -session_idx : uint16 ---- -session_date : date -``` - -### Avoid Computed Values - -Primary keys should be stable inputs, not derived: - -```python -# Bad: Derived from other data -hash_id : varchar(64) # MD5 of some content - -# Good: Assigned identifier -recording_id : uuid -``` - -## Migration Considerations - -Once a table has data, primary keys cannot be changed. Plan carefully: - -```python -# Consider future needs -@schema -class Scan(dj.Manual): - definition = """ - -> Session - scan_idx : uint8 # Might need uint16 for high-throughput - --- - ... - """ -``` - -## See Also - -- [Define Tables](define-tables.md) — Table definition syntax -- [Model Relationships](model-relationships.ipynb) — Foreign key patterns - - ---- -## File: how-to/distributed-computing.md - -# Distributed Computing - -Run computations across multiple workers with job coordination. - -## Enable Distributed Mode - -Use `reserve_jobs=True` to enable job coordination: - -```python -# Single worker (default) -ProcessedData.populate() - -# Distributed mode with job reservation -ProcessedData.populate(reserve_jobs=True) -``` - -## How It Works - -With `reserve_jobs=True`: -1. Worker checks the jobs table for pending work -2. Atomically reserves a job before processing -3. Other workers see the job as reserved and skip it -4. On completion, job is marked success (or error) - -## Multi-Process on Single Machine - -```python -# Use multiple processes -ProcessedData.populate(reserve_jobs=True, processes=4) -``` - -Each process: - -- Opens its own database connection -- Reserves jobs independently -- Processes in parallel - -## Multi-Machine Cluster - -Run the same script on multiple machines: - -```python -# worker_script.py - run on each machine -import datajoint as dj -from my_pipeline import ProcessedData - -# Each worker reserves and processes different jobs -ProcessedData.populate( - reserve_jobs=True, - display_progress=True, - suppress_errors=True -) -``` - -Workers automatically coordinate through the jobs table. 
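
On a single machine, the same coordination can be exercised by launching several copies of the worker script as independent processes (a minimal sketch using only the standard library; `worker_script.py` refers to the example above):

```python
import subprocess
import sys

# start four independent workers; each opens its own connection
# and reserves a disjoint set of jobs through the jobs table
workers = [subprocess.Popen([sys.executable, "worker_script.py"]) for _ in range(4)]
for w in workers:
    w.wait()
```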
- -## Job Table - -Each auto-populated table has a jobs table (`~~table_name`): - -```python -# View job status -ProcessedData.jobs - -# Filter by status -ProcessedData.jobs.pending -ProcessedData.jobs.reserved -ProcessedData.jobs.errors -ProcessedData.jobs.completed -``` - -## Job Statuses - -| Status | Description | -|--------|-------------| -| `pending` | Queued, ready to process | -| `reserved` | Being processed by a worker | -| `success` | Completed successfully | -| `error` | Failed with error | -| `ignore` | Marked to skip | - -## Refresh Job Queue - -Sync the job queue with current key_source: - -```python -# Add new pending jobs, remove stale ones -result = ProcessedData.jobs.refresh() -print(f"Added: {result['added']}, Removed: {result['removed']}") -``` - -## Priority Scheduling - -Control processing order with priorities: - -```python -# Refresh with specific priority -ProcessedData.jobs.refresh(priority=1) # Lower = more urgent - -# Process only high-priority jobs -ProcessedData.populate(reserve_jobs=True, priority=3) -``` - -## Error Recovery - -Handle failed jobs: - -```python -# View errors -errors = ProcessedData.jobs.errors -for job in errors.to_dicts(): - print(f"Key: {job}, Error: {job['error_message']}") - -# Clear errors to retry -errors.delete() -ProcessedData.populate(reserve_jobs=True) -``` - -## Orphan Detection - -Jobs from crashed workers are automatically recovered: - -```python -# Refresh with orphan timeout (seconds) -ProcessedData.jobs.refresh(orphan_timeout=3600) -``` - -Reserved jobs older than the timeout are reset to pending. - -## Configuration - -```python -import datajoint as dj - -# Auto-refresh on populate (default: True) -dj.config.jobs.auto_refresh = True - -# Keep completed job records (default: False) -dj.config.jobs.keep_completed = True - -# Stale job timeout in seconds (default: 3600) -dj.config.jobs.stale_timeout = 3600 - -# Default job priority (default: 5) -dj.config.jobs.default_priority = 5 - -# Track code version (default: None) -dj.config.jobs.version_method = "git" -``` - -## Populate Options - -| Option | Default | Description | -|--------|---------|-------------| -| `reserve_jobs` | `False` | Enable job coordination | -| `processes` | `1` | Number of worker processes | -| `max_calls` | `None` | Limit jobs per run | -| `display_progress` | `False` | Show progress bar | -| `suppress_errors` | `False` | Continue on errors | -| `priority` | `None` | Filter by priority | -| `refresh` | `None` | Force refresh before run | - -## Example: Cluster Setup - -```python -# config.py - shared configuration -import datajoint as dj - -dj.config.jobs.auto_refresh = True -dj.config.jobs.keep_completed = True -dj.config.jobs.version_method = "git" - -# worker.py - run on each node -from config import * -from my_pipeline import ProcessedData - -while True: - result = ProcessedData.populate( - reserve_jobs=True, - max_calls=100, - suppress_errors=True, - display_progress=True - ) - if result['success_count'] == 0: - break # No more work -``` - -## See Also - -- [Run Computations](run-computations.md) — Basic populate usage -- [Handle Errors](handle-errors.md) — Error recovery patterns -- [Monitor Progress](monitor-progress.md) — Tracking job status - - ---- -## File: how-to/fetch-results.md - -# Fetch Results - -Retrieve query results in various formats. - -## List of Dictionaries - -```python -rows = Subject.to_dicts() -# [{'subject_id': 'M001', 'species': 'Mus musculus', ...}, ...] 
- -for row in rows: - print(row['subject_id'], row['species']) -``` - -## pandas DataFrame - -```python -df = Subject.to_pandas() -# Primary key becomes the index - -# With multi-column primary key -df = Session.to_pandas() -# MultiIndex on (subject_id, session_idx) -``` - -## NumPy Arrays - -```python -# Structured array (all columns) -arr = Subject.to_arrays() - -# Specific columns as separate arrays -species, weights = Subject.to_arrays('species', 'weight') -``` - -## Primary Keys Only - -```python -keys = Session.keys() -# [{'subject_id': 'M001', 'session_idx': 1}, ...] - -for key in keys: - process(Session & key) -``` - -## Single Row - -```python -# As dictionary (raises if not exactly 1 row) -row = (Subject & {'subject_id': 'M001'}).fetch1() - -# Specific attributes -species, weight = (Subject & {'subject_id': 'M001'}).fetch1('species', 'weight') -``` - -## Ordering and Limiting - -```python -# Sort by single attribute -Subject.to_dicts(order_by='weight DESC') - -# Sort by multiple attributes -Session.to_dicts(order_by=['session_date DESC', 'duration']) - -# Sort by primary key -Subject.to_dicts(order_by='KEY') - -# Limit rows -Subject.to_dicts(limit=10) - -# Pagination -Subject.to_dicts(order_by='KEY', limit=10, offset=20) -``` - -## Streaming (Lazy Iteration) - -```python -# Memory-efficient iteration -for row in Subject: - process(row) - if done: - break # Early termination -``` - -## polars DataFrame - -```python -# Requires: pip install datajoint[polars] -df = Subject.to_polars() -``` - -## PyArrow Table - -```python -# Requires: pip install datajoint[arrow] -table = Subject.to_arrow() -``` - -## Method Summary - -| Method | Returns | Use Case | -|--------|---------|----------| -| `to_dicts()` | `list[dict]` | JSON, iteration | -| `to_pandas()` | `DataFrame` | Data analysis | -| `to_polars()` | `polars.DataFrame` | Fast analysis | -| `to_arrow()` | `pyarrow.Table` | Interop | -| `to_arrays()` | `np.ndarray` | Numeric computation | -| `to_arrays('a', 'b')` | `tuple[array, ...]` | Specific columns | -| `keys()` | `list[dict]` | Primary keys | -| `fetch1()` | `dict` | Single row | -| `for row in table:` | Iterator | Streaming | - -## Common Parameters - -All output methods accept: - -| Parameter | Description | -|-----------|-------------| -| `order_by` | Sort by column(s): `'name'`, `'name DESC'`, `['a', 'b DESC']`, `'KEY'` | -| `limit` | Maximum rows to return | -| `offset` | Rows to skip | - -## See Also - -- [Query Data](query-data.md) — Building queries -- [Fetch API Specification](../reference/specs/fetch-api.md) — Complete reference - - ---- -## File: how-to/garbage-collection.md - -# Clean Up External Storage - -Remove orphaned data from object storage after deleting database rows. - -## Why Garbage Collection? - -When you delete rows from tables with external storage (``, ``, -``, ``), the database records are removed but the external files -remain. This is by design: - -- **Hash-addressed storage** (``, ``) uses deduplication—the - same content may be referenced by multiple rows -- **Schema-addressed storage** (``, ``) stores each row's data - at a unique path, but immediate deletion could cause issues with concurrent - operations - -Run garbage collection periodically to reclaim storage space. 
- -## Basic Usage - -```python -import datajoint as dj - -# Scan for orphaned items (dry run) -stats = dj.gc.scan(schema1, schema2) -print(dj.gc.format_stats(stats)) - -# Remove orphaned items -stats = dj.gc.collect(schema1, schema2, dry_run=False) -print(dj.gc.format_stats(stats)) -``` - -## Scan Before Collecting - -Always scan first to see what would be deleted: - -```python -# Check what's orphaned -stats = dj.gc.scan(my_schema) - -print(f"Hash-addressed orphaned: {stats['hash_orphaned']}") -print(f"Schema paths orphaned: {stats['schema_paths_orphaned']}") -print(f"Total bytes: {stats['orphaned_bytes'] / 1e6:.1f} MB") -``` - -## Dry Run Mode - -The default `dry_run=True` reports what would be deleted without deleting: - -```python -# Safe: shows what would be deleted -stats = dj.gc.collect(my_schema, dry_run=True) -print(dj.gc.format_stats(stats)) - -# After review, actually delete -stats = dj.gc.collect(my_schema, dry_run=False) -``` - -## Multiple Schemas - -If your data spans multiple schemas, scan all of them together: - -```python -# Important: include ALL schemas that might share storage -stats = dj.gc.collect( - schema_raw, - schema_processed, - schema_analysis, - dry_run=False -) -``` - -!!! note "Per-schema deduplication" - Hash-addressed storage is deduplicated **within** each schema. Different - schemas have independent storage, so you only need to scan schemas that - share the same database. - -## Named Stores - -If you use multiple named stores, specify which to clean: - -```python -# Clean specific store -stats = dj.gc.collect(my_schema, store_name='archive', dry_run=False) - -# Or clean default store -stats = dj.gc.collect(my_schema, dry_run=False) # uses default store -``` - -## Verbose Mode - -See detailed progress: - -```python -stats = dj.gc.collect( - my_schema, - dry_run=False, - verbose=True # logs each deletion -) -``` - -## Understanding the Statistics - -```python -stats = dj.gc.scan(my_schema) - -# Hash-addressed storage (, , ) -stats['hash_referenced'] # Items still in database -stats['hash_stored'] # Items in storage -stats['hash_orphaned'] # Unreferenced (can be deleted) -stats['hash_orphaned_bytes'] # Size of orphaned items - -# Schema-addressed storage (, ) -stats['schema_paths_referenced'] # Paths still in database -stats['schema_paths_stored'] # Paths in storage -stats['schema_paths_orphaned'] # Unreferenced paths -stats['schema_paths_orphaned_bytes'] - -# Totals -stats['referenced'] # Total referenced items -stats['stored'] # Total stored items -stats['orphaned'] # Total orphaned items -stats['orphaned_bytes'] -``` - -## Scheduled Collection - -Run GC periodically in production: - -```python -# In a cron job or scheduled task -import datajoint as dj -from myproject import schema1, schema2, schema3 - -stats = dj.gc.collect( - schema1, schema2, schema3, - dry_run=False, - verbose=True -) - -if stats['errors'] > 0: - logging.warning(f"GC completed with {stats['errors']} errors") -else: - logging.info(f"GC freed {stats['bytes_freed'] / 1e6:.1f} MB") -``` - -## How Storage Addressing Works - -DataJoint uses two storage patterns: - -### Hash-Addressed (``, ``, ``) - -``` -_hash/ - {schema}/ - ab/ - cd/ - abcdefghij... 
# Content identified by Base32-encoded MD5 hash -``` - -- Duplicate content shares storage within each schema -- Paths are stored in metadata—safe from config changes -- Cannot delete until no rows reference the content -- GC compares stored paths against filesystem - -### Schema-Addressed (``, ``) - -``` -myschema/ - mytable/ - primary_key_values/ - attribute_name/ - data.zarr/ - data.npy -``` - -- Each row has unique path based on schema structure -- Paths mirror database organization -- GC removes paths not referenced by any row - -## Troubleshooting - -### "At least one schema must be provided" - -```python -# Wrong -dj.gc.scan() - -# Right -dj.gc.scan(my_schema) -``` - -### Storage not decreasing - -Check that you're scanning all schemas: - -```python -# List all schemas that use this store -# Make sure to include them all in the scan -``` - -### Permission errors - -Ensure your storage credentials allow deletion: - -```python -# Check store configuration -spec = dj.config.get_object_store_spec('mystore') -# Verify write/delete permissions -``` - -## See Also - -- [Manage Large Data](manage-large-data.md) — Storage patterns and streaming -- [Configure Object Storage](configure-storage.md) — Storage setup -- [Delete Data](delete-data.md) — Row deletion with cascades - - ---- -## File: how-to/handle-errors.md - -# Handle Errors - -Manage computation errors and recover failed jobs. - -## Suppress Errors During Populate - -Continue processing despite individual failures: - -```python -# Stop on first error (default) -ProcessedData.populate() - -# Log errors but continue -ProcessedData.populate(suppress_errors=True) -``` - -## View Failed Jobs - -Check the jobs table for errors: - -```python -# All error jobs -ProcessedData.jobs.errors - -# View error details -for job in ProcessedData.jobs.errors.to_dicts(): - print(f"Key: {job}") - print(f"Message: {job['error_message']}") -``` - -## Get Full Stack Trace - -Error stack traces are stored in the jobs table: - -```python -job = (ProcessedData.jobs.errors & key).fetch1() -print(job['error_stack']) -``` - -## Retry Failed Jobs - -Clear error status and rerun: - -```python -# Delete error records to retry -ProcessedData.jobs.errors.delete() - -# Reprocess -ProcessedData.populate(reserve_jobs=True) -``` - -## Retry Specific Jobs - -Target specific failed jobs: - -```python -# Clear one error -(ProcessedData.jobs & key & 'status="error"').delete() - -# Retry just that key -ProcessedData.populate(key, reserve_jobs=True) -``` - -## Ignore Problematic Jobs - -Mark jobs to skip permanently: - -```python -# Mark job as ignored -ProcessedData.jobs.ignore(key) - -# View ignored jobs -ProcessedData.jobs.ignored -``` - -## Error Handling in make() - -Handle expected errors gracefully: - -```python -@schema -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - result : float64 - """ - - def make(self, key): - try: - data = (RawData & key).fetch1('data') - result = risky_computation(data) - except ValueError as e: - # Log and skip this key - logger.warning(f"Skipping {key}: {e}") - return # Don't insert, job remains pending - - self.insert1({**key, 'result': result}) -``` - -## Transaction Rollback - -Failed `make()` calls automatically rollback: - -```python -def make(self, key): - # These inserts are in a transaction - self.insert1({**key, 'result': value1}) - PartTable.insert(parts) - - # If this raises, all inserts are rolled back - validate_result(key) -``` - -## Return Exception Objects - -Get exception objects for programmatic 
handling: - -```python -result = ProcessedData.populate( - suppress_errors=True, - return_exception_objects=True -) - -for key, exception in result['error_list']: - if isinstance(exception, TimeoutError): - # Handle timeout differently - schedule_for_later(key) -``` - -## Monitor Error Rate - -Track errors over time: - -```python -progress = ProcessedData.jobs.progress() -print(f"Pending: {progress.get('pending', 0)}") -print(f"Errors: {progress.get('error', 0)}") -print(f"Success: {progress.get('success', 0)}") - -error_rate = progress.get('error', 0) / sum(progress.values()) -print(f"Error rate: {error_rate:.1%}") -``` - -## Common Error Patterns - -### Data Quality Issues - -```python -def make(self, key): - data = (RawData & key).fetch1('data') - - if not validate_data(data): - raise DataJointError(f"Invalid data for {key}") - - # Process valid data - self.insert1({**key, 'result': process(data)}) -``` - -### Resource Constraints - -```python -def make(self, key): - try: - result = memory_intensive_computation(key) - except MemoryError: - # Clear caches and retry once - gc.collect() - result = memory_intensive_computation(key) - - self.insert1({**key, 'result': result}) -``` - -### External Service Failures - -```python -def make(self, key): - for attempt in range(3): - try: - data = fetch_from_external_api(key) - break - except ConnectionError: - if attempt == 2: - raise - time.sleep(2 ** attempt) # Exponential backoff - - self.insert1({**key, 'result': process(data)}) -``` - -## See Also - -- [Run Computations](run-computations.md) — Basic populate usage -- [Distributed Computing](distributed-computing.md) — Multi-worker error handling -- [Monitor Progress](monitor-progress.md) — Tracking job status - - ---- -## File: how-to/index.md - -# How-To Guides - -Practical guides for common tasks. - -These guides help you accomplish specific tasks with DataJoint. Unlike tutorials, -they assume you understand the basics and focus on getting things done. 
- -## Setup - -- [Installation](installation.md) — Installing DataJoint -- [Manage Secrets and Credentials](manage-secrets.md) — Secure configuration management -- [Configure Database Connection](configure-database.md) — Connection settings -- [Configure Object Storage](configure-storage.md) — S3, MinIO, file stores -- [Use the Command-Line Interface](use-cli.md) — Interactive REPL - -## Schema Design - -- [Define Tables](define-tables.md) — Table definition syntax -- [Model Relationships](model-relationships.ipynb) — Foreign key patterns -- [Design Primary Keys](design-primary-keys.md) — Key selection strategies - -## Project Management - -- [Manage a Pipeline Project](manage-pipeline-project.md) — Multi-schema pipelines, team collaboration - -## Data Operations - -- [Insert Data](insert-data.md) — Single rows, batches, transactions -- [Query Data](query-data.md) — Operators, restrictions, projections -- [Fetch Results](fetch-results.md) — DataFrames, dicts, streaming -- [Delete Data](delete-data.md) — Safe deletion with cascades - -## Computation - -- [Run Computations](run-computations.md) — populate() basics -- [Distributed Computing](distributed-computing.md) — Multi-process, cluster -- [Handle Errors](handle-errors.md) — Error recovery and job management -- [Monitor Progress](monitor-progress.md) — Dashboards and status - -## Object Storage - -- [Object Storage Overview](object-storage-overview.md) — Navigation guide for all storage docs -- [Choose a Storage Type](choose-storage-type.md) — Decision guide for codecs -- [Use Object Storage](use-object-storage.md) — When and how -- [Create Custom Codecs](create-custom-codec.md) — Domain-specific types -- [Manage Large Data](manage-large-data.md) — Blobs, streaming, efficiency -- [Clean Up External Storage](garbage-collection.md) — Garbage collection - -## Maintenance - -- [Migrate to v2.0](migrate-to-v20.md) — Upgrading existing pipelines -- [Alter Tables](alter-tables.md) — Schema evolution -- [Backup and Restore](backup-restore.md) — Data protection - - ---- -## File: how-to/insert-data.md - -# Insert Data - -Add data to DataJoint tables. 
- -## Single Row - -```python -Subject.insert1({ - 'subject_id': 'M001', - 'species': 'Mus musculus', - 'date_of_birth': '2026-01-15', - 'sex': 'M' -}) -``` - -## Multiple Rows - -```python -Subject.insert([ - {'subject_id': 'M001', 'species': 'Mus musculus', 'date_of_birth': '2026-01-15', 'sex': 'M'}, - {'subject_id': 'M002', 'species': 'Mus musculus', 'date_of_birth': '2026-02-01', 'sex': 'F'}, - {'subject_id': 'M003', 'species': 'Mus musculus', 'date_of_birth': '2026-02-15', 'sex': 'M'}, -]) -``` - -## From pandas DataFrame - -```python -import pandas as pd - -df = pd.DataFrame({ - 'subject_id': ['M004', 'M005'], - 'species': ['Mus musculus', 'Mus musculus'], - 'date_of_birth': ['2026-03-01', '2026-03-15'], - 'sex': ['F', 'M'] -}) - -Subject.insert(df) -``` - -## Handle Duplicates - -```python -# Skip rows with existing primary keys -Subject.insert(rows, skip_duplicates=True) - -# Replace existing rows (use sparingly—breaks immutability) -Subject.insert(rows, replace=True) -``` - -## Ignore Extra Fields - -```python -# Ignore fields not in the table definition -Subject.insert(rows, ignore_extra_fields=True) -``` - -## Master-Part Tables - -Use a transaction to maintain compositional integrity: - -```python -with dj.conn().transaction: - Session.insert1({ - 'subject_id': 'M001', - 'session_idx': 1, - 'session_date': '2026-01-20' - }) - Session.Trial.insert([ - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 1, 'outcome': 'hit', 'reaction_time': 0.35}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 2, 'outcome': 'miss', 'reaction_time': 0.82}, - ]) -``` - -## Insert from Query - -```python -# Copy data from another table or query result -NewTable.insert(OldTable & 'condition') - -# With projection -NewTable.insert(OldTable.proj('attr1', 'attr2', new_name='old_name')) -``` - -## Validate Before Insert - -```python -result = Subject.validate(rows) - -if result: - Subject.insert(rows) -else: - print("Validation errors:") - for error in result.errors: - print(f" {error}") -``` - -## Insert with Blobs - -```python -import numpy as np - -data = np.random.randn(100, 100) - -ImageData.insert1({ - 'image_id': 1, - 'pixel_data': data # Automatically serialized -}) -``` - -## Insert Options Summary - -| Option | Default | Description | -|--------|---------|-------------| -| `skip_duplicates` | `False` | Skip rows with existing keys | -| `replace` | `False` | Replace existing rows | -| `ignore_extra_fields` | `False` | Ignore unknown fields | - -## Best Practices - -### Batch inserts for performance - -```python -# Good: Single insert call -Subject.insert(all_rows) - -# Slow: Loop of insert1 calls -for row in all_rows: - Subject.insert1(row) -``` - -### Use transactions for related inserts - -```python -with dj.conn().transaction: - Parent.insert1(parent_row) - Child.insert(child_rows) -``` - -### Validate before bulk inserts - -```python -if Subject.validate(rows): - Subject.insert(rows) -``` - - ---- -## File: how-to/installation.md - -# Installation - -Install DataJoint Python and set up your environment. - -!!! warning "Pre-Release Documentation" - - This documentation covers **DataJoint 2.0**, which is currently in pre-release. 
- - - **For production use:** Install the stable version (0.14.x) and use [legacy docs](https://datajoint.github.io/datajoint-python) - - **For testing 2.0:** Follow the pre-release installation instructions below - - **For migration:** See the [Migration Guide](migrate-to-v20.md) - -## Choose Your Installation - -### Pre-Release (2.0) — For Testing and Development - -**Note:** DataJoint 2.0 is not yet on PyPI/conda. Install from the pre-release branch: - -```bash -git clone -b pre/v2.0 https://github.com/datajoint/datajoint-python.git -cd datajoint-python -pip install -e ".[dev]" -``` - -**With optional dependencies:** - -```bash -# For polars DataFrame support -pip install -e ".[polars]" - -# For all optional dependencies -pip install -e ".[all]" -``` - -### Stable (0.14.x) — For Production Use - -```bash -pip install datajoint -``` - -**Note:** This installs DataJoint 0.14.x. The tutorials and guides in this documentation are written for 2.0 and will not match the stable API. Use [legacy documentation](https://datajoint.github.io/datajoint-python) instead. - -## Verify Installation - -Check your installed version: - -```python -import datajoint as dj -print(dj.__version__) -``` - -**Expected output for this documentation:** -- `2.0.0` or higher — You're ready to follow this documentation -- `0.14.x` or lower — You have the stable version, use [legacy docs](https://datajoint.github.io/datajoint-python) instead - -### If You Have the Wrong Version - -| Your Situation | Action | -|----------------|--------| -| Installed 0.14.x but want to test 2.0 | Follow pre-release installation above | -| Installed 2.0 but need production stability | `pip uninstall datajoint && pip install datajoint` | -| Have existing 0.14.x pipeline to upgrade | Follow [Migration Guide](migrate-to-v20.md) | - -## Database Server - -DataJoint requires a MySQL-compatible database server: - -### Local Development (Docker) - -```bash -docker run -d \ - --name datajoint-db \ - -p 3306:3306 \ - -e MYSQL_ROOT_PASSWORD=simple \ - mysql:8.0 -``` - -### DataJoint.com (Recommended) - -[DataJoint.com](https://datajoint.com) provides fully managed infrastructure for scientific data pipelines—cloud or on-premises—with comprehensive support, automatic backups, object storage, and team collaboration features. - -### Self-Managed Cloud Databases - -- **Amazon RDS** — MySQL or Aurora -- **Google Cloud SQL** — MySQL -- **Azure Database** — MySQL - -See [Configure Database Connection](configure-database.md) for connection setup. - -## Requirements - -- Python 3.10+ -- MySQL 8.0+ or MariaDB 10.6+ -- Network access to database server - -## Troubleshooting - -### `pymysql` connection errors - -```bash -pip install pymysql --force-reinstall -``` - -### SSL/TLS connection issues - -Set `use_tls=False` for local development: - -```python -dj.config['database.use_tls'] = False -``` - -### Permission denied - -Ensure your database user has appropriate privileges: - -```sql -GRANT ALL PRIVILEGES ON `your_schema%`.* TO 'username'@'%'; -``` - - ---- -## File: how-to/manage-large-data.md - -# Manage Large Data - -Work effectively with blobs and object storage. 
- -## Choose the Right Storage - -| Data Size | Recommended | Syntax | -|-----------|-------------|--------| -| < 1 MB | Database | `` | -| 1 MB - 1 GB | Hash-addressed | `` | -| > 1 GB | Schema-addressed | ``, `` | - -## Streaming Large Results - -Avoid loading everything into memory: - -```python -# Bad: loads all data at once -all_data = LargeTable().to_arrays('big_column') - -# Good: stream rows lazily (single cursor, one row at a time) -for row in LargeTable(): - process(row['big_column']) - -# Good: batch by ID range -keys = LargeTable().keys() -batch_size = 100 -for i in range(0, len(keys), batch_size): - batch_keys = keys[i:i + batch_size] - data = (LargeTable() & batch_keys).to_arrays('big_column') - process(data) -``` - -## Lazy Loading with ObjectRef - -`` and `` return lazy references: - -```python -# Returns ObjectRef, not the actual data -ref = (Dataset & key).fetch1('large_file') - -# Stream without full download -with ref.open('rb') as f: - # Process in chunks - while chunk := f.read(1024 * 1024): - process(chunk) - -# Or download when needed -local_path = ref.download('/tmp/working') -``` - -## Selective Fetching - -Fetch only what you need: - -```python -# Bad: fetches all columns including blobs -row = MyTable.fetch1() - -# Good: fetch only metadata -metadata = (MyTable & key).fetch1('name', 'date', 'status') - -# Then fetch blob only if needed -if needs_processing(metadata): - data = (MyTable & key).fetch1('large_data') -``` - -## Projection for Efficiency - -Exclude large columns from joins: - -```python -# Slow: joins include blob columns -result = Table1 * Table2 - -# Fast: project away blobs before join -result = Table1.proj('id', 'name') * Table2.proj('id', 'status') -``` - -## Batch Inserts - -Insert large data efficiently: - -```python -# Good: single transaction for related data -with dj.conn().transaction: - for item in large_batch: - MyTable.insert1(item) -``` - -## Content Deduplication - -`` and `` automatically deduplicate within each schema: - -```python -# Same array inserted twice -data = np.random.randn(1000, 1000) -Table.insert1({'id': 1, 'data': data}) -Table.insert1({'id': 2, 'data': data}) # References same storage - -# Only one copy exists in object storage (per schema) -``` - -Deduplication is per-schema—identical content in different schemas is stored separately. -This enables independent garbage collection per schema. - -## Storage Cleanup - -External storage items are not automatically deleted with rows. Run garbage -collection periodically: - -```python -import datajoint as dj - -# Objects are NOT automatically deleted with rows -(MyTable & old_data).delete() - -# Scan for orphaned items -stats = dj.gc.scan(my_schema) -print(dj.gc.format_stats(stats)) - -# Remove orphaned items -stats = dj.gc.collect(my_schema, dry_run=False) -``` - -See [Clean Up External Storage](garbage-collection.md) for details. 
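
For a file-protocol store, you can verify how much space a collection run reclaimed using standard library tools (a minimal sketch; the path is a placeholder for your store's configured `location`):

```python
from pathlib import Path

store_root = Path('/data/my-project')   # placeholder: the store's "location"
total_bytes = sum(f.stat().st_size for f in store_root.rglob('*') if f.is_file())
print(f"Store size: {total_bytes / 1e9:.2f} GB")
```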
- -## Monitor Storage Usage - -Check object store size: - -```python -# Get store configuration -spec = dj.config.get_object_store_spec() - -# For S3/MinIO, use boto3 or similar -# For filesystem, use standard tools -``` - -## Compression - -Blobs are compressed by default: - -```python -# Compression happens automatically in -large_array = np.zeros((10000, 10000)) # Compresses well -sparse_data = np.random.randn(10000, 10000) # Less compression -``` - -## Memory Management - -For very large computations: - -```python -def make(self, key): - # Process in chunks - for chunk_idx in range(n_chunks): - chunk_data = load_chunk(key, chunk_idx) - result = process(chunk_data) - save_partial_result(key, chunk_idx, result) - del chunk_data # Free memory - - # Combine results - final = combine_results(key) - self.insert1({**key, 'result': final}) -``` - -## External Tools for Very Large Data - -For datasets too large for DataJoint: - -```python -@schema -class LargeDataset(dj.Manual): - definition = """ - dataset_id : uuid - --- - zarr_path : # Reference to external Zarr - """ - -# Store path reference, process with specialized tools -import zarr -store = zarr.open(local_zarr_path) -# ... process with Zarr/Dask ... - -LargeDataset.insert1({ - 'dataset_id': uuid.uuid4(), - 'zarr_path': local_zarr_path -}) -``` - -## See Also - -- [Use Object Storage](use-object-storage.md) — Storage patterns -- [Configure Object Storage](configure-storage.md) — Storage setup -- [Create Custom Codecs](create-custom-codec.md) — Domain-specific types - - ---- -## File: how-to/manage-pipeline-project.md - -# Manage a Pipeline Project - -Organize multi-schema pipelines for team collaboration. - -## Overview - -A production DataJoint pipeline typically involves: - -- **Multiple schemas** — Organized by experimental modality or processing stage -- **Team of users** — With different roles and access levels -- **Shared infrastructure** — Database server, object storage, code repository -- **Coordination** — Between code, database, and storage permissions - -This guide covers practical project organization. For conceptual background on pipeline architecture and the DAG structure, see [Data Pipelines](../explanation/data-pipelines.md). - -For a fully managed solution, [request a DataJoint Platform account](https://www.datajoint.com/sign-up). - -## Project Structure - -Use a modern Python project layout with source code under `src/`: - -``` -my_pipeline/ -├── datajoint.json # Shared settings (committed) -├── .secrets/ # Local credentials (gitignored) -│ ├── database.password -│ └── storage.credentials -├── .gitignore -├── pyproject.toml # Package metadata and dependencies -├── README.md -├── src/ -│ └── my_pipeline/ -│ ├── __init__.py -│ ├── subject.py # subject schema -│ ├── session.py # session schema -│ ├── ephys.py # ephys schema -│ ├── imaging.py # imaging schema -│ ├── analysis.py # analysis schema -│ └── utils/ -│ └── __init__.py -├── tests/ -│ ├── conftest.py -│ └── test_ephys.py -└── docs/ - └── ... -``` - -### One Module Per Schema - -Each module defines and binds to its schema: - -```python -# src/my_pipeline/ephys.py -import datajoint as dj -from . 
import session # Import dependency - -schema = dj.Schema('ephys') - -@schema -class Probe(dj.Lookup): - definition = """ - probe_type : varchar(32) - --- - num_channels : uint16 - """ - -@schema -class Recording(dj.Imported): - definition = """ - -> session.Session - -> Probe - --- - recording_path : varchar(255) - """ -``` - -### Import Dependencies Mirror Foreign Keys - -Module imports reflect the schema DAG: - -```python -# analysis.py depends on both ephys and imaging -from . import ephys -from . import imaging - -schema = dj.Schema('analysis') - -@schema -class MultiModalAnalysis(dj.Computed): - definition = """ - -> ephys.Recording - -> imaging.Scan - --- - correlation : float64 - """ -``` - -## Repository Configuration - -### Shared Settings - -Store non-secret configuration in `datajoint.json` at the project root: - -**datajoint.json** (committed): -```json -{ - "database": { - "host": "db.example.com", - "port": 3306 - }, - "stores": { - "main": { - "protocol": "s3", - "endpoint": "s3.example.com", - "bucket": "my-org-data", - "location": "my_pipeline" - } - } -} -``` - -### Credentials Management - -Credentials are stored locally and never committed: - -**Option 1: `.secrets/` directory** -``` -.secrets/ -├── database.user -├── database.password -├── storage.access_key -└── storage.secret_key -``` - -**Option 2: Environment variables** -```bash -export DJ_USER=alice -export DJ_PASS=alice_password -export DJ_STORES__MAIN__ACCESS_KEY=... -export DJ_STORES__MAIN__SECRET_KEY=... -``` - -### Essential `.gitignore` - -```gitignore -# Credentials -.secrets/ - -# Python -__pycache__/ -*.pyc -*.egg-info/ -dist/ -build/ - -# Environment -.env -.venv/ - -# IDE -.idea/ -.vscode/ -``` - -### `pyproject.toml` Example - -```toml -[project] -name = "my-pipeline" -version = "1.0.0" -requires-python = ">=3.10" -dependencies = [ - "datajoint>=2.0", - "numpy", -] - -[project.optional-dependencies] -dev = ["pytest", "pytest-cov"] - -[tool.setuptools.packages.find] -where = ["src"] -``` - -## Database Access Control - -### The Complexity - -Multi-user database access requires: - -1. **User accounts** — Individual credentials per team member -2. **Schema permissions** — Which users can access which schemas -3. **Operation permissions** — SELECT, INSERT, UPDATE, DELETE, CREATE, DROP -4. **Role hierarchy** — Admin, developer, analyst, viewer -5. 
**Audit trail** — Who modified what and when - -### Basic MySQL Grants - -```sql --- Create user -CREATE USER 'alice'@'%' IDENTIFIED BY 'password'; - --- Grant read-only on specific schema -GRANT SELECT ON ephys.* TO 'alice'@'%'; - --- Grant read-write on specific schema -GRANT SELECT, INSERT, UPDATE, DELETE ON analysis.* TO 'alice'@'%'; - --- Grant full access (developers) -GRANT ALL PRIVILEGES ON my_pipeline_*.* TO 'bob'@'%'; -``` - -### Role-Based Access Patterns - -| Role | Permissions | Typical Use | -|------|-------------|-------------| -| Viewer | SELECT | Browse data, run queries | -| Analyst | SELECT, INSERT on analysis | Add analysis results | -| Operator | SELECT, INSERT, DELETE on data schemas | Run pipeline | -| Developer | ALL on development schemas | Schema changes | -| Admin | ALL + GRANT | User management | - -### Considerations - -- Users need SELECT on parent schemas to INSERT into child schemas (FK validation) -- Cascading deletes require DELETE on all dependent schemas -- Schema creation requires CREATE privilege -- Coordinating permissions across many schemas becomes complex - -## Object Storage Access Control - -### The Complexity - -Object storage permissions must align with database permissions: - -1. **Bucket/prefix policies** — Map to schema access -2. **Read vs write** — Match SELECT vs INSERT/UPDATE -3. **Credential distribution** — Per-user or shared service accounts -4. **Cross-schema objects** — When computed tables reference multiple inputs - -### Hierarchical Storage Structure - -A DataJoint project creates a structured storage pattern: - -``` -📁 project_name/ -├── 📁 schema_name1/ -├── 📁 schema_name2/ -├── 📁 schema_name3/ -│ ├── objects/ -│ │ └── table1/ -│ │ └── key1-value1/ -│ └── fields/ -│ └── table1-field1/ -└── ... -``` - -### S3/MinIO Policy Example - -```json -{ - "Version": "2012-10-17", - "Statement": [ - { - "Effect": "Allow", - "Action": ["s3:GetObject"], - "Resource": "arn:aws:s3:::my-lab-data/datajoint/ephys/*" - }, - { - "Effect": "Allow", - "Action": ["s3:GetObject", "s3:PutObject"], - "Resource": "arn:aws:s3:::my-lab-data/datajoint/analysis/*" - } - ] -} -``` - -### Considerations - -- Object paths include schema name: `{project}/{schema}/{table}/...` -- Users need read access to fetch blobs from upstream schemas -- Content-addressed storage (``) shares objects across tables -- Garbage collection requires coordinated delete permissions - -## Pipeline Initialization - -### Schema Creation Order - -Initialize schemas in dependency order: - -```python -# src/my_pipeline/__init__.py -from . import subject # No dependencies -from . import session # Depends on subject -from . import ephys # Depends on session -from . import imaging # Depends on session -from . 
import analysis # Depends on ephys, imaging - -def initialize(): - """Create all schemas in dependency order.""" - # Schemas are created when modules are imported - # and tables are first accessed - subject.Subject() - session.Session() - ephys.Recording() - imaging.Scan() - analysis.MultiModalAnalysis() -``` - -### Version Coordination - -Track schema versions with your code: - -```python -# src/my_pipeline/version.py -__version__ = "1.2.0" - -SCHEMA_VERSIONS = { - 'subject': '1.0.0', - 'session': '1.1.0', - 'ephys': '1.2.0', - 'imaging': '1.2.0', - 'analysis': '1.2.0', -} -``` - -## Team Workflows - -### Development vs Production - -``` -┌─────────────────┐ ┌─────────────────┐ -│ Development │ │ Production │ -├─────────────────┤ ├─────────────────┤ -│ dev_subject │ │ subject │ -│ dev_session │ │ session │ -│ dev_ephys │ │ ephys │ -└─────────────────┘ └─────────────────┘ - │ │ - │ Schema promotion │ - └───────────────────────┘ -``` - -### Branching Strategy - -``` -main ────────────────────────────────────▶ - │ │ - │ feature/ │ hotfix/ - ▼ ▼ - ephys-v2 fix-recording - │ │ - └──────────────┴──▶ main -``` - -## Summary of Complexities - -Managing a team pipeline requires coordinating: - -| Component | Challenges | -|-----------|------------| -| **Code** | Module dependencies, version control, deployment | -| **Database** | User accounts, schema permissions, role hierarchy | -| **Object Storage** | Bucket policies, credential distribution, path alignment | -| **Compute** | Worker deployment, job distribution, resource allocation | -| **Monitoring** | Progress tracking, error alerting, audit logging | - -These challenges grow with team size and pipeline complexity. The [DataJoint Platform](https://www.datajoint.com/sign-up) provides integrated management for all these concerns. - -## See Also - -- [Data Pipelines](../explanation/data-pipelines.md) — Conceptual overview and architecture -- [Configure Object Storage](configure-storage.md) — Storage setup -- [Distributed Computing](distributed-computing.md) — Multi-worker pipelines -- [Model Relationships](model-relationships.ipynb) — Foreign key patterns - - ---- -## File: how-to/manage-secrets.md - -# Manage Secrets and Credentials - -Secure configuration management for database credentials, storage access keys, and other sensitive settings. - -## Overview - -DataJoint separates configuration into sensitive and non-sensitive components: - -| Component | Location | Purpose | Version Control | -|-----------|----------|---------|-----------------| -| **Non-sensitive** | `datajoint.json` | Project settings, defaults | ✅ Commit to git | -| **Sensitive** | `.secrets/` directory | Credentials, API keys | ❌ Never commit | -| **Dynamic** | Environment variables | CI/CD, production | ⚠️ Context-dependent | - -## Configuration Priority - -DataJoint loads configuration in this priority order (highest to lowest): - -1. **Programmatic settings** — `dj.config['key'] = value` -2. **Environment variables** — `DJ_HOST`, `DJ_USER`, etc. -3. **Secrets directory** — `.secrets/datajoint.json`, `.secrets/stores.*` -4. **Project configuration** — `datajoint.json` -5. **Default values** — Built-in defaults - -Higher priority sources override lower ones. 
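
To see the priority order in practice, here is a minimal sketch (assuming `dj.config` behaves as a dict, as in the programmatic examples later in this guide, and that lower-priority sources have already been loaded):

```python
import datajoint as dj

# Values from datajoint.json, .secrets/, and environment variables such as
# DJ_HOST are read when DataJoint loads its configuration.  A programmatic
# assignment sits at the top of the priority list and overrides all of them
# for the current session.
dj.config["database.host"] = "localhost"   # overrides DJ_HOST / datajoint.json
dj.config["database.user"] = "test_user"   # hypothetical account for local testing

print(dj.config["database.host"])          # -> "localhost"
```
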
- -## `.secrets/` Directory Structure - -Create a `.secrets/` directory in your project root: - -``` -project/ -├── datajoint.json # Non-sensitive settings (commit) -├── .gitignore # Must include .secrets/ -├── .secrets/ -│ ├── datajoint.json # Database credentials -│ ├── stores.main.access_key # S3/cloud storage credentials -│ ├── stores.main.secret_key -│ ├── stores.archive.access_key -│ └── stores.archive.secret_key -└── ... -``` - -**Critical:** Add `.secrets/` to `.gitignore`: - -```gitignore -# .gitignore -.secrets/ -``` - -## Database Credentials - -### Option 1: Secrets Directory (Recommended for Development) - -Create `.secrets/datajoint.json`: - -```json -{ - "database.user": "myuser", - "database.password": "mypassword" -} -``` - -Non-sensitive database settings go in `datajoint.json`: - -```json -{ - "database.host": "db.example.com", - "database.port": 3306, - "database.use_tls": true, - "safemode": true -} -``` - -### Option 2: Environment Variables (Recommended for Production) - -For CI/CD and production environments: - -```bash -export DJ_HOST=db.example.com -export DJ_USER=myuser -export DJ_PASS=mypassword -export DJ_PORT=3306 -export DJ_TLS=true -``` - -### Option 3: Programmatic Configuration - -For scripts and applications: - -```python -import datajoint as dj - -dj.config['database.host'] = 'localhost' -dj.config['database.user'] = 'myuser' -dj.config['database.password'] = 'mypassword' -``` - -**Security note:** Only use this when credentials come from secure sources (environment, vault, secrets manager). - -## Object Storage Credentials - -### File Storage (No Credentials) - -Local or network-mounted file systems don't require credentials: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/my-project" - } - } -} -``` - -### S3/MinIO Storage (With Credentials) - -#### Config in `datajoint.json` (non-sensitive): - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-bucket", - "location": "my-project/data" - }, - "archive": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive-bucket", - "location": "my-project/archive" - } - } -} -``` - -#### Credentials in `.secrets/` directory: - -Create separate files for each store's credentials: - -``` -.secrets/stores.main.access_key -.secrets/stores.main.secret_key -.secrets/stores.archive.access_key -.secrets/stores.archive.secret_key -``` - -**File format:** Plain text, one credential per file: - -```bash -# .secrets/stores.main.access_key -AKIAIOSFODNN7EXAMPLE - -# .secrets/stores.main.secret_key -wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY -``` - -#### Alternative: Environment Variables - -For cloud deployments: - -```bash -export DJ_STORES_MAIN_ACCESS_KEY=AKIAIOSFODNN7EXAMPLE -export DJ_STORES_MAIN_SECRET_KEY=wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY -``` - -## Environment Variable Reference - -### Database Connections - -| Setting | Environment Variable | Description | -|---------|---------------------|-------------| -| `database.host` | `DJ_HOST` | Database hostname | -| `database.port` | `DJ_PORT` | Database port (default: 3306) | -| `database.user` | `DJ_USER` | Database username | -| `database.password` | `DJ_PASS` | Database password | -| `database.use_tls` | `DJ_TLS` | Use TLS encryption (true/false) | - -### Object Stores - -| Pattern | Example | Description | -|---------|---------|-------------| -| `DJ_STORES__ACCESS_KEY` | `DJ_STORES_MAIN_ACCESS_KEY` | S3 access key 
ID | -| `DJ_STORES__SECRET_KEY` | `DJ_STORES_MAIN_SECRET_KEY` | S3 secret access key | - -**Note:** `` is the uppercase store name with `_` replacing special characters. - -## Security Best Practices - -### Development Environment - -```bash -# 1. Initialize secrets directory -mkdir -p .secrets -chmod 700 .secrets # Owner-only access - -# 2. Create .gitignore -echo ".secrets/" >> .gitignore - -# 3. Store credentials in .secrets/ -cat > .secrets/datajoint.json < **This guide is optimized for AI coding assistants.** Point your AI agent at this -> document and it will execute the migration with your oversight. - -## Requirements - -### System Requirements - -| Component | Legacy (pre-2.0) | DataJoint 2.0 | -|-----------|-----------------|---------------| -| **Python** | 3.8+ | **3.10+** | -| **MySQL** | 5.7+ | **8.0+** | -| **Character encoding** | (varies) | **UTF-8 (utf8mb4)** | -| **Collation** | (varies) | **utf8mb4_bin** | - -**Action required:** Upgrade your Python environment and MySQL server before installing DataJoint 2.0. - -**Character encoding and collation:** DataJoint 2.0 standardizes on UTF-8 encoding with binary collation (case-sensitive comparisons). This is configured **server-wide** and is assumed by DataJoint: - -- **MySQL:** `utf8mb4` character set with `utf8mb4_bin` collation -- **PostgreSQL (future):** `UTF8` encoding with `C` collation - -Like timezone handling, encoding is infrastructure configuration, not part of the data model. Ensure your MySQL server is configured with these defaults before migration. - -### License Change - -DataJoint 2.0 is licensed under **Apache 2.0** (previously LGPL-2.1). - -- More permissive for commercial and academic use -- Compatible with broader ecosystem of tools -- Clearer patent grant provisions - -No action required—the new license is more permissive. - -### Future Backend Support - -DataJoint 2.0 introduces portable type aliases (`uint32`, `float64`, etc.) that prepare the codebase for **PostgreSQL backend compatibility** in a future release. Migration to core types ensures your schemas will work seamlessly when Postgres support is available. - -### Before You Start: Testing Recommendation - -**⚡ Want AI agents to automate Phases I-II for you?** - -Create unit and integration tests for your pipeline against a QA database before -starting migration. This enables AI agents to perform most migration work -automatically, reducing manual effort by 50-80%. - -→ **See [Recommendation: Create Tests Before Migration](#recommendation-create-tests-before-migration)** for details. - -**Why this matters:** - -- **With tests:** Agents migrate code → run tests → fix failures → verify automatically -- **Without tests:** Manual verification at every step, higher risk, more time - -Tests provide immediate ROI during migration and ongoing value for development. 
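
Separately from testing, it is worth confirming the server-wide encoding requirement from the System Requirements section before branching. A minimal check, assuming `dj.conn()` is already configured and that `conn.query()` passes raw SQL through as in the verification snippets later in this guide:

```python
import datajoint as dj

conn = dj.conn()

# DataJoint 2.0 assumes utf8mb4 encoding with utf8mb4_bin collation server-wide.
for variable in ("character_set_server", "collation_server"):
    name, value = conn.query(f"SHOW VARIABLES LIKE '{variable}'").fetchone()
    print(f"{name} = {value}")   # expected: utf8mb4 and utf8mb4_bin
```
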
- ---- - -## What's New in 2.0 - -### 3-Tier Column Type System - -DataJoint 2.0 introduces a unified type system with three tiers: - -| Tier | Description | Examples | Migration | -|------|-------------|----------|-----------| -| **Native** | Raw MySQL types | `int unsigned`, `tinyint` | Auto-converted to core types | -| **Core** | Standardized portable types | `uint32`, `float64`, `varchar(100)`, `json` | Phase I | -| **Codec** | Serialization to blob or storage | ``, ``, `` | Phase I-III | - -**Learn more:** [Type System Concept](../explanation/type-system.md) · [Type System Reference](../reference/specs/type-system.md) - -### Codecs - -DataJoint 2.0 makes serialization **explicit** with codecs. In pre-2.0, `longblob` automatically serialized Python objects; in 2.0, you explicitly choose ``. - -#### Migration: Legacy → 2.0 - -| pre-2.0 (Implicit) | 2.0 (Explicit) | Storage | Migration | -|-------------------|----------------|---------|-----------| -| `longblob` | `` | In-table | Phase I code, Phase III data | -| `mediumblob` | `` | In-table | Phase I code, Phase III data | -| `attach` | `` | In-table | Phase I code, Phase III data | -| `blob@store` | `` | In-store (hash) | Phase I code, Phase III data | -| `attach@store` | `` | In-store (hash) | Phase I code, Phase III data | -| `filepath@store` | `` | In-store (filepath) | Phase I code, Phase III data | - -#### New in 2.0: Schema-Addressed Storage - -These codecs are NEW—there's no legacy equivalent to migrate: - -| Codec | Description | Storage | Adoption | -|-------|-------------|---------|----------| -| `` | NumPy arrays with lazy loading | In-store (schema) | Phase IV (optional) | -| `` | Zarr, HDF5, custom formats | In-store (schema) | Phase IV (optional) | - -**Key principles:** - -- All **legacy codec** conversions happen in Phase I (code) and Phase III (data) -- **New codecs** (``, ``) are adopted in Phase IV for new features or enhanced workflows -- Schema-addressed storage organizes data by table structure—no migration needed, just new functionality - -**Learn more:** [Codec API Reference](../reference/specs/codec-api.md) · [Custom Codecs](../explanation/custom-codecs.md) - -### Unified Stores Configuration - -DataJoint 2.0 replaces `external.*` with unified `stores.*` configuration: - -**pre-2.0 (legacy):** -```json -{ - "external": { - "protocol": "file", - "location": "/data/external" - } -} -``` - -**2.0 (unified stores):** -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/stores" - } - } -} -``` - -**Learn more:** [Configuration Reference](../reference/configuration.md) · [Configure Object Storage](configure-storage.md) - -### Query API Changes - -| pre-2.0 | 2.0 | Phase | -|--------|-----|-------| -| `table.fetch()` | `table.to_arrays()` or `table.to_dicts()` | I | -| `table.fetch(..., format="frame")` | `table.to_pandas(...)` | I | -| `table.fetch1()` | `table.fetch1()` (unchanged) | — | -| `table.fetch1('KEY')` | `table.keys()` | I | -| `(table & key)._update('attr', val)` | `table.update1({**key, 'attr': val})` | I | -| `table1 @ table2` | `table1 * table2` (natural join with semantic checks) | I | -| `a.join(b, left=True)` | Consider `a.extend(b)` | I | -| `dj.U('attr') & table` | Unchanged (correct pattern) | — | -| `dj.U('attr') * table` | `table` (was a hack to change primary key) | I | -| `dj.ERD(schema)` | `dj.Diagram(schema)` | I | -| `table.insert([(1, 'a'), (2, 'b')])` | Must use dicts/DataFrames (no positional tuples) | I | - -**Learn more:** [Fetch API 
Reference](../reference/specs/fetch-api.md) · [Query Operators Reference](../reference/operators.md) · [Semantic Matching](../reference/specs/semantic-matching.md) - ---- - -## Migration Overview - -| Phase | Goal | Code Changes | Schema/Store Changes | Production Impact | -|-------|------|--------------|----------------------|-------------------| -| **I** | Branch & code migration | All API updates, type syntax, **all codecs** (in-table and in-store) | Empty `_v2` schemas + test stores | **None** | -| **II** | Test compatibility | — | Populate `_v2` schemas with sample data, test equivalence | **None** | -| **III** | Migrate production data | — | Multiple migration options | **Varies** | -| **IV** | Adopt new features | Optional enhancements | Optional | Running on 2.0 | - -**Key principles:** - -- Phase I implements ALL code changes including in-store codecs (using test stores) -- Production runs on pre-2.0 undisturbed through Phase II -- Phase III is data migration only—the code is already complete - -**Timeline:** - -- **Phase I:** ~1-4 hours (with AI assistance) -- **Phase II:** ~1-2 days -- **Phase III:** ~1-7 days (depends on data size and option chosen) -- **Phase IV:** Ongoing feature adoption - ---- - -## Recommendation: Create Tests Before Migration - -**Highly recommended for automated, agent-driven migration.** - -If you create unit and integration tests for your pipeline before starting -Phase I, AI coding agents can perform most of the migration work automatically, -substantially reducing manual effort. - -### Why Tests Enable Automated Migration - -**With tests:** - -1. **Phase I automation** - Agent can: - - Migrate code to 2.0 API - - Run tests to verify correctness - - Fix failures iteratively - - Complete migration with high confidence - -2. **Phase II automation** - Agent can: - - Populate `_v2` schemas with test data - - Run tests against both legacy and v2 pipelines - - Verify equivalence automatically - - Generate validation reports - -3. 
**Phase III guidance** - Agent can: - - Run tests after data migration - - Catch issues immediately - - Guide production cutover with confidence - -**Without tests:** - -- Manual verification at each step -- Higher risk of missed issues -- More time-intensive validation -- Uncertainty about correctness - -### What Tests to Create - -Create tests against a **QA database and object store** (separate from -production): - -**Unit tests:** - -- Table definitions compile correctly -- Schema relationships are valid -- Populate methods work for individual tables -- Query patterns return expected results - -**Integration tests:** - -- End-to-end pipeline execution -- Data flows through computed tables correctly -- External file references work (if using ``) -- Object storage operations work (if using in-store codecs) - -**Example test structure:** - -```python -# tests/test_tables.py -import pytest -import datajoint as dj -from my_pipeline import Mouse, Session, Neuron - -@pytest.fixture -def test_schema(): - """Use QA database for testing.""" - dj.config['database.host'] = 'qa-db.example.com' - schema = dj.schema('test_pipeline') - yield schema - schema.drop() # Cleanup after test - -def test_mouse_insert(test_schema): - """Test manual table insertion.""" - Mouse.insert1({'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}) - assert len(Mouse()) == 1 - -def test_session_populate(test_schema): - """Test session insertion and relationships.""" - Mouse.insert1({'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}) - Session.insert1({ - 'mouse_id': 0, - 'session_date': '2024-06-01', - 'experimenter': 'Alice' - }) - assert len(Session() & 'mouse_id=0') == 1 - -def test_neuron_computation(test_schema): - """Test computed table populate.""" - # Insert upstream data - Mouse.insert1({'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}) - Session.insert1({ - 'mouse_id': 0, - 'session_date': '2024-06-01', - 'experimenter': 'Alice' - }) - - # Populate computed table - Neuron.populate() - - # Verify results - assert len(Neuron()) > 0 - -def test_query_patterns(test_schema): - """Test common query patterns.""" - # Setup data - Mouse.insert1({'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}) - - # Test fetch - mice = Mouse.fetch(as_dict=True) - assert len(mice) == 1 - - # Test restriction - male_mice = Mouse & 'sex="M"' - assert len(male_mice) == 1 -``` - -**Integration test example:** - -```python -# tests/test_pipeline.py -def test_full_pipeline(test_schema): - """Test complete pipeline execution.""" - # 1. Insert manual data - Mouse.insert([ - {'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}, - {'mouse_id': 1, 'dob': '2024-01-15', 'sex': 'F'}, - ]) - - Session.insert([ - {'mouse_id': 0, 'session_date': '2024-06-01', 'experimenter': 'Alice'}, - {'mouse_id': 1, 'session_date': '2024-06-03', 'experimenter': 'Bob'}, - ]) - - # 2. Populate computed tables - Neuron.populate() - Analysis.populate() - - # 3. Verify data flows correctly - assert len(Mouse()) == 2 - assert len(Session()) == 2 - assert len(Neuron()) > 0 - assert len(Analysis()) > 0 - - # 4. Test queries work - alice_sessions = Session & 'experimenter="Alice"' - assert len(alice_sessions) == 1 -``` - -### How to Use Tests with AI Agents - -Once tests are created, an AI agent can: - -```bash -# Agent workflow for Phase I -1. git checkout -b pre/v2.0 -2. Update schema declarations to _v2 -3. Convert table definitions to 2.0 syntax -4. Convert API calls (fetch → to_dicts, etc.) -5. Run: pytest tests/ -6. Fix any failures iteratively -7. 
Repeat 5-6 until all tests pass -8. Phase I complete automatically! - -# Agent workflow for Phase II -1. Populate _v2 schemas with test data -2. Run tests against _v2 schemas -3. Compare with legacy results -4. Generate validation report -5. Phase II complete automatically! -``` - -### Investment vs. Return - -**Time investment:** - -- Creating tests: ~1-3 days -- QA database setup: ~1-2 hours - -**Time saved:** - -- Phase I: ~50-75% reduction (mostly automated) -- Phase II: ~80% reduction (fully automated validation) -- Phase III: Higher confidence, faster debugging - -**Net benefit:** Tests pay for themselves during migration and provide ongoing -value for future development. - -### When to Skip Tests - -Skip test creation if: - -- Pipeline is very simple (few tables, no computation) -- One-time migration with no ongoing development -- Team has extensive manual testing procedures already -- Time pressure requires starting migration immediately - -**Note:** Even minimal tests (just table insertion and populate) provide -significant value for automated migration. - ---- - -## Phase I: Branch and Code Migration - -**Goal:** Implement complete 2.0 API in code using test schemas and test stores. - -**End state:** - -- All Python code uses 2.0 API patterns (fetch, types, codecs) -- All codecs implemented (in-table ``, `` AND in-store ``, legacy only) -- Code points to `schema_v2` databases (empty) and test object stores -- Production continues on main branch with pre-2.0 undisturbed - -**What's NOT migrated yet:** Production data and production stores (Phase III) - -### Step 1: Pin Legacy DataJoint on Main Branch - -Ensure production code stays on pre-2.0: - -```bash -git checkout main - -# Pin legacy version in requirements -echo "datajoint<2.0.0" > requirements.txt - -git add requirements.txt -git commit -m "chore: pin legacy datajoint for production" -git push origin main -``` - -**Why:** This prevents accidental upgrades to 2.0 in production. - -### Step 2: Create Migration Branch - -```bash -# Create feature branch -git checkout -b pre/v2.0 - -# Install DataJoint 2.0 -pip install --upgrade pip -pip install "datajoint>=2.0.0" - -# Update requirements -echo "datajoint>=2.0.0" > requirements.txt - -git add requirements.txt -git commit -m "chore: upgrade to datajoint 2.0" -``` - -### Step 3: Update Schema Declarations - -**Critical early step:** Update all `dj.schema()` calls to use `_v2` suffix -for parallel testing and validation. - -**Why do this first:** - -- Creates parallel schemas alongside production (e.g., `my_pipeline_v2`) -- Allows testing 2.0 code without affecting production schemas -- Enables side-by-side validation in Phase II -- Production schemas remain untouched on `main` branch - -#### Find All Schema Declarations - -```bash -# Find all schema() calls in your codebase -grep -rn "dj.schema\|dj.Schema" --include="*.py" . 
- -# Example output: -# pipeline/session.py:5:schema = dj.schema('my_pipeline') -# pipeline/analysis.py:8:schema = dj.schema('my_pipeline') -# pipeline/ephys.py:3:schema = dj.schema('ephys_pipeline') -``` - -#### Update Schema Names - -**For each schema declaration, add `_v2` suffix:** - -```python -# BEFORE (production, on main branch) -schema = dj.schema('my_pipeline') - -# AFTER (testing, on pre/v2.0 branch) -schema = dj.schema('my_pipeline_v2') -``` - -**Multiple schemas example:** - -```python -# BEFORE -session_schema = dj.schema('sessions') -analysis_schema = dj.schema('analysis') -ephys_schema = dj.schema('ephys') - -# AFTER -session_schema = dj.schema('sessions_v2') -analysis_schema = dj.schema('analysis_v2') -ephys_schema = dj.schema('ephys_v2') -``` - -#### AI Agent Prompt: Update Schema Declarations - ---- - -**🤖 AI Agent Prompt: Phase I - Update Schema Declarations with _v2 -Suffix** - -``` -You are updating DataJoint schema declarations for 2.0 migration testing. - -TASK: Add _v2 suffix to all dj.schema() calls for parallel testing. - -CONTEXT: -- Branch: pre/v2.0 (just created) -- Production schemas on main branch remain unchanged -- _v2 schemas will be empty until table definitions are converted -- This enables side-by-side testing without affecting production - -STEPS: - -1. Find all schema declarations: - grep -rn "dj.schema\|dj.Schema" --include="*.py" . - -2. For EACH schema declaration, add _v2 suffix: - OLD: schema = dj.schema('my_pipeline') - NEW: schema = dj.schema('my_pipeline_v2') - -3. Preserve all other arguments: - OLD: schema = dj.schema('sessions', locals()) - NEW: schema = dj.schema('sessions_v2', locals()) - - OLD: schema = dj.schema('analysis', create_schema=True) - NEW: schema = dj.schema('analysis_v2', create_schema=True) - -4. 
Update any string references to schema names: - OLD: conn.query("USE my_pipeline") - NEW: conn.query("USE my_pipeline_v2") - - OLD: if schema_name == 'my_pipeline': - NEW: if schema_name == 'my_pipeline_v2': - -NAMING CONVENTION: - -- my_pipeline → my_pipeline_v2 -- sessions → sessions_v2 -- ephys_pipeline → ephys_pipeline_v2 -- lab.mouse → lab.mouse_v2 - -VERIFICATION: - -After updating, verify: -- All dj.schema() calls have _v2 suffix -- No hard-coded schema names without _v2 suffix -- No duplicate schema names (each should be unique) - -COMMIT: - -git add -A -git commit -m "feat(phase-i): add _v2 suffix to all schema declarations - -- Update all dj.schema() calls to use _v2 suffix -- Enables parallel testing without affecting production schemas -- Production schemas on main branch remain unchanged - -Schemas updated: -- my_pipeline → my_pipeline_v2 -- [list other schemas...]" -``` - ---- - -#### Verify Schema Name Changes - -```python -import datajoint as dj - -# Test connection (should work before any tables created) -conn = dj.conn() -print("✓ Connected to database") - -# At this point, _v2 schemas don't exist yet -# They will be created in Step 5 when table definitions are applied -``` - -#### Commit Schema Declaration Changes - -```bash -git add -A -git commit -m "feat(phase-i): add _v2 suffix to all schema declarations - -- Update all dj.schema() calls to use _v2 suffix -- Enables parallel testing without affecting production schemas -- Next: configure stores and convert table definitions" -``` - -**Next steps:** - -- Step 4: Configure object stores (if applicable) -- Step 5: Convert table definitions to 2.0 syntax -- When table definitions are applied, `_v2` schemas will be created - -### Step 4: Configure DataJoint 2.0 - -Create new configuration files for 2.0. - -**Note:** Schema declarations already updated in Step 3 with `_v2` suffix. -Now configure database connection and stores. - -#### Background: Configuration Changes - -DataJoint 2.0 uses: - -- **`.secrets/datajoint.json`** for credentials (gitignored) -- **`datajoint.json`** for non-sensitive settings (checked in) -- **`stores.*`** instead of `external.*` - -**Learn more:** [Configuration Reference](../reference/configuration.md) - -#### Create Configuration Files - -```bash -# Create .secrets directory -mkdir -p .secrets -echo ".secrets/" >> .gitignore - -# Create template -python -c "import datajoint as dj; dj.config.save_template()" -``` - -**Edit `.secrets/datajoint.json`:** -```json -{ - "database.host": "your-database-host", - "database.user": "your-username", - "database.password": "your-password" -} -``` - -**Edit `datajoint.json`:** -```json -{ - "loglevel": "INFO", - "safemode": true, - "display.limit": 12, - "display.width": 100, - "display.show_tuple_count": true -} -``` - -#### Verify Connection - -```python -import datajoint as dj - -# Test connection -conn = dj.conn() -print(f"Connected to {conn.conn_info['host']}") -``` - -### Step 5: Configure Test Object Stores (If Applicable) - -**Skip this step if:** Your legacy pipeline uses only in-table storage (`longblob`, `mediumblob`, `blob`, `attach`). You can skip to Step 6. - -**Configure test stores if:** Your legacy pipeline uses pre-2.0 in-store formats: - -- `blob@store` (hash-addressed blobs in object store) -- `attach@store` (hash-addressed attachments in object store) -- `filepath@store` (filepath references to external files) - -**Note:** `` and `` are NEW in 2.0 (schema-addressed storage). They have no legacy equivalent and don't need migration. 
Adopt them in Phase IV for new features. - -#### Background: pre-2.0 Implicit vs 2.0 Explicit Codecs - -**pre-2.0 implicit serialization:** - -- `longblob` → automatic Python object serialization (pickle) -- `mediumblob` → automatic Python object serialization (pickle) -- `blob` → automatic Python object serialization (pickle) -- No explicit codec choice - serialization was built-in - -**2.0 explicit codecs:** - -- `` → explicit Python object serialization (same behavior, now explicit) -- `` → explicit file attachment (was separate feature) -- Legacy in-store formats converted to explicit ``, ``, `` syntax - -#### Background: Unified Stores - -2.0 uses **unified stores** configuration: - -- Single `stores.*` config for all storage types (hash-addressed + schema-addressed + filepath) -- Named stores with `default` pointer -- Supports multiple stores with different backends - -**Learn more:** [Configure Object Storage](configure-storage.md) · [Object Store Configuration Spec](../reference/specs/object-store-configuration.md) - -#### Configure Test Stores - -**Edit `datajoint.json` to use test directories:** -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/v2_test_stores/main" - } - } -} -``` - -**Note:** Use separate test locations (e.g., `/data/v2_test_stores/`) to avoid conflicts with production stores. - -**For multiple test stores:** -```json -{ - "stores": { - "default": "main", - "filepath_default": "raw_data", - "main": { - "protocol": "file", - "location": "/data/v2_test_stores/main" - }, - "raw_data": { - "protocol": "file", - "location": "/data/v2_test_stores/raw" - } - } -} -``` - -**For cloud storage (using test bucket/prefix):** -```json -{ - "stores": { - "default": "s3_store", - "s3_store": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-datajoint-test-bucket", - "location": "v2-test" - } - } -} -``` - -**Store credentials in `.secrets/stores.s3_store.access_key` and -`.secrets/stores.s3_store.secret_key`:** -```bash -echo "YOUR_ACCESS_KEY" > .secrets/stores.s3_store.access_key -echo "YOUR_SECRET_KEY" > .secrets/stores.s3_store.secret_key -``` - -#### AI Agent Prompt: Configure and Test Stores - ---- - -**🤖 AI Agent Prompt: Phase I - Configure Test Stores and Verify Codecs** - -``` -You are configuring test object stores for DataJoint 2.0 migration. - -TASK: Set up test stores and verify all in-store codecs work correctly -before migrating production data. - -CONTEXT: -- Phase I uses TEST stores (separate from production) -- Testing verifies codecs work with legacy schema structure -- File organization must match expectations -- Production data migration happens in Phase III - -STEPS: - -1. Configure test stores in datajoint.json: - - Use test locations (e.g., /data/v2_test_stores/) - - For cloud: use test bucket or prefix (e.g., "v2-test") - - Configure hash_prefix, schema_prefix, filepath_prefix if needed - -2. Store credentials in .secrets/ directory: - - Create .secrets/stores..access_key (S3/GCS/Azure) - - Create .secrets/stores..secret_key (S3) - - Verify .secrets/ is gitignored - -3. Test ALL in-store codecs from legacy schema: - - (hash-addressed blob storage) - - (hash-addressed attachments) - - (filepath references) - -4. Create test table with all three codecs: - ```python - @schema - class StoreTest(dj.Manual): - definition = """ - test_id : int - --- - blob_data : - attach_data : - filepath_data : - """ - ``` - -5. 
Insert test data and verify: - - Insert sample data for each codec - - Fetch data back successfully - - Verify files appear at expected paths - -6. Understand hash-addressed storage structure: - {location}/{hash_prefix}/{schema_name}/{hash}[.ext] - - With subfolding [2, 2]: - {location}/{hash_prefix}/{schema_name}/{h1}{h2}/{h3}{h4}/{hash}[.ext] - - Properties: - - Immutable (content-addressed) - - Deduplicated (same content → same path) - - Integrity (hash validates content) - -7. Verify file organization meets expectations: - - Check files exist at {location}/{hash_prefix}/{schema}/ - - Verify subfolding structure if configured - - Confirm filepath references work correctly - -8. Clean up test: - - Delete test data - - Drop test schema - - Verify no errors during cleanup - -HASH-ADDRESSED STORAGE: - -Understanding the hash-addressed section is critical for migration: - -- Path format: {location}/{hash_prefix}/{schema}/{hash} -- Hash computed from serialized content (Blake2b) -- Hash encoded as base32 (lowercase, no padding) -- Subfolding splits hash into directory levels -- Same content always produces same path (deduplication) - -Example with hash_prefix="_hash", subfolding=[2,2]: - /data/store/_hash/my_schema/ab/cd/abcdef123456... - -Learn more: Object Store Configuration Spec -(../reference/specs/object-store-configuration.md#hash-addressed-storage) - -VERIFICATION: - -- [ ] Test stores configured in datajoint.json -- [ ] Credentials stored in .secrets/ (not committed) -- [ ] Connection to test stores successful -- [ ] codec tested and working -- [ ] codec tested and working -- [ ] codec tested and working -- [ ] Files appear at expected locations -- [ ] Hash-addressed structure understood -- [ ] Test cleanup successful - -REPORT: - -Test results for store configuration: -- Store names: [list configured stores] -- Store protocol: [file/s3/gcs/azure] -- Store location: [test path/bucket] -- Hash prefix: [configured value] -- Codecs tested: [blob@, attach@, filepath@] -- Files verified at: [example paths] -- Issues found: [any errors or unexpected behavior] - -COMMIT MESSAGE: -"feat(phase-i): configure test stores and verify codecs - -- Configure test stores with [protocol] at [location] -- Store credentials in .secrets/ directory -- Test all in-store codecs: blob@, attach@, filepath@ -- Verify hash-addressed file organization -- Confirm codecs work with legacy schema structure - -Test stores ready for table definition conversion." -``` - ---- - -#### Test In-Store Codecs - -After configuring test stores, verify that in-store codecs work correctly -and understand the file organization. 
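
Before creating the test table below, a quick sanity check of the store configuration can save a debugging round. A sketch, assuming the `stores` layout shown earlier (a `default` pointer plus named store entries) and a file-protocol test store:

```python
import os
import datajoint as dj

stores = dj.config["stores"]
default_name = stores["default"]            # e.g. "main"
spec = stores[default_name]
print(f"Default store: {default_name} (protocol={spec['protocol']})")

if spec["protocol"] == "file":
    # File-backed test stores: the location must exist and be writable.
    location = spec["location"]
    os.makedirs(location, exist_ok=True)
    assert os.access(location, os.W_OK), f"Test store not writable: {location}"
    print(f"✓ Test store ready at {location}")
else:
    # Cloud stores: credentials come from .secrets/stores.<name>.* files.
    print("Cloud store configured; verify .secrets/ credentials before inserting.")
```
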
- -**Create test table with all in-store codecs:** - -```python -import datajoint as dj -import numpy as np - -# Create test schema -schema = dj.schema('test_stores_v2') - -@schema -class StoreTest(dj.Manual): - definition = """ - test_id : int - --- - blob_data : # Hash-addressed blob - attach_data : # Hash-addressed attachment - filepath_data : # Filepath reference - """ - -# Test data -test_blob = {'key': 'value', 'array': [1, 2, 3]} -test_attach = {'metadata': 'test attachment'} - -# For filepath, create test file first -import tempfile -import os -temp_dir = tempfile.gettempdir() -test_file_path = 'test_data/sample.txt' -full_path = os.path.join( - dj.config['stores']['default']['location'], - test_file_path -) -os.makedirs(os.path.dirname(full_path), exist_ok=True) -with open(full_path, 'w') as f: - f.write('test content') - -# Insert test data -StoreTest.insert1({ - 'test_id': 1, - 'blob_data': test_blob, - 'attach_data': test_attach, - 'filepath_data': test_file_path -}) - -print("✓ Test data inserted successfully") -``` - -**Verify file organization:** - -```python -# Fetch and verify -result = (StoreTest & {'test_id': 1}).fetch1() -print(f"✓ blob_data: {result['blob_data']}") -print(f"✓ attach_data: {result['attach_data']}") -print(f"✓ filepath_data: {result['filepath_data']}") - -# Inspect hash-addressed file organization -store_spec = dj.config.get_store_spec() -hash_prefix = store_spec.get('hash_prefix', '_hash') -location = store_spec['location'] - -print(f"\nStore organization:") -print(f" Location: {location}") -print(f" Hash prefix: {hash_prefix}/") -print(f" Expected structure: {hash_prefix}/{{schema}}/{{hash}}") -print(f"\nVerify files exist at:") -print(f" {location}/{hash_prefix}/test_stores_v2/") -``` - -**Review hash-addressed storage structure:** - -Hash-addressed storage (``, ``) uses content-based paths: - -``` -{location}/{hash_prefix}/{schema_name}/{hash}[.ext] -``` - -With subfolding enabled (e.g., `[2, 2]`): - -``` -{location}/{hash_prefix}/{schema_name}/{h1}{h2}/{h3}{h4}/{hash}[.ext] -``` - -**Properties:** - -- **Immutable**: Content defines path, cannot be changed -- **Deduplicated**: Identical content stored once -- **Integrity**: Hash validates content on retrieval - -**Learn more:** [Object Store Configuration — Hash-Addressed Storage] -(../reference/specs/object-store-configuration.md#hash-addressed-storage) - -**Cleanup test:** - -```python -# Remove test data -(StoreTest & {'test_id': 1}).delete() -schema.drop() -print("✓ Test cleanup complete") -``` - -### Step 6: Convert Table Definitions - -Update table definitions in topological order (tables before their dependents). - -**Note:** Schema declarations already updated to `_v2` suffix in Step 3. 
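
To scope the conversion work before editing, a small helper can inventory legacy type usage across the package. A sketch only: the `src/` directory and the keyword list are assumptions to adapt to your codebase, and hits still need review against the conversion tables below.

```python
from collections import Counter
from pathlib import Path

# Legacy patterns to review against the conversion tables below.
LEGACY_PATTERNS = [
    "int unsigned", "smallint unsigned", "tinyint unsigned", "bigint unsigned",
    "tinyint(1)", "timestamp", "longblob", "mediumblob",
    "blob@", "attach@", "filepath@",
]

counts = Counter()
for py_file in Path("src").rglob("*.py"):    # adjust to your package layout
    source = py_file.read_text()
    for pattern in LEGACY_PATTERNS:
        hits = source.count(pattern)
        if hits:
            counts[pattern] += hits
            print(f"{py_file}: {hits:3d} x {pattern!r}")

print("\nTotals:", dict(counts))
```
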
- -#### Background: Type Syntax Changes - -Convert ALL types and codecs in Phase I: - -**Integer and Float Types:** - -| pre-2.0 | 2.0 | Category | -|--------|-----|----------| -| `int unsigned` | `uint32` | Core type | -| `int` | `int32` | Core type | -| `smallint unsigned` | `uint16` | Core type | -| `tinyint unsigned` | `uint8` | Core type | -| `bigint unsigned` | `uint64` | Core type | -| `float` | `float32` | Core type | -| `double` | `float64` | Core type | - -**String, Date, and Structured Types:** - -| pre-2.0 | 2.0 | Notes | -|--------|-----|-------| -| `varchar(N)`, `char(N)` | Unchanged | Core types | -| `date` | Unchanged | Core type | -| `enum('a', 'b')` | Unchanged | Core type | -| `bool`, `boolean` | `bool` | Core type (MySQL stores as tinyint(1)) | -| `datetime` | `datetime` | Core type; UTC standard in 2.0 | -| `timestamp` | `datetime` | **Ask user:** Review timezone convention, convert to UTC datetime | -| `json` | `json` | Core type (was available but underdocumented) | -| `uuid` | `uuid` | Core type (widely used in legacy) | -| `text` | `varchar(N)` or keep as native | **Native type:** Consider migrating to `varchar(n)` | -| `time` | `datetime` or keep as native | **Native type:** Consider using `datetime` | -| `tinyint(1)` | `bool` or `uint8` | **Ask user:** was this boolean or small integer? | - -**Codecs:** - -| pre-2.0 | 2.0 | Category | -|--------|-----|----------| -| `longblob` | `` | Codec (in-table) | -| `attach` | `` | Codec (in-table) | -| `blob@store` | `` | Codec (in-store) | -| `attach@store` | `` | Codec (in-store) | -| `filepath@store` | `` | Codec (in-store) | - -**Important Notes:** - -- **Core vs Native Types:** DataJoint 2.0 distinguishes **core types** (portable, standardized) from **native types** (backend-specific). Core types are preferred. Native types like `text` and `time` are allowed but discouraged—they may generate warnings and lack portability guarantees. - -- **Datetime/Timestamp:** DataJoint 2.0 adopts **UTC as the standard for all datetime storage**. The database stores UTC; timezones are handled by application front-ends and client APIs. For `timestamp` columns, review your existing timezone convention—you may need data conversion. We recommend adopting UTC throughout your pipeline and converting `timestamp` to `datetime`. - -- **Bool:** Legacy DataJoint supported `bool` and `boolean` types (MySQL stores as `tinyint(1)`). Keep as `bool` in 2.0. Only explicit `tinyint(1)` declarations need review: - - If used for boolean semantics (yes/no, active/inactive) → `bool` - - If used for small integers (counts, indices 0-255) → `uint8` - -- **Text Type:** `text` is a native MySQL type, not a core type. Consider migrating to `varchar(n)` with appropriate length. If your text truly needs unlimited length, you can keep `text` as a native type (will generate a warning). - -- **Time Type:** `time` is a native MySQL type with no core equivalent. We recommend migrating to `datetime` (which can represent both date and time components). If you only need time-of-day without date, you can keep `time` as a native type (will generate a warning). - -- **JSON:** Core type that was available in pre-2.0 but underdocumented. Many users serialized JSON into blobs. If you have custom JSON serialization in blobs, you can migrate to native `json` type (optional). - -- **Enum:** Core type—no changes needed. - -- **In-store codecs:** Code is converted in Phase I using test stores. Production data migration happens in Phase III. 
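
Putting the notes above together, a typical manual-table conversion might look like the sketch below. The `Session` table is hypothetical, the timezone and max-length decisions are assumed to have been confirmed with the team, and blob/attachment columns are omitted here because codec conversions are covered in the agent prompt that follows.

```python
import datajoint as dj

schema = dj.schema("my_pipeline_v2")   # _v2 suffix added in Step 3

@schema
class Session(dj.Manual):
    definition = """
    # pre-2.0 types shown in the trailing attribute comments
    subject_id    : uint32          # was: int unsigned
    session_idx   : uint16          # was: smallint unsigned
    ---
    session_start : datetime        # was: timestamp (confirmed already UTC)
    duration_s    : float64         # was: double
    is_valid      : bool            # was: tinyint(1) with yes/no semantics
    notes         : varchar(1000)   # was: text (max length confirmed ~1000)
    """
```
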
- -**Learn more:** [Type System Reference](../reference/specs/type-system.md) · [Definition Syntax](../reference/definition-syntax.md) - -#### AI Agent Prompt: Convert Table Definitions - -Use this prompt with your AI coding assistant: - ---- - -**🤖 AI Agent Prompt: Phase I - Table Definition Conversion** - -``` -You are converting DataJoint pre-2.0 table definitions to 2.0 syntax. - -TASK: Update all table definitions in this repository to DataJoint 2.0 type syntax. - -CONTEXT: - -- We are on branch: pre/v2.0 -- Production (main branch) remains on pre-2.0 -- Schema declarations ALREADY updated with _v2 suffix (Step 3) -- Now converting table definitions to match -- Schemas will be created empty when definitions are applied - -SCOPE - PHASE I: - -1. Convert ALL type syntax to 2.0 core types -2. Convert ALL legacy codecs (in-table AND in-store) - - In-table: longblob → , mediumblob → , attach → - - In-store (legacy only): blob@store → , attach@store → , filepath@store → -3. Code will use TEST stores configured in datajoint.json -4. Do NOT add new 2.0 codecs (, ) - these are for Phase IV adoption -5. Production data migration happens in Phase III (code is complete after Phase I) - -TYPE CONVERSIONS: - -Core Types (Integer and Float): - int unsigned → uint32 - int → int32 - smallint unsigned → uint16 - smallint → int16 - tinyint unsigned → uint8 - tinyint → int8 - bigint unsigned → uint64 - bigint → int64 - float → float32 - double → float64 - decimal(M,D) → decimal(M,D) # unchanged - -Core Types (String and Date): - varchar(N) → varchar(N) # unchanged (core type) - char(N) → char(N) # unchanged (core type) - date → date # unchanged (core type) - enum('a', 'b') → enum('a', 'b') # unchanged (core type) - bool → bool # unchanged (core type, MySQL stores as tinyint(1)) - boolean → bool # unchanged (core type, MySQL stores as tinyint(1)) - datetime → datetime # unchanged (core type) - -Core Types (Structured Data): - json → json # unchanged (core type, was available but underdocumented in pre-2.0) - uuid → uuid # unchanged (core type, widely used in pre-2.0) - -Native Types (Discouraged but Allowed): - text → Consider varchar(N) with appropriate length, or keep as native type - time → Consider datetime (can represent date+time), or keep as native type - -Special Cases - REQUIRE USER REVIEW: - - tinyint(1) → ASK USER: bool or uint8? - Note: Legacy DataJoint had bool/boolean types. Only explicit tinyint(1) needs review. - - Boolean semantics (yes/no, active/inactive) → bool - - Small integer (counts, indices 0-255) → uint8 - Example: - is_active : tinyint(1) # Boolean semantics → bool - priority : tinyint(1) # 0-10 scale → uint8 - has_data : bool # Already bool → keep as bool - - timestamp → ASK USER about timezone convention, then convert to datetime - Example: - created_at : timestamp # pre-2.0 (UNKNOWN timezone convention) - created_at : datetime # 2.0 (UTC standard) - -IMPORTANT - Datetime and Timestamp Conversion: - -DataJoint 2.0 adopts UTC as the standard for all datetime storage (no timezone information). -The database stores UTC; timezones are handled by application front-ends and client APIs. - -Conversion rules: - -- datetime → Keep as datetime (assume UTC, core type) -- timestamp → ASK USER about timezone convention, then convert to datetime -- date → Keep as date (core type) -- time → ASK USER: recommend datetime (core type) or keep as time (native type) - -For EACH timestamp column, ASK THE USER: - -1. "What timezone convention was used for [column_name]?" 
- - UTC (no conversion needed) - - Server local time (requires conversion to UTC) - - Application local time (requires conversion to UTC) - - Mixed/unknown (requires data audit) - -2. "Does this use MySQL's auto-update behavior (ON UPDATE CURRENT_TIMESTAMP)?" - - If yes, may need to update table schema - - If no, application controls the value - -3. After clarifying, recommend: - - Convert type: timestamp → datetime - - If not already UTC: Add data conversion script to Phase III - - Update application code to store UTC times - - Handle timezone display in application front-ends and client APIs - -Example conversation: - AI: "I found timestamp column 'session_time'. What timezone was used?" - User: "Server time (US/Eastern)" - AI: "I recommend converting to UTC. I'll convert the type to datetime and add a - data conversion step in Phase III to convert US/Eastern times to UTC." - -Example: - # pre-2.0 - session_time : timestamp # Was storing US/Eastern - event_time : timestamp # Already UTC - - # 2.0 (after user confirmation) - session_time : datetime # Converted to UTC in Phase III - event_time : datetime # No data conversion needed - -IMPORTANT - Bool Type: - -Legacy DataJoint already supported bool and boolean types (MySQL stores as tinyint(1)). - -Conversion rules: - -- bool → Keep as bool (no change) -- boolean → Keep as bool (no change) -- tinyint(1) → ASK USER: was this boolean or small integer? - -Only explicit tinyint(1) declarations need review because: - -- Legacy had bool/boolean for true/false values -- Some users explicitly used tinyint(1) for small integers (0-255) - -Example: - # pre-2.0 - is_active : bool # Already bool → no change - enabled : boolean # Already boolean → bool - is_valid : tinyint(1) # ASK: Boolean semantics? → bool - n_retries : tinyint(1) # ASK: Small integer? → uint8 - - # 2.0 - is_active : bool # Unchanged - enabled : bool # boolean → bool - is_valid : bool # Boolean semantics - n_retries : uint8 # Small integer - -IMPORTANT - Enum Types: - -enum is a core type—no changes required. - -Example: - sex : enum('M', 'F', 'U') # No change needed - -IMPORTANT - JSON Type: - -json is a core type that was available in pre-2.0 but underdocumented. Many users -serialized JSON into blobs. If you have custom JSON serialization in blobs, you can -migrate to native json type (optional migration, not required). - -Example: - # Optional: migrate blob with JSON to native json - config : longblob # Contains serialized JSON - config : json # Core JSON type (optional improvement) - -IMPORTANT - Native Types (text and time): - -text and time are NATIVE MySQL types, NOT core types. They are allowed but discouraged. - -For text: - -- ASK USER: What is the maximum expected length? -- Recommend migrating to varchar(n) with appropriate length (core type) -- Or keep as text (native type, will generate warning) - -For time: - -- ASK USER: Is this time-of-day only, or is date also relevant? 
-- Recommend migrating to datetime (core type, can represent date+time) -- Or keep as time (native type, will generate warning) - -Example: - # pre-2.0 - description : text # Native type - session_start : time # Native type (time-of-day) - - # 2.0 (recommended) - description : varchar(1000) # Core type (after asking user about max length) - session_start : datetime # Core type (if date is also relevant) - - # 2.0 (alternative - keep native) - description : text # Native type (if truly unlimited length needed) - session_start : time # Native type (if only time-of-day needed) - -In-Table Codecs: - longblob → - attach → - -In-Store Codecs (LEGACY formats only - convert these): - blob@store → # Add angle brackets - attach@store → # Add angle brackets - filepath@store → # Add angle brackets - -IMPORTANT - Do NOT use these during migration (NEW in 2.0): - # Schema-addressed storage - NEW feature - # Schema-addressed storage - NEW feature - # These have NO legacy equivalent - # Adopt in Phase IV AFTER migration is complete - # Do NOT convert existing attributes to these codecs - -SCHEMA DECLARATIONS: - OLD: schema = dj.schema('my_pipeline') - NEW: schema = dj.schema('my_pipeline_v2') - -PROCESS: -1. Identify all Python files with DataJoint schemas -2. For each schema: - a. Update schema declaration (add _v2 suffix) - b. Create schema on database (empty for now) -3. For each table definition in TOPOLOGICAL ORDER: - a. Convert ALL type syntax (core types + all codecs) - b. Verify syntax is valid -4. Test that all tables can be declared (run file to create tables) -5. Verify in-store codecs work with test stores - -VERIFICATION: - -- All schema declarations use _v2 suffix -- All native types converted to core types -- All codecs converted (in-table AND in-store) -- Test stores configured and accessible -- No syntax errors -- All tables create successfully (empty) - -EXAMPLE CONVERSION: - -# pre-2.0 -schema = dj.schema('neuroscience_pipeline') - -@schema -class Recording(dj.Manual): - definition = """ - recording_id : int unsigned - --- - sampling_rate : float - signal : blob@raw # pre-2.0 in-store syntax - waveforms : blob@raw # pre-2.0 in-store syntax - metadata : longblob # pre-2.0 in-table - """ - -# 2.0 (Phase I with test stores) -schema = dj.schema('neuroscience_pipeline_v2') - -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uint32 - --- - sampling_rate : float32 - signal : # Converted: blob@raw → - waveforms : # Converted: blob@raw → - metadata : # Converted: longblob → - """ - -# Phase I: Only convert existing legacy formats -# Do NOT add new codecs like during migration - -# If you want to adopt later (Phase IV), that's a separate step: -# - After migration is complete -# - For new features or performance improvements -# - Not required for migration - -REPORT: - -- Schemas converted: [list with _v2 suffix] -- Tables converted: [count by schema] -- Type conversions: [count by type] -- Codecs converted: - - In-table: [count of , ] - - In-store: [count of , , ] -- Tables created successfully: [list] -- Test stores configured: [list store names] - -COMMIT MESSAGE FORMAT: -"feat(phase-i): convert table definitions to 2.0 syntax - -- Update schema declarations to *_v2 -- Convert native types to core types (uint32, float64, etc.) 
-- Convert all codecs (in-table + in-store) -- Configure test stores for development/testing - -Tables converted: X -Codecs converted: Y (in-table: Z, in-store: W)" -``` - ---- - -### Step 7: Convert Query and Insert Code - -Update all DataJoint API calls to 2.0 patterns. - -#### Background: API Changes - -**Fetch API:** - -- `fetch()` → `to_arrays()` (recarray-like) or `to_dicts()` (list of dicts) -- `fetch(..., format="frame")` → `to_pandas()` (pandas DataFrame) -- `fetch('attr1', 'attr2')` → `to_arrays('attr1', 'attr2')` (returns tuple) -- `fetch1()` → unchanged (still returns dict for single row) - -**Update Method:** - -- `(table & key)._update('attr', val)` → `table.update1({**key, 'attr': val})` - -**Join Operators:** - -- `table1 @ table2` → `table1 * table2` (natural join with semantic checks enabled) -- `a.join(b, left=True)` → Consider `a.extend(b)` - -**Universal Set:** - -- `dj.U('attr') & table` → Unchanged (correct pattern for projecting attributes) -- `dj.U('attr') * table` → `table` (was a hack to change primary key) - -**Visualization:** - -- `dj.ERD(schema)` → `dj.Diagram(schema)` (ERD deprecated) - -**Learn more:** [Fetch API Reference](../reference/specs/fetch-api.md) · [Query Operators](../reference/operators.md) - -#### AI Agent Prompt: Convert Query and Insert Code - ---- - -**🤖 AI Agent Prompt: Phase I - Query and Insert Code Conversion** - -``` -You are converting DataJoint pre-2.0 query and insert code to 2.0 API. - -TASK: Update all query, fetch, and insert code to use DataJoint 2.0 API -patterns. - -LEARN MORE: See Fetch API Reference (../reference/specs/fetch-api.md), -Query Operators (../reference/operators.md), and Semantic Matching -(../reference/specs/semantic-matching.md). - -CONTEXT: -- Branch: pre/v2.0 -- Schema declarations already updated to _v2 suffix -- Table definitions already converted -- Production code on main branch unchanged - -API CONVERSIONS: - -1. Fetch API (always convert): - OLD: data = table.fetch() - NEW: data = table.to_arrays() # recarray-like - - OLD: data = table.fetch(as_dict=True) - NEW: data = table.to_dicts() # list of dicts - - OLD: data = table.fetch(format="frame") - NEW: data = table.to_pandas() # pandas DataFrame - - OLD: data = table.fetch('attr1', 'attr2') - NEW: data = table.to_arrays('attr1', 'attr2') # returns tuple - - OLD: row = table.fetch1() - NEW: row = table.fetch1() # UNCHANGED - - OLD: keys = table.fetch1('KEY') - NEW: keys = table.keys() # Returns list of dicts with primary key values - - OLD: keys, a, b = table.fetch("KEY", "a", "b") - NEW: a, b = table.to_arrays('a', 'b', include_key=True) - # Returns tuple with keys included - -2. Update Method (always convert): - OLD: (table & key)._update('attr', value) - NEW: table.update1({**key, 'attr': value}) - -3. Join Operator (always convert): - OLD: result = table1 @ table2 - NEW: result = table1 * table2 # Natural join WITH semantic checks - - IMPORTANT: The @ operator bypassed semantic checks. The * operator - enables semantic checks by default. If semantic checks fail, - INVESTIGATE—this may reveal errors in your schema or data. - - For left joins: - OLD: result = a.join(b, left=True) - NEW: result = a.extend(b) # Consider using extend for left joins - -4. 
Universal Set (CHECK - distinguish correct from hack): - CORRECT (unchanged): - result = dj.U('attr') & table # Projects specific attributes, unchanged - - HACK (always refactor): - OLD: result = dj.U('attr') * table # Was hack to change primary key - NEW: result = table # Simply use table directly - - Note: The * operator with dj.U() was a hack. Replace with just table. - -5. Insert (CHANGED - requires named keys): - OLD: table.insert([(1, 'Alice'), (2, 'Bob')]) # Positional tuples - NEW: table.insert([{'id': 1, 'name': 'Alice'}, - {'id': 2, 'name': 'Bob'}]) # Dicts - - DataJoint 2.0 requires named key-value mappings for insert: - - Dicts (most common) - - DataFrames - - Other DataJoint queries - - Positional tuples/lists are NO LONGER SUPPORTED. - -6. Delete (unchanged): - (table & key).delete() # unchanged - (table & restriction).delete() # unchanged - -PROCESS: -1. Find all Python files with DataJoint code -2. For each file: - a. Search for fetch patterns - b. Replace with 2.0 equivalents - c. Search for update patterns - d. Replace with update1() - e. Search for @ operator (replace with * for natural join) - f. Search for .join(x, left=True) patterns (consider .extend(x)) - g. Search for dj.U() * patterns (replace with just table) - h. Verify dj.U() & patterns remain unchanged -3. Run syntax checks -4. Run existing tests if available -5. If semantic checks fail after @ → * conversion, investigate schema/data - -VERIFICATION: - -- No .fetch() calls remaining (except fetch1) -- No .fetch1('KEY') calls remaining (replaced with .keys()) -- No ._update() calls remaining -- No @ operator between tables -- dj.U() * patterns replaced with just table -- dj.U() & patterns remain unchanged -- All tests pass (if available) -- Semantic check failures investigated and resolved - -COMMON PATTERNS: - -Pattern 1: Fetch all as dicts -OLD: sessions = Session.fetch(as_dict=True) -NEW: sessions = Session.to_dicts() - -Pattern 2: Fetch specific attributes -OLD: mouse_ids, dobs = Mouse.fetch('mouse_id', 'dob') -NEW: mouse_ids, dobs = Mouse.to_arrays('mouse_id', 'dob') - -Pattern 3: Fetch as pandas DataFrame -OLD: df = Mouse.fetch(format="frame") -NEW: df = Mouse.to_pandas() - -Pattern 4: Fetch single row -OLD: row = (Mouse & key).fetch1() # unchanged -NEW: row = (Mouse & key).fetch1() # unchanged - -Pattern 5: Update attribute -OLD: (Session & key)._update('experimenter', 'Alice') -NEW: Session.update1({**key, 'experimenter': 'Alice'}) - -Pattern 6: Fetch primary keys -OLD: keys = Mouse.fetch1('KEY') -NEW: keys = Mouse.keys() - -Pattern 7: Fetch with keys included -OLD: keys, weights, ages = Mouse.fetch("KEY", "weight", "age") -NEW: weights, ages = Mouse.to_arrays('weight', 'age', include_key=True) - -Pattern 8: Natural join (now WITH semantic checks) -OLD: result = Neuron @ Session -NEW: result = Neuron * Session -# Semantic checks enabled—may reveal schema errors - -Pattern 9: Left join -OLD: result = Session.join(Experiment, left=True) -NEW: result = Session.extend(Experiment) # Consider using extend - -Pattern 10: Universal set (distinguish correct from hack) -CORRECT (unchanged): -OLD: all_dates = dj.U('session_date') & Session -NEW: all_dates = dj.U('session_date') & Session # Unchanged, correct - -HACK (always replace): -OLD: result = dj.U('new_pk') * Session # Hack to change primary key -NEW: result = Session # Simply use table directly - -REPORT: - -- Files modified: [list] -- fetch() → to_arrays/to_dicts: [count] -- fetch(..., format="frame") → to_pandas(): [count] -- fetch1('KEY') → keys(): 
[count] -- _update() → update1(): [count] -- @ → * (natural join): [count] -- .join(x, left=True) → .extend(x): [count] -- dj.U() * table → table: [count] -- dj.U() & table patterns (unchanged): [count] -- dj.ERD() → dj.Diagram(): [count] -- Semantic check failures: [count and resolution] -- Tests passed: [yes/no] - -COMMIT MESSAGE FORMAT: -"feat(phase-i): convert query and insert code to 2.0 API - -- Replace fetch() with to_arrays()/to_dicts()/to_pandas() -- Replace fetch1('KEY') with keys() -- Replace _update() with update1() -- Replace @ operator with * (enables semantic checks) -- Replace .join(x, left=True) with .extend(x) -- Replace dj.ERD() with dj.Diagram() -- Replace dj.U() * table with just table (was hack) -- Keep dj.U() & table patterns unchanged (correct) -- Investigate and resolve semantic check failures - -API conversions: X fetch, Y update, Z join" -``` - ---- - -### Step 8: Update Populate Methods - -`make()` methods in Computed and Imported tables use the same API patterns covered in Steps 6-7. - -**Apply the following conversions to all `make()` methods:** - -1. **Fetch API conversions** (from Step 7) - - - `fetch()` → `to_arrays()` or `to_dicts()` - - `fetch(..., format="frame")` → `to_pandas()` - - `fetch1('KEY')` → `keys()` - - All other fetch patterns - -2. **Join conversions** (from Step 7) - - - `@` → `*` (enables semantic checks) - - `a.join(b, left=True)` → `a.extend(b)` - - `dj.U() * table` → `table` (was a hack) - -3. **Insert conversions** (NEW REQUIREMENT) - - - Positional tuples NO LONGER SUPPORTED - - Must use named key-value mappings: - ```python - # OLD (no longer works) - self.insert1((key['id'], computed_value, timestamp)) - - # NEW (required) - self.insert1({ - **key, - 'computed_value': computed_value, - 'timestamp': timestamp - }) - ``` - -**Note:** Since these are the same conversions from Step 7, you can apply them in a single pass. The only additional consideration is ensuring insert statements use dicts. 
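
For reference, a fully converted `make()` might look like the sketch below. `SpikeRate` is a hypothetical computed table, the import path is an assumption, `Recording` refers to the Step 6 example, and the computation is a placeholder.

```python
import datajoint as dj
from my_pipeline_v2.ephys import schema, Recording   # hypothetical module from Step 6

@schema
class SpikeRate(dj.Computed):
    definition = """
    -> Recording
    ---
    mean_rate : float64
    n_events  : uint32
    """

    def make(self, key):
        # fetch1() of a single attribute is unchanged in 2.0
        sampling_rate = (Recording & key).fetch1("sampling_rate")

        # placeholder computation -- replace with the real analysis
        n_events = 0
        mean_rate = n_events / sampling_rate if sampling_rate else 0.0

        # 2.0 requires named key-value mappings; positional tuples no longer work
        self.insert1({
            **key,
            "mean_rate": mean_rate,
            "n_events": n_events,
        })
```
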
- ---- - -### Step 9: Verify Phase I Complete - -#### Checklist - -- [ ] `pre/v2.0` branch created -- [ ] DataJoint 2.0 installed (`pip list | grep datajoint`) -- [ ] Configuration files created (`.secrets/`, `datajoint.json`) -- [ ] Test stores configured (if using in-store codecs) -- [ ] In-store codecs tested (``, ``, ``) -- [ ] Hash-addressed file organization verified and understood -- [ ] All schema declarations use `_v2` suffix -- [ ] All table definitions use 2.0 type syntax -- [ ] All in-table codecs converted (``, ``) -- [ ] All in-store codecs converted (``, ``, ``) -- [ ] All `fetch()` calls converted (except `fetch1()`) -- [ ] All `fetch(..., format="frame")` converted to `to_pandas()` -- [ ] All `fetch1('KEY')` converted to `keys()` -- [ ] All `._update()` calls converted -- [ ] All `@` operators converted to `*` -- [ ] All `dj.U() * table` patterns replaced with just `table` (was a hack) -- [ ] All `dj.U() & table` patterns verified as unchanged (correct) -- [ ] All `dj.ERD()` calls converted to `dj.Diagram()` -- [ ] All populate methods updated -- [ ] No syntax errors -- [ ] All `_v2` schemas created (empty) - -#### Test Schema Creation - -```python -# Run your main module to create all tables -import your_pipeline_v2 - -# Verify schemas exist -import datajoint as dj -conn = dj.conn() - -schemas = conn.query("SHOW DATABASES LIKE '%_v2'").fetchall() -print(f"Created {len(schemas)} _v2 schemas:") -for schema in schemas: - print(f" - {schema[0]}") - -# Verify tables created -for schema_name in [s[0] for s in schemas]: - tables = conn.query( - f"SELECT COUNT(*) FROM information_schema.TABLES " - f"WHERE TABLE_SCHEMA='{schema_name}'" - ).fetchone()[0] - print(f"{schema_name}: {tables} tables") -``` - -#### Commit Phase I - -```bash -# Review all changes -git status -git diff - -# Commit -git add . -git commit -m "feat: complete Phase I migration to DataJoint 2.0 - -Summary: -- Created _v2 schemas (empty) -- Converted all table definitions to 2.0 syntax -- Converted all query/insert code to 2.0 API -- Converted all populate methods -- Configured test stores for in-store codecs -- Production data migration deferred to Phase III - -Schemas: X -Tables: Y -Code files: Z" - -git push origin pre/v2.0 -``` - -✅ **Phase I Complete!** - -**You now have:** - -- 2.0-compatible code on `pre/v2.0` branch -- Empty `_v2` schemas ready for testing -- Production still running on `main` branch with pre-2.0 - -**Next:** Phase II - Test with sample data - ---- - -## Phase II: Test Compatibility and Equivalence - -**Goal:** Validate that the 2.0 pipeline produces equivalent results to the legacy pipeline. - -**End state:** - -- 2.0 pipeline runs correctly with sample data in `_v2` schemas and test stores -- Results are equivalent to running legacy pipeline on same data -- Confidence that migration is correct before touching production -- Production still untouched - -**Key principle:** Test with identical data in both legacy and v2 schemas to verify equivalence. - -### Step 1: Run Your Regular Workflow - -Use your existing data entry and populate processes on the `_v2` schemas: - -```python -# Import your v2 pipeline -from your_pipeline_v2 import schema # Points to my_pipeline_v2 - -# Follow your normal workflow: -# 1. Insert test data into manual tables (same process as usual) -# 2. Run populate on computed/imported tables (same process as usual) -# 3. 
Run any queries or analysis scripts (using 2.0 API) - -# Example (adapt to your pipeline): -# YourManualTable.insert([...]) # Your usual insert process -# YourComputedTable.populate(display_progress=True) # Your usual populate -``` - -**Key points:** - -- Use a **representative subset** of data (not full production dataset) -- Follow your **existing workflow** - don't create artificial examples -- Populate computed tables using your **normal populate process** -- Run any **existing analysis or query scripts** you have -- Test that everything works with the 2.0 API - -### Step 2: Compare with Legacy Schema (Equivalence Testing) - -**Critical:** Run identical data through both legacy and v2 pipelines to verify equivalence. - -#### Option A: Side-by-Side Comparison - -```python -# compare_legacy_v2.py -import datajoint as dj -import numpy as np - -# Import both legacy and v2 modules -import your_pipeline as legacy # pre-2.0 on main branch (checkout to test) -import your_pipeline_v2 as v2 # 2.0 on pre/v2.0 branch - -def compare_results(): - """Compare query results between legacy and v2.""" - - # Insert same data into both schemas - test_data = [ - {'mouse_id': 0, 'dob': '2024-01-01', 'sex': 'M'}, - {'mouse_id': 1, 'dob': '2024-01-15', 'sex': 'F'}, - ] - - legacy.Mouse.insert(test_data, skip_duplicates=True) - v2.Mouse.insert(test_data, skip_duplicates=True) - - # Compare query results - legacy_mice = legacy.Mouse.fetch(as_dict=True) # pre-2.0 syntax - v2_mice = v2.Mouse.to_dicts() # 2.0 syntax - - assert len(legacy_mice) == len(v2_mice), "Row count mismatch!" - - # Compare values (excluding fetch-specific artifacts) - for leg, v2_row in zip(legacy_mice, v2_mice): - for key in leg.keys(): - if leg[key] != v2_row[key]: - print(f"MISMATCH: {key}: {leg[key]} != {v2_row[key]}") - return False - - print("✓ Query results are equivalent!") - return True - -def compare_populate(): - """Compare populate results.""" - - # Populate both - legacy.Neuron.populate(display_progress=True) - v2.Neuron.populate(display_progress=True) - - # Compare counts - legacy_count = len(legacy.Neuron()) - v2_count = len(v2.Neuron()) - - assert legacy_count == v2_count, f"Count mismatch: {legacy_count} != {v2_count}" - - print(f"✓ Populate generated same number of rows: {v2_count}") - - # Compare computed values (if numeric) - for key in (legacy.Neuron & 'neuron_id=0').keys(): - leg_val = (legacy.Neuron & key).fetch1('activity') - v2_val = (v2.Neuron & key).fetch1('activity') - - if isinstance(leg_val, np.ndarray): - assert np.allclose(leg_val, v2_val, rtol=1e-9), "Array values differ!" - else: - assert leg_val == v2_val, f"Value mismatch: {leg_val} != {v2_val}" - - print("✓ Populate results are equivalent!") - return True - -if __name__ == '__main__': - print("Comparing legacy and v2 pipelines...") - compare_results() - compare_populate() - print("\n✓ All equivalence tests passed!") -``` - -Run comparison: - -```bash -python compare_legacy_v2.py -``` - -#### Option B: Data Copy and Validation - -If you can't easily import both modules: - -1. Copy sample data from production to both legacy test schema and `_v2` schema -2. Run populate on both -3. 
Use helper to compare: - -```python -from datajoint.migrate import compare_query_results - -# Compare table contents -result = compare_query_results( - prod_schema='my_pipeline', - test_schema='my_pipeline_v2', - table='neuron', - tolerance=1e-6, -) - -if result['match']: - print(f"✓ {result['row_count']} rows match") -else: - print(f"✗ Discrepancies found:") - for disc in result['discrepancies']: - print(f" {disc}") -``` - -### Step 3: Run Existing Tests - -If you have a test suite: - -```bash -# Run tests against _v2 schemas -pytest tests/ -v - -# Or specific test modules -pytest tests/test_queries.py -v -pytest tests/test_populate.py -v -``` - -### Step 4: Document Test Results - -Document your testing process and results: - -**What to document:** - -- Date of testing -- Test data used (subset, size, representative samples) -- Tables tested and row counts -- Populate results (did computed tables generate expected rows?) -- Equivalence test results (if comparing with legacy) -- Any issues found and how they were resolved -- Test suite results (if you have automated tests) - -**Purpose:** Creates a record of validation for your team and future reference. -Useful when planning production migration in Phase III. - -✅ **Phase II Complete!** - -**You now have:** - -- Validated 2.0 pipeline with sample data -- Confidence in code migration -- Test report documenting success -- Ready to migrate production data - -**Next:** Phase III - Migrate production data - ---- - -## Phase III: Migrate Production Data - -**Goal:** Migrate production data and configure production stores. Code is complete from Phase I. - -**End state:** - -- Production data migrated to `_v2` schemas -- Production stores configured (replacing test stores) -- In-store metadata updated (UUID → JSON) -- Ready to switch production to 2.0 - -**Key principle:** All code changes were completed in Phase I. This phase is DATA migration only. - -**Prerequisites:** - -- Phase I complete (all code migrated) -- Phase II complete (equivalence validated) -- Production backup created -- Production workloads quiesced - -**Options:** - -- **Option A:** Copy data, rename schemas (recommended - safest) -- **Option B:** In-place migration (for very large databases) -- **Option C:** Gradual migration with legacy compatibility - -Choose the option that best fits your needs. - -### Option A: Copy Data and Rename Schemas (Recommended) - -**Best for:** Most pipelines, especially < 1 TB - -**Advantages:** - -- Safe - production unchanged until final step -- Easy rollback -- Can practice multiple times - -**Process:** - -#### 0. Configure Production Stores - -Update `datajoint.json` to point to production stores (not test stores): - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/production_stores/main" # Production location - } - } -} -``` - -**For in-store data migration:** You can either: - -- **Keep files in place** (recommended): Point to existing pre-2.0 store locations -- **Copy to new location**: Configure new production stores and copy files - -**Commit this change:** -```bash -git add datajoint.json -git commit -m "config: update stores to production locations" -``` - -#### 1. Backup Production - -```bash -# Full backup -mysqldump --all-databases > backup_$(date +%Y%m%d).sql - -# Or schema-specific -mysqldump my_pipeline > my_pipeline_backup_$(date +%Y%m%d).sql -``` - -#### 2. 
Copy Manual Table Data - -```python -from datajoint.migrate import copy_table_data - -# Copy each manual table -tables = ['mouse', 'session', 'experimenter'] # Your manual tables - -for table in tables: - result = copy_table_data( - source_schema='my_pipeline', - dest_schema='my_pipeline_v2', - table=table, - ) - print(f"{table}: copied {result['rows_copied']} rows") -``` - -#### 3. Populate Computed Tables - -```python -from your_pipeline_v2 import Neuron, Analysis - -# Populate using 2.0 code -Neuron.populate(display_progress=True) -Analysis.populate(display_progress=True) -``` - -#### 4. Migrate In-Store Metadata - -**Important:** Your code already handles in-store codecs (converted in Phase I). This step just updates metadata format. - -If you have tables using ``, ``, or `` codecs, migrate the storage metadata from legacy BINARY(16) UUID format to 2.0 JSON format: - -```python -from datajoint.migrate import migrate_external_pointers_v2 - -# Update metadata format (UUID → JSON) -# This does NOT move files—just updates database pointers -result = migrate_external_pointers_v2( - schema='my_pipeline_v2', - table='recording', - attribute='signal', - source_store='raw', # Legacy pre-2.0 store name - dest_store='raw', # 2.0 store name (from datajoint.json) - copy_files=False, # Keep files in place (recommended) -) - -print(f"Migrated {result['rows_migrated']} pointers") -``` - -**What this does:** - -- Reads legacy BINARY(16) UUID pointers from `~external_*` hidden tables -- Creates new JSON metadata with file path, store name, hash -- Writes JSON to the `` column (code written in Phase I) -- Does NOT copy files (unless `copy_files=True`) - -**Result:** Files stay in place, but 2.0 code can now access them via the new codec system. - -#### 5. Validate Data Integrity - -```python -from datajoint.migrate import compare_query_results - -# Compare production vs _v2 -tables_to_check = ['mouse', 'session', 'neuron', 'analysis'] - -all_match = True -for table in tables_to_check: - result = compare_query_results( - prod_schema='my_pipeline', - test_schema='my_pipeline_v2', - table=table, - tolerance=1e-6, - ) - - if result['match']: - print(f"✓ {table}: {result['row_count']} rows match") - else: - print(f"✗ {table}: discrepancies found") - for disc in result['discrepancies'][:5]: - print(f" {disc}") - all_match = False - -if all_match: - print("\n✓ All tables validated! Ready for cutover.") -else: - print("\n✗ Fix discrepancies before proceeding.") -``` - -#### 6. Schedule Cutover - -**Pre-cutover checklist:** - -- [ ] Full backup verified -- [ ] All data copied -- [ ] All computed tables populated -- [ ] Validation passed -- [ ] Team notified -- [ ] Maintenance window scheduled -- [ ] All pre-2.0 clients stopped - -**Execute cutover:** - -```sql --- Rename production → old -RENAME TABLE `my_pipeline` TO `my_pipeline_old`; - --- Rename _v2 → production -RENAME TABLE `my_pipeline_v2` TO `my_pipeline`; -``` - -**Update code:** - -```bash -# On pre/v2.0 branch, update schema names back -sed -i '' 's/_v2//g' your_pipeline/*.py - -git add . -git commit -m "chore: remove _v2 suffix for production" - -# Merge to main -git checkout main -git merge pre/v2.0 -git push origin main - -# Deploy updated code -``` - -#### 7. Verify Production - -```python -# Test production after cutover -from your_pipeline import schema, Mouse, Neuron - -print(f"Mice: {len(Mouse())}") -print(f"Neurons: {len(Neuron())}") - -# Run a populate -Neuron.populate(limit=5, display_progress=True) -``` - -#### 8. 
Cleanup (After 1-2 Weeks) - -```sql --- After confirming production stable -DROP DATABASE `my_pipeline_old`; -``` - -### Option B: In-Place Migration - -**Best for:** Very large databases (> 1 TB) where copying is impractical - -**Warning:** Modifies production schema directly. Test thoroughly first! - -```python -from datajoint.migrate import migrate_schema_in_place - -# Backup first -backup_schema('my_pipeline', 'my_pipeline_backup_20260114') - -# Migrate in place -result = migrate_schema_in_place( - schema='my_pipeline', - backup=True, - steps=[ - 'update_blob_comments', # Add :: markers - 'add_lineage_table', # Create ~lineage - 'migrate_external_storage', # BINARY(16) → JSON - ] -) - -print(f"Migrated {result['steps_completed']} steps") -``` - -### Option C: Gradual Migration with Legacy Compatibility - -**Best for:** Pipelines that must support both pre-2.0 and 2.0 clients simultaneously - -**Strategy:** Create dual columns for in-store codecs - -#### 1. Add `_v2` Columns - -For each in-store attribute, add a corresponding `_v2` column: - -```sql --- Add _v2 column for in-store codec -ALTER TABLE `my_pipeline`.`recording` - ADD COLUMN `signal_v2` JSON COMMENT '::signal data'; -``` - -#### 2. Populate `_v2` Columns - -```python -from datajoint.migrate import populate_v2_columns - -result = populate_v2_columns( - schema='my_pipeline', - table='recording', - attribute='signal', - v2_attribute='signal_v2', - source_store='raw', - dest_store='raw', -) - -print(f"Populated {result['rows']} _v2 columns") -``` - -#### 3. Update Code to Use `_v2` Columns - -```python -# Update table definition -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uint32 - --- - signal : blob@raw # Legacy (pre-2.0 clients) - signal_v2 : # 2.0 clients - """ -``` - -**Both APIs work:** - -- pre-2.0 clients use `signal` -- 2.0 clients use `signal_v2` - -#### 4. Final Cutover - -Once all clients upgraded to 2.0: - -```sql --- Drop legacy column -ALTER TABLE `my_pipeline`.`recording` - DROP COLUMN `signal`; - --- Rename _v2 to original name -ALTER TABLE `my_pipeline`.`recording` - CHANGE COLUMN `signal_v2` `signal` JSON; -``` - ---- - -## Phase IV: Adopt New Features - -After successful migration, adopt DataJoint 2.0 features incrementally based -on your needs. Migration is complete - these are optional enhancements. - -### New Features Overview - -**Schema-addressed storage** (``, ``) -- Lazy-loading arrays with fsspec integration -- Hierarchical organization by primary key -- Mutable objects with streaming access -- See: [Object Storage Tutorial](../tutorials/basics/06-object-storage.ipynb) - -**Semantic matching** -- Lineage-based join validation (enabled by default with `*` operator) -- Catches errors from incompatible data combinations -- See: [Semantic Matching Spec](../reference/specs/semantic-matching.md) - -**Jobs 2.0** -- Per-table job tracking (`~~table_name`) -- Priority-based populate (with `reserve_jobs=True`) -- Improved distributed computing coordination -- See: [Distributed Computing Tutorial](../tutorials/advanced/distributed.ipynb) - -**Custom codecs** -- Domain-specific data types -- Extensible type system -- See: [Custom Codecs Tutorial](../tutorials/advanced/custom-codecs.ipynb) - -### Learning Path - -**Start here:** - -1. [Object Storage Tutorial](../tutorials/basics/06-object-storage.ipynb) - - Learn `` and `` for large arrays -2. [Distributed Computing Tutorial](../tutorials/advanced/distributed.ipynb) - - Jobs 2.0 with priority-based populate -3. 
[Custom Codecs Tutorial](../tutorials/advanced/custom-codecs.ipynb) - - Create domain-specific types - -**Reference documentation:** - -- [Object Store Configuration](../reference/specs/object-store-configuration.md) -- [NPY Codec Spec](../reference/specs/npy-codec.md) -- [Codec API](../reference/specs/codec-api.md) -- [Semantic Matching Spec](../reference/specs/semantic-matching.md) -- [AutoPopulate Spec](../reference/specs/autopopulate.md) - -**Adopt features incrementally:** - -- Start with one table using `` for large arrays -- Test performance and workflow improvements -- Expand to other tables as needed -- No need to adopt all features at once - ---- - -## Troubleshooting - -### Import Errors - -**Issue:** Module not found after migration - -**Solution:** -```python -# Ensure all imports use datajoint namespace -import datajoint as dj -from datajoint import schema, Manual, Computed -``` - -### Schema Not Found - -**Issue:** `Database 'schema_v2' doesn't exist` - -**Solution:** -```python -# Ensure schema declared and created -schema = dj.schema('schema_v2') -schema.spawn_missing_classes() -``` - -### Type Syntax Errors - -**Issue:** `Invalid type: 'int unsigned'` - -**Solution:** Update to core types -```python -# Wrong -definition = """ -id : int unsigned -""" - -# Correct -definition = """ -id : uint32 -""" -``` - -### External Storage Not Found - -**Issue:** Can't access external data after migration - -**Solution:** -```python -# Ensure stores configured -dj.config['stores.default'] = 'main' -dj.config['stores.main.location'] = '/data/stores' - -# Verify -from datajoint.settings import get_store_spec -print(get_store_spec('main')) -``` - ---- - -## Summary - -**Phase I:** Branch and code migration (~1-4 hours with AI) -- Create `pre/v2.0` branch -- Update all code to 2.0 API -- Create empty `_v2` schemas - -**Phase II:** Test with sample data (~1-2 days) -- Insert test data -- Validate functionality -- Test new features - -**Phase III:** Migrate production data (~1-7 days) -- Choose migration option -- Copy or migrate data -- Validate integrity -- Execute cutover - -**Phase IV:** Adopt new features (ongoing) -- Object storage -- Semantic matching -- Custom codecs -- Jobs 2.0 - -**Total timeline:** ~1-2 weeks for most pipelines - ---- - -## See Also - -**Core Documentation:** - -- [Type System Concept](../explanation/type-system.md) -- [Configuration Reference](../reference/configuration.md) -- [Definition Syntax](../reference/definition-syntax.md) -- [Fetch API Reference](../reference/specs/fetch-api.md) - -**Tutorials:** - -- [Object Storage](../tutorials/basics/06-object-storage.ipynb) -- [Custom Codecs](../tutorials/advanced/custom-codecs.ipynb) -- [Distributed Computing](../tutorials/advanced/distributed.ipynb) - -**Specifications:** - -- [Type System Spec](../reference/specs/type-system.md) -- [Codec API Spec](../reference/specs/codec-api.md) -- [Object Store Configuration](../reference/specs/object-store-configuration.md) -- [Semantic Matching](../reference/specs/semantic-matching.md) - - ---- -## File: how-to/model-relationships.ipynb - -# Model Relationships - -Define foreign key relationships between tables. This guide shows how different foreign key placements create different relationship types, with actual schema diagrams. 
- - -```python -import datajoint as dj - -schema = dj.Schema('howto_relationships') -schema.drop(prompt=False) -schema = dj.Schema('howto_relationships') -``` - - -## Basic Foreign Key - -Reference another table with `->`: - - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) - --- - species : varchar(32) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject - session_idx : uint16 - --- - session_date : date - """ - -dj.Diagram(Subject) + dj.Diagram(Session) -``` - - -The `->` syntax: - -- Inherits all primary key attributes from the referenced table -- Creates a foreign key constraint -- Establishes dependency for cascading deletes -- Defines workflow order (parent must exist before child) - -## Foreign Key Placement - -Where you place a foreign key determines the relationship type: - -| Placement | Relationship | Diagram Line | -|-----------|--------------|-------------| -| Entire primary key | One-to-one extension | Thick solid | -| Part of primary key | One-to-many containment | Thin solid | -| Secondary attribute | One-to-many reference | Dashed | - -## One-to-Many: Containment - -Foreign key as **part of** the primary key (above `---`): - - -```python -@schema -class Trial(dj.Manual): - definition = """ - -> Session # Part of primary key - trial_idx : uint16 # Additional PK attribute - --- - outcome : varchar(20) - """ - -dj.Diagram(Session) + dj.Diagram(Trial) -``` - - -**Thin solid line** = containment. Trials are identified **within** their session. Trial #1 for Session A is different from Trial #1 for Session B. - -Notice `Trial` is **underlined** — it introduces a new [dimension](../explanation/entity-integrity.md#schema-dimensions) (`trial_idx`). A dimension is an independent axis of variation in your data, introduced by a table that defines new primary key attributes. - -## One-to-Many: Reference - -Foreign key as **secondary attribute** (below `---`): - - -```python -@schema -class Equipment(dj.Lookup): - definition = """ - equipment_id : varchar(16) - --- - equipment_name : varchar(60) - """ - contents = [ - {'equipment_id': 'rig1', 'equipment_name': 'Main Recording Rig'}, - {'equipment_id': 'rig2', 'equipment_name': 'Backup Rig'}, - ] - -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid # Independent identity - --- - -> Equipment # Reference, not part of identity - duration : float32 - """ - -dj.Diagram(Equipment) + dj.Diagram(Recording) -``` - - -**Dashed line** = reference. Recordings have their own global identity independent of equipment. - -Both tables are **underlined** — each introduces its own dimension. - -## One-to-One: Extension - -Foreign key **is** the entire primary key: - - -```python -@schema -class SubjectDetails(dj.Manual): - definition = """ - -> Subject # Entire primary key - --- - weight : float32 - notes : varchar(1000) - """ - -dj.Diagram(Subject) + dj.Diagram(SubjectDetails) -``` - - -**Thick solid line** = extension. Each subject has at most one details record. The tables share identity. - -Notice `SubjectDetails` is **not underlined** — it doesn't introduce a new dimension. 
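
As a quick illustration of the one-to-one constraint (the values below are made up), each subject can have at most one details row because both tables share the same primary key:


```python
Subject.insert1({'subject_id': 'S001', 'species': 'Mus musculus'})
SubjectDetails.insert1({'subject_id': 'S001', 'weight': 22.5, 'notes': 'baseline cohort'})

# A second SubjectDetails row for 'S001' would be rejected as a duplicate,
# since subject_id is the entire primary key of SubjectDetails.
```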
- -## Optional (Nullable) Foreign Keys - -Make a reference optional with `[nullable]`: - - -```python -@schema -class Stimulus(dj.Lookup): - definition = """ - stimulus_type : varchar(32) - """ - contents = [{'stimulus_type': 'visual'}, {'stimulus_type': 'auditory'}] - -@schema -class TrialStimulus(dj.Manual): - definition = """ - -> Trial - --- - -> [nullable] Stimulus # Some trials have no stimulus - """ - -dj.Diagram(Trial) + dj.Diagram(Stimulus) + dj.Diagram(TrialStimulus) -``` - - -Only secondary foreign keys (below `---`) can be nullable. - -**Note:** The `[nullable]` modifier is NOT visible in diagrams — check the table definition. - -## Unique Foreign Keys - -Enforce one-to-one on a secondary FK with `[unique]`: - - -```python -@schema -class Employee(dj.Manual): - definition = """ - employee_id : uint32 - --- - name : varchar(60) - """ - -@schema -class ParkingSpot(dj.Manual): - definition = """ - spot_id : uint32 - --- - -> [unique] Employee # Each employee has at most one spot - location : varchar(30) - """ - -dj.Diagram(Employee) + dj.Diagram(ParkingSpot) -``` - - -**Note:** The `[unique]` modifier is NOT visible in diagrams — the line is still dashed. Check the table definition to see the constraint. - -## Many-to-Many - -Use an association table with composite primary key: - - -```python -@schema -class Protocol(dj.Lookup): - definition = """ - protocol_id : varchar(16) - --- - protocol_name : varchar(100) - """ - contents = [ - {'protocol_id': 'iacuc_01', 'protocol_name': 'Mouse Protocol'}, - {'protocol_id': 'iacuc_02', 'protocol_name': 'Rat Protocol'}, - ] - -@schema -class Assignment(dj.Manual): - definition = """ - -> Subject - -> Protocol - --- - assigned_date : date - """ - -dj.Diagram(Subject) + dj.Diagram(Protocol) + dj.Diagram(Assignment) -``` - - -Two **thin solid lines** converge into `Assignment`. Each subject-protocol combination appears at most once. - -Notice `Assignment` is **not underlined** — it doesn't introduce a new dimension, just combines existing ones. - -## Hierarchies - -Cascading one-to-many relationships create tree structures: - - -```python -# Already defined: Subject -> Session -> Trial -# Show the full hierarchy -dj.Diagram(Subject) + dj.Diagram(Session) + dj.Diagram(Trial) -``` - - -Primary keys cascade: Trial's key is `(subject_id, session_idx, trial_idx)`. - -All three tables are **underlined** — each introduces a dimension. - -## Part Tables - -Part tables use the `-> master` alias to reference their enclosing table: - - -```python -@schema -class Scan(dj.Manual): - definition = """ - -> Session - scan_idx : uint16 - --- - depth : float32 - """ - - class ROI(dj.Part): - definition = """ - -> master # References Scan's primary key - roi_idx : uint16 # Additional dimension - --- - x : float32 - y : float32 - """ - -dj.Diagram(Session) + dj.Diagram(Scan) + dj.Diagram(Scan.ROI) -``` - - -`-> master` is the standard way to declare the foreign key to the enclosing table. It references the master's primary key. 
- -Notice: -- `Scan` is **underlined** (introduces `scan_idx`) -- `Scan.ROI` is **underlined** (introduces `roi_idx`) — Part tables CAN introduce dimensions - -## Renamed Foreign Keys - -Reference the same table multiple times with `.proj()` to rename attributes: - - -```python -@schema -class Comparison(dj.Manual): - definition = """ - -> Session.proj(session_a='session_idx') - -> Session.proj(session_b='session_idx') - --- - similarity : float32 - """ - -dj.Diagram(Session) + dj.Diagram(Comparison) -``` - - -**Orange dots** indicate renamed foreign keys. Hover over them to see the projection expression. - -This creates attributes `session_a` and `session_b`, both referencing `Session.session_idx`. - -## Computed Dependencies - -Computed tables inherit keys from their dependencies: - - -```python -@schema -class TrialAnalysis(dj.Computed): - definition = """ - -> Trial - --- - score : float64 - """ - - def make(self, key): - self.insert1({**key, 'score': 0.95}) - -dj.Diagram(Trial) + dj.Diagram(TrialAnalysis) -``` - - -**Thick solid line** to a **red (Computed) table** that is **not underlined**. - -Computed tables never introduce dimensions — their primary key is entirely inherited from dependencies. - -## Full Schema View - - -```python -dj.Diagram(schema) -``` - - -## Schema as DAG - -DataJoint schemas form a directed acyclic graph (DAG). Foreign keys: - -- Define data relationships -- Prescribe workflow execution order -- Enable cascading deletes - -There are no cyclic dependencies—parent tables must always be populated before their children. - -## Summary - -| Pattern | Declaration | Line Style | Dimensions | -|---------|-------------|------------|------------| -| One-to-one | FK is entire PK | Thick solid | No new dimension | -| One-to-many (contain) | FK + other attrs in PK | Thin solid | Usually new dimension | -| One-to-many (ref) | FK in secondary | Dashed | Independent dimensions | -| Many-to-many | Two FKs in PK | Two thin solid | No new dimension | -| Part table | `-> master` | Thin solid | May introduce dimension | - -## See Also - -- [Define Tables](define-tables.md) — Table definition syntax -- [Design Primary Keys](design-primary-keys.md) — Key selection strategies -- [Read Diagrams](read-diagrams.ipynb) — Diagram notation reference -- [Delete Data](delete-data.md) — Cascade behavior - - -```python -schema.drop(prompt=False) -``` - - ---- -## File: how-to/monitor-progress.md - -# Monitor Progress - -Track computation progress and job status. 
- -## Progress Display - -Show progress bar during populate: - -```python -ProcessedData.populate(display_progress=True) -``` - -## Check Remaining Work - -Count entries left to compute: - -```python -# What's left to compute -remaining = ProcessedData.key_source - ProcessedData -print(f"{len(remaining)} entries remaining") -``` - -## Job Status Summary - -Get counts by status: - -```python -progress = ProcessedData.jobs.progress() -# {'pending': 100, 'reserved': 5, 'error': 3, 'success': 892} - -for status, count in progress.items(): - print(f"{status}: {count}") -``` - -## Filter Jobs by Status - -Access jobs by their current status: - -```python -# Pending jobs (waiting to run) -ProcessedData.jobs.pending - -# Currently running -ProcessedData.jobs.reserved - -# Failed jobs -ProcessedData.jobs.errors - -# Completed jobs (if keep_completed=True) -ProcessedData.jobs.completed - -# Skipped jobs -ProcessedData.jobs.ignored -``` - -## View Job Details - -Inspect specific jobs: - -```python -# All jobs for a key -(ProcessedData.jobs & key).fetch1() - -# Recent errors -ProcessedData.jobs.errors.to_dicts( - order_by='completed_time DESC', - limit=10 -) -``` - -## Worker Information - -See which workers are processing: - -```python -for job in ProcessedData.jobs.reserved.to_dicts(): - print(f"Key: {job}") - print(f"Host: {job['host']}") - print(f"PID: {job['pid']}") - print(f"Started: {job['reserved_time']}") -``` - -## Computation Timing - -Track how long jobs take: - -```python -# Average duration of completed jobs -completed = ProcessedData.jobs.completed.to_arrays('duration') -print(f"Average: {np.mean(completed):.1f}s") -print(f"Median: {np.median(completed):.1f}s") -``` - -## Enable Job Metadata - -Store timing info in computed tables: - -```python -import datajoint as dj - -dj.config.jobs.add_job_metadata = True -dj.config.jobs.keep_completed = True -``` - -This adds hidden attributes to computed tables: - -- `_job_start_time` — When computation began -- `_job_duration` — How long it took -- `_job_version` — Code version (if configured) - -## Simple Progress Script - -```python -import time -from my_pipeline import ProcessedData - -while True: - remaining, total = ProcessedData.progress() - - print(f"\rProgress: {total - remaining}/{total} ({(total - remaining) / total:.0%})", end='') - - if remaining == 0: - print("\nDone!") - break - - time.sleep(10) -``` - -For distributed mode with job tracking: - -```python -import time -from my_pipeline import ProcessedData - -while True: - status = ProcessedData.jobs.progress() - - print(f"\rPending: {status.get('pending', 0)} | " - f"Running: {status.get('reserved', 0)} | " - f"Done: {status.get('success', 0)} | " - f"Errors: {status.get('error', 0)}", end='') - - if status.get('pending', 0) == 0 and status.get('reserved', 0) == 0: - print("\nDone!") - break - - time.sleep(10) -``` - -## Pipeline-Wide Status - -Check multiple tables: - -```python -tables = [RawData, ProcessedData, Analysis] - -for table in tables: - total = len(table.key_source) - done = len(table()) - print(f"{table.__name__}: {done}/{total} ({done/total:.0%})") -``` - -## See Also - -- [Run Computations](run-computations.md) — Basic populate usage -- [Distributed Computing](distributed-computing.md) — Multi-worker setup -- [Handle Errors](handle-errors.md) — Error recovery - - ---- -## File: how-to/object-storage-overview.md - -# Object Storage Overview - -Navigate DataJoint's object storage documentation to find what you need. 
- -## Quick Navigation by Task - -**I want to...** - -| Task | Guide | Est. Time | -|------|-------|-----------| -| ✅ Decide which storage type to use | [Choose a Storage Type](choose-storage-type.md) | 5-10 min | -| ✅ Set up S3, MinIO, or file storage | [Configure Object Storage](configure-storage.md) | 10-15 min | -| ✅ Store and retrieve large data | [Use Object Storage](use-object-storage.md) | 15-20 min | -| ✅ Work with NumPy arrays efficiently | [Use NPY Codec](use-npy-codec.md) | 10 min | -| ✅ Create domain-specific types | [Create Custom Codec](create-custom-codec.md) | 30-45 min | -| ✅ Optimize storage performance | [Manage Large Data](manage-large-data.md) | 20 min | -| ✅ Clean up unused data | [Garbage Collection](garbage-collection.md) | 10 min | - -## Conceptual Understanding - -**Why does DataJoint have object storage?** - -Traditional databases excel at structured, relational data but struggle with large arrays, files, and streaming data. DataJoint's **Object-Augmented Schema (OAS)** unifies relational tables with object storage into a single coherent system: - -- **Relational database:** Metadata, keys, relationships (structured data < 1 MB) -- **Object storage:** Arrays, files, datasets (large data > 1 MB) -- **Full referential integrity:** Maintained across both layers - -Read: [Object-Augmented Schemas](../explanation/data-pipelines.md#object-augmented-schemas) for conceptual overview. - -## Three Storage Modes - -### In-Table Storage (``) - -**What:** Data stored directly in database column -**When:** Small objects < 1 MB (JSON, thumbnails, small arrays) -**Why:** Fast access, transactional consistency, no store setup - -```python -metadata : # Stored in MySQL -``` - -**Guide:** [Use Object Storage](use-object-storage.md#in-table-vs-object-store) - ---- - -### Object Store (Integrated) - -**What:** DataJoint-managed storage in S3, file systems, or cloud storage -**When:** Large data (arrays, files, datasets) needing lifecycle management -**Why:** Deduplication, garbage collection, transaction safety, referential integrity - -**Two addressing schemes:** - -#### Hash-Addressed (``, ``) -- Content-based paths (MD5 hash) -- Automatic deduplication -- Best for: Write-once data, attachments - -```python -waveform : # Hash: _hash/{schema}/{hash} -document : # Hash: _hash/{schema}/{hash} -``` - -#### Schema-Addressed (``, ``) -- Key-based paths (browsable) -- Streaming access, partial reads -- Best for: Zarr, HDF5, large arrays - -```python -traces : # Schema: _schema/{schema}/{table}/{key}/ -volume : # Schema: _schema/{schema}/{table}/{key}/ -``` - -**Guides:** -- [Choose a Storage Type](choose-storage-type.md) — Decision criteria -- [Use Object Storage](use-object-storage.md) — How to use codecs - ---- - -### Filepath References (``) - -**What:** User-managed file paths (DataJoint stores path string only) -**When:** Existing data archives, externally managed files -**Why:** No file lifecycle management, no deduplication, user controls paths - -```python -raw_data : # User-managed path -``` - -**Guide:** [Use Object Storage](use-object-storage.md#filepath-references) - -## Documentation by Level - -### Getting Started - -1. **[Choose a Storage Type](choose-storage-type.md)** — Start here - - Quick decision tree (5 minutes) - - Size guidelines (< 1 MB, 1-100 MB, > 100 MB) - - Access pattern considerations - - Lifecycle management options - -2. 
**[Configure Object Storage](configure-storage.md)** — Setup - - File system, S3, MinIO configuration - - Single vs multiple stores - - Credentials management - - Store verification - -3. **[Use Object Storage](use-object-storage.md)** — Basic usage - - Insert/fetch patterns - - In-table vs object store - - Addressing schemes (hash vs schema) - - ObjectRef for lazy access - -### Intermediate - -4. **[Use NPY Codec](use-npy-codec.md)** — NumPy arrays - - Lazy loading (doesn't load until accessed) - - Efficient slicing (fetch subsets) - - Shape/dtype metadata - - When to use `` vs `` - -5. **[Manage Large Data](manage-large-data.md)** — Optimization - - Storage tiers (hot/warm/cold) - - Compression strategies - - Batch operations - - Performance tuning - -6. **[Garbage Collection](garbage-collection.md)** — Cleanup - - Automatic cleanup for integrated storage - - Manual cleanup for filepath references - - Orphan detection - - Recovery procedures - -### Advanced - -7. **[Create Custom Codec](create-custom-codec.md)** — Extensibility - - Domain-specific types - - Codec API (encode/decode) - - HashCodec vs SchemaCodec patterns - - Integration with existing formats - -## Technical Reference - -For implementation details and specifications: - -### Specifications - -- [Type System Spec](../reference/specs/type-system.md) — Three-layer architecture -- [Codec API Spec](../reference/specs/codec-api.md) — Custom codec interface -- [NPY Codec Spec](../reference/specs/npy-codec.md) — NumPy array storage -- [Object Store Configuration Spec](../reference/specs/object-store-configuration.md) — Store config details - -### Explanations - -- [Type System](../explanation/type-system.md) — Conceptual overview -- [Data Pipelines (OAS section)](../explanation/data-pipelines.md#object-augmented-schemas) — Why OAS exists -- [Custom Codecs](../explanation/custom-codecs.md) — Design patterns - -## Common Workflows - -### Workflow 1: Adding Object Storage to Existing Pipeline - -1. [Configure Object Storage](configure-storage.md) — Set up store -2. [Choose a Storage Type](choose-storage-type.md) — Select codec -3. Update table definitions with `@` modifier -4. [Use Object Storage](use-object-storage.md) — Insert/fetch patterns - -**Estimate:** 30-60 minutes - ---- - -### Workflow 2: Migrating from In-Table to Object Store - -1. [Choose a Storage Type](choose-storage-type.md) — Determine new codec -2. Add new column with object storage codec -3. Migrate data (see [Use Object Storage](use-object-storage.md#migration-patterns)) -4. Verify data integrity -5. Drop old column (see [Alter Tables](alter-tables.md)) - -**Estimate:** 1-2 hours for small datasets - ---- - -### Workflow 3: Working with Very Large Arrays (> 1 GB) - -1. Use `` or `` (not ``) -2. [Configure Object Storage](configure-storage.md) — Ensure adequate storage -3. For Zarr: Store as `` with `.zarr` extension -4. For streaming: Use `ObjectRef.fsmap` (see [Use Object Storage](use-object-storage.md#streaming-access)) - -**Key advantage:** No need to download full dataset into memory - ---- - -### Workflow 4: Building Custom Domain Types - -1. Read [Custom Codecs](../explanation/custom-codecs.md) — Understand patterns -2. [Create Custom Codec](create-custom-codec.md) — Implementation guide -3. [Codec API Spec](../reference/specs/codec-api.md) — Technical reference -4. Test with small dataset -5. Deploy to production - -**Estimate:** 2-4 hours for simple codecs - -## Decision Trees - -### "Which storage mode?" - -``` -Is data < 1 MB per row? 
-├─ YES → (in-table) -└─ NO → Continue... - -Is data managed externally? -├─ YES → (user-managed reference) -└─ NO → Continue... - -Need streaming or partial reads? -├─ YES → or (schema-addressed) -└─ NO → (hash-addressed, full download) -``` - -**Full guide:** [Choose a Storage Type](choose-storage-type.md) - ---- - -### "Which codec for object storage?" - -``` -NumPy arrays that benefit from lazy loading? -├─ YES → -└─ NO → Continue... - -Large files (> 100 MB) needing streaming? -├─ YES → -└─ NO → Continue... - -Write-once data with potential duplicates? -├─ YES → (deduplication via hashing) -└─ NO → or (choose based on access pattern) -``` - -**Full guide:** [Choose a Storage Type](choose-storage-type.md#storage-type-comparison) - -## Troubleshooting - -### Common Issues - -| Problem | Likely Cause | Solution Guide | -|---------|-------------|----------------| -| "Store not configured" | Missing stores config | [Configure Object Storage](configure-storage.md) | -| Out of memory loading array | Using `` for huge data | [Choose a Storage Type](choose-storage-type.md) → Use `` | -| Slow fetches | Wrong codec choice | [Manage Large Data](manage-large-data.md) | -| Data not deduplicated | Using wrong codec | [Choose a Storage Type](choose-storage-type.md#deduplication) | -| Path conflicts with reserved | `` using `_hash/` or `_schema/` | [Use Object Storage](use-object-storage.md#filepath-references) | -| Missing files after delete | Expected behavior for integrated storage | [Garbage Collection](garbage-collection.md) | - -### Getting Help - -- Check [FAQ](../explanation/faq.md) for common questions -- Search [GitHub Discussions](https://github.com/datajoint/datajoint-python/discussions) -- Review specification for exact behavior - -## See Also - -### Related Concepts -- [Type System](../explanation/type-system.md) — Three-layer type architecture -- [Data Pipelines](../explanation/data-pipelines.md) — Object-Augmented Schemas - -### Related How-Tos -- [Manage Secrets](manage-secrets.md) — Credentials for S3/cloud storage -- [Define Tables](define-tables.md) — Table definition syntax -- [Insert Data](insert-data.md) — Data insertion patterns - -### Related Tutorials -- [Object Storage Tutorial](../tutorials/basics/06-object-storage.ipynb) — Hands-on learning -- [Custom Codecs Tutorial](../tutorials/advanced/custom-codecs.ipynb) — Build your own codec - - ---- -## File: how-to/query-data.md - -# Query Data - -Filter, join, and transform data with DataJoint operators. 
- -## Restriction (`&`) - -Filter rows that match a condition: - -```python -# String condition -Session & "session_date > '2026-01-01'" -Session & "duration BETWEEN 30 AND 60" - -# Dictionary (exact match) -Session & {'subject_id': 'M001'} -Session & {'subject_id': 'M001', 'session_idx': 1} - -# Query expression -Session & Subject # Sessions for subjects in Subject -Session & (Subject & "sex = 'M'") # Sessions for male subjects - -# List (OR) -Session & [{'subject_id': 'M001'}, {'subject_id': 'M002'}] -``` - -## Top N Rows (`dj.Top`) - -Limit results with optional ordering: - -```python -# First 10 by primary key -Session & dj.Top(10) - -# Top 10 by date (descending) -Session & dj.Top(10, 'session_date DESC') - -# Pagination: skip 20, take 10 -Session & dj.Top(10, 'session_date DESC', offset=20) - -# All rows ordered -Session & dj.Top(None, 'session_date DESC') -``` - -Use `"KEY"` for primary key ordering, `"KEY DESC"` for reverse: - -```python -Session & dj.Top(10, 'KEY DESC') # Last 10 by primary key -``` - -## Anti-Restriction (`-`) - -Filter rows that do NOT match: - -```python -Subject - Session # Subjects without sessions -Session - {'subject_id': 'M001'} -``` - -## Projection (`.proj()`) - -Select, rename, or compute attributes: - -```python -# Primary key only -Subject.proj() - -# Specific attributes -Subject.proj('species', 'sex') - -# All attributes -Subject.proj(...) - -# All except some -Subject.proj(..., '-notes') - -# Rename -Subject.proj(animal_species='species') - -# Computed -Subject.proj(weight_kg='weight / 1000') -``` - -## Join (`*`) - -Combine tables on matching attributes: - -```python -Subject * Session -Subject * Session * Experimenter - -# Restrict then join -(Subject & "sex = 'M'") * Session -``` - -## Aggregation (`.aggr()`) - -Group and summarize: - -```python -# Count trials per session -Session.aggr(Session.Trial, n_trials='count(trial_idx)') - -# Multiple aggregates -Session.aggr( - Session.Trial, - n_trials='count(trial_idx)', - avg_rt='avg(reaction_time)', - min_rt='min(reaction_time)' -) - -# Exclude sessions without trials -Session.aggr(Session.Trial, n='count(trial_idx)', exclude_nonmatching=True) -``` - -## Universal Set (`dj.U()`) - -Group by arbitrary attributes: - -```python -# Unique values -dj.U('species') & Subject - -# Group by non-primary-key attribute -dj.U('session_date').aggr(Session, n='count(session_idx)') - -# Global aggregation (one row) -dj.U().aggr(Session, total='count(*)') -``` - -## Extension (`.extend()`) - -Add attributes without losing rows: - -```python -# Add experimenter info, keep all sessions -Session.extend(Experimenter) -``` - -## Chain Operations - -```python -result = ( - Subject - & "sex = 'M'" - * Session - & "duration > 30" -).proj('species', 'session_date', 'duration') -``` - -## Operator Precedence - -| Priority | Operator | Operation | -|----------|----------|-----------| -| Highest | `*` | Join | -| | `+`, `-` | Union, Anti-restriction | -| Lowest | `&` | Restriction | - -Use parentheses for clarity: - -```python -(Subject & condition) * Session # Restrict then join -Subject * (Session & condition) # Join then restrict -``` - -## View Query - -```python -# See generated SQL -print((Subject & condition).make_sql()) - -# Count rows without fetching -len(Subject & condition) -``` - -## See Also - -- [Operators Reference](../reference/operators.md) — Complete operator documentation -- [Fetch Results](fetch-results.md) — Retrieving query results - - ---- -## File: how-to/read-diagrams.ipynb - -# Read Schema Diagrams - 
-DataJoint diagrams visualize schema structure as directed acyclic graphs (DAGs). This guide teaches you to: - -- Interpret line styles and their semantic meaning -- Recognize dimensions (underlined vs non-underlined tables) -- Use diagram operations to explore large schemas -- Compare DataJoint notation to traditional ER diagrams - - -```python -import datajoint as dj - -schema = dj.Schema('howto_diagrams') -schema.drop(prompt=False) -schema = dj.Schema('howto_diagrams') -``` - - -## Quick Reference - -| Line Style | Relationship | Child's Primary Key | -|------------|--------------|---------------------| -| **Thick Solid** ━━━ | Extension | Parent PK only (one-to-one) | -| **Thin Solid** ─── | Containment | Parent PK + own fields (one-to-many) | -| **Dashed** ┄┄┄ | Reference | Own independent PK (one-to-many) | - -**Key principle:** Solid lines mean the parent's identity becomes part of the child's identity. Dashed lines mean the child maintains independent identity. - -## Thick Solid Line: Extension (One-to-One) - -The foreign key **is** the entire primary key. The child extends the parent. - - -```python -@schema -class Customer(dj.Manual): - definition = """ - customer_id : uint32 - --- - name : varchar(60) - """ - -@schema -class CustomerPreferences(dj.Manual): - definition = """ - -> Customer # FK is entire PK - --- - theme : varchar(20) - notifications : bool - """ - -dj.Diagram(Customer) + dj.Diagram(CustomerPreferences) -``` - - -**Equivalent ER Diagram:** - -ER One-to-One - -**DataJoint vs ER:** The thick solid line immediately shows this is one-to-one. In ER notation, you must read the crow's foot symbols (`||--o|`). - -**Note:** `CustomerPreferences` is **not underlined** — it exists in the Customer dimension space. - -## Thin Solid Line: Containment (One-to-Many) - -The foreign key is **part of** the primary key, with additional fields. - - -```python -@schema -class Account(dj.Manual): - definition = """ - -> Customer # Part of PK - account_num : uint16 # Additional PK field - --- - balance : decimal(10,2) - """ - -dj.Diagram(Customer) + dj.Diagram(Account) -``` - - -**Equivalent ER Diagram:** - -ER One-to-Many - -**DataJoint vs ER:** The thin solid line shows containment — accounts belong to customers. In ER, you see `||--o{` (one-to-many). - -**Note:** `Account` is **underlined** — it introduces the Account dimension. - -## Dashed Line: Reference (One-to-Many) - -The foreign key is a **secondary attribute** (below the `---` line). - - -```python -@schema -class Department(dj.Manual): - definition = """ - dept_id : uint16 - --- - dept_name : varchar(60) - """ - -@schema -class Employee(dj.Manual): - definition = """ - employee_id : uint32 # Own independent PK - --- - -> Department # Secondary attribute - employee_name : varchar(60) - """ - -dj.Diagram(Department) + dj.Diagram(Employee) -``` - - -**Equivalent ER Diagram:** - -ER Reference - -**DataJoint vs ER:** Both show one-to-many, but DataJoint's dashed line tells you immediately that Employee has independent identity. In ER, you must examine whether the FK is part of the PK. - -**Note:** Both tables are **underlined** — each introduces its own dimension. - -## Dimensions and Underlined Names - -A **dimension** is a new entity type introduced by a table that defines new primary key attributes. Each underlined table introduces exactly **one** dimension—even if it has multiple new PK attributes, together they identify one new entity type. 
- -| Visual | Meaning | -|--------|--------| -| **Underlined** | Introduces a new dimension (new entity type) | -| Not underlined | Exists in the space defined by dimensions from referenced tables | - -**Key rules:** -- Computed tables **never** introduce dimensions (always non-underlined) -- Part tables **can** introduce dimensions (may be underlined) - - -```python -@schema -class Subject(dj.Manual): - definition = """ - subject_id : varchar(16) # NEW dimension - --- - species : varchar(50) - """ - -@schema -class Session(dj.Manual): - definition = """ - -> Subject # Inherits subject_id - session_idx : uint16 # NEW dimension - --- - session_date : date - """ - -@schema -class SessionQC(dj.Computed): - definition = """ - -> Session # Inherits both, adds nothing - --- - passed : bool - """ - def make(self, key): - self.insert1({**key, 'passed': True}) - -dj.Diagram(schema) -``` - - -In this diagram: -- `Subject` is **underlined** — introduces the Subject dimension -- `Session` is **underlined** — introduces the Session dimension (within each Subject) -- `SessionQC` is **not underlined** — exists in the Session dimension space, adds no new dimension - -**Why this matters:** Dimensions determine [attribute lineage](../explanation/entity-integrity.md#dimensions-and-attribute-lineage). Primary key attributes trace back to the dimension where they originated, enabling [semantic matching](../reference/specs/semantic-matching.md) for safe joins. - -## Many-to-Many: Converging Lines - -Many-to-many relationships appear as tables with multiple solid lines converging. - - -```python -@schema -class Student(dj.Manual): - definition = """ - student_id : uint32 - --- - name : varchar(60) - """ - -@schema -class Course(dj.Manual): - definition = """ - course_code : char(8) - --- - title : varchar(100) - """ - -@schema -class Enrollment(dj.Manual): - definition = """ - -> Student - -> Course - --- - grade : enum('A','B','C','D','F') - """ - -dj.Diagram(Student) + dj.Diagram(Course) + dj.Diagram(Enrollment) -``` - - -**Equivalent ER Diagram:** - -ER Many-to-Many - -**DataJoint vs ER:** Both show the association table pattern. DataJoint's converging solid lines immediately indicate the composite primary key. - -**Note:** `Enrollment` is **not underlined** — it exists in the space defined by Student × Course dimensions. - -## Orange Dots: Renamed Foreign Keys - -When referencing the same table multiple times, use `.proj()` to rename. **Orange dots** indicate renamed FKs. - - -```python -@schema -class Person(dj.Manual): - definition = """ - person_id : uint32 - --- - name : varchar(60) - """ - -@schema -class Marriage(dj.Manual): - definition = """ - marriage_id : uint32 - --- - -> Person.proj(spouse1='person_id') - -> Person.proj(spouse2='person_id') - marriage_date : date - """ - -dj.Diagram(Person) + dj.Diagram(Marriage) -``` - - -The orange dots between `Person` and `Marriage` indicate that projections renamed the foreign key attributes (`spouse1` and `spouse2` both reference `person_id`). - -**Tip:** In Jupyter, hover over orange dots to see the projection expression. 
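
A sketch of how these renamed references are used in queries: the same projections resolve each foreign key back to `Person` (the `name1`/`name2` aliases are arbitrary):


```python
# Resolve both spouses' names for each marriage
spouses = (
    Marriage
    * Person.proj(spouse1='person_id', name1='name')
    * Person.proj(spouse2='person_id', name2='name')
)
spouses.to_dicts()
```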
- -## Diagram Operations - -Filter and combine diagrams to explore large schemas: - - -```python -# Entire schema -dj.Diagram(schema) -``` - - - -```python -# Session and 1 level upstream (dependencies) -dj.Diagram(Session) - 1 -``` - - - -```python -# Subject and 2 levels downstream (dependents) -dj.Diagram(Subject) + 2 -``` - - -**Operation Reference:** - -| Operation | Meaning | -|-----------|--------| -| `dj.Diagram(schema)` | Entire schema | -| `dj.Diagram(Table) - N` | Table + N levels upstream | -| `dj.Diagram(Table) + N` | Table + N levels downstream | -| `D1 + D2` | Union of two diagrams | -| `D1 * D2` | Intersection (common nodes) | - -**Finding paths:** Use intersection to find connection paths: -```python -(dj.Diagram(upstream) + 100) * (dj.Diagram(downstream) - 100) -``` - -## What Diagrams Don't Show - -Diagrams do **NOT** show these FK modifiers: - -| Modifier | Effect | Must Check Definition | -|----------|--------|----------------------| -| `[nullable]` | Optional reference | `-> [nullable] Parent` | -| `[unique]` | One-to-one on secondary FK | `-> [unique] Parent` | - -A dashed line could be any of: -- Required one-to-many (default) -- Optional one-to-many (`[nullable]`) -- Required one-to-one (`[unique]`) -- Optional one-to-one (`[nullable, unique]`) - -**Always check the table definition** to see modifiers. - -## DataJoint vs Traditional ER Notation - -| Feature | Chen's ER | Crow's Foot | DataJoint | -|---------|-----------|-------------|----------| -| Cardinality | Numbers | Line symbols | **Line style** | -| Direction | None | None | **Top-to-bottom** | -| Cycles | Allowed | Allowed | **Not allowed** | -| PK cascade | Not shown | Not shown | **Solid lines** | -| Identity sharing | Not indicated | Not indicated | **Thick solid** | -| New dimensions | Not indicated | Not indicated | **Underlined** | - -**Why DataJoint differs:** - -1. **DAG structure** — No cycles means schemas read as workflows (top-to-bottom) -2. **Line semantics** — Immediately reveals relationship type -3. **Executable** — Diagram is generated from schema, cannot drift out of sync - -## Summary - -| Visual | Meaning | -|--------|--------| -| **Thick solid** | One-to-one extension | -| **Thin solid** | One-to-many containment | -| **Dashed** | Reference (independent identity) | -| **Underlined** | Introduces new dimension | -| **Orange dots** | Renamed FK via `.proj()` | -| **Colors** | Green=Manual, Gray=Lookup, Red=Computed, Blue=Imported | - -## Related - -- [Entity Integrity: Dimensions](../explanation/entity-integrity.md#schema-dimensions) -- [Semantic Matching](../reference/specs/semantic-matching.md) -- [Schema Design Tutorial](../tutorials/basics/02-schema-design.ipynb) - - -```python -schema.drop(prompt=False) -``` - - ---- -## File: how-to/run-computations.md - -# Run Computations - -Execute automated computations with `populate()`. 
- -## Basic Usage - -```python -# Populate all missing entries -ProcessedData.populate() - -# With progress display -ProcessedData.populate(display_progress=True) -``` - -## Restrict What to Compute - -```python -# Only specific subjects -ProcessedData.populate(Subject & "sex = 'M'") - -# Only recent sessions -ProcessedData.populate(Session & "session_date > '2026-01-01'") - -# Specific key -ProcessedData.populate({'subject_id': 'M001', 'session_idx': 1}) -``` - -## Limit Number of Jobs - -```python -# Process at most 100 entries -ProcessedData.populate(limit=100) -``` - -## Error Handling - -```python -# Continue on errors (log but don't stop) -ProcessedData.populate(suppress_errors=True) - -# Check what failed -failed = ProcessedData.jobs & 'status = "error"' -print(failed) - -# Clear errors to retry -failed.delete() -ProcessedData.populate() -``` - -## When to Use Distributed Mode - -Choose your populate strategy based on your workload and infrastructure: - -### Use `populate()` (Default) When: - -✅ **Single worker** - Only one process computing at a time -✅ **Fast computations** - Each make() completes in < 1 minute -✅ **Small job count** - Processing < 100 entries -✅ **Development/testing** - Iterating on make() logic - -**Advantages:** -- Simplest approach (no job management overhead) -- Immediate execution (no reservation delay) -- Easy debugging (errors stop execution) - -**Example:** -```python -# Simple, direct execution -ProcessedData.populate() -``` - ---- - -### Use `populate(reserve_jobs=True)` When: - -✅ **Multiple workers** - Running on multiple machines or processes -✅ **Long computations** - Each make() takes > 1 minute -✅ **Production pipelines** - Need fault tolerance and monitoring -✅ **Worker crashes expected** - Jobs can be resumed - -**Advantages:** -- Prevents duplicate work between workers -- Fault tolerance (crashed jobs can be retried) -- Job status tracking (`ProcessedData.jobs`) -- Error isolation (one failure doesn't stop others) - -**Example:** -```python -# Distributed mode with job coordination -ProcessedData.populate(reserve_jobs=True) -``` - -**Job reservation overhead:** ~100ms per job -**Worth it when:** Computations take > 10 seconds each - ---- - -### Use `populate(reserve_jobs=True, processes=N)` When: - -✅ **Multi-core machine** - Want to use all CPU cores -✅ **CPU-bound tasks** - Computations are CPU-intensive, not I/O -✅ **Independent computations** - No shared state between jobs - -**Advantages:** -- Parallel execution on single machine -- No network coordination needed -- Combines job safety with parallelism - -**Example:** -```python -# Use 4 CPU cores -ProcessedData.populate(reserve_jobs=True, processes=4) -``` - -**Caution:** Don't use more processes than CPU cores (causes context switching overhead) - ---- - -## Decision Tree - -``` -How many workers? -├─ One → populate() -└─ Multiple → Continue... - -How long per computation? -├─ < 1 minute → populate() (overhead not worth it) -└─ > 1 minute → Continue... - -Need fault tolerance? -├─ Yes → populate(reserve_jobs=True) -└─ No → populate() (simpler) - -Multiple cores on one machine? 
-└─ Yes → populate(reserve_jobs=True, processes=N) -``` - -## Distributed Computing - -For multi-worker coordination: - -```python -# Worker 1 (on machine A) -ProcessedData.populate(reserve_jobs=True) - -# Worker 2 (on machine B) -ProcessedData.populate(reserve_jobs=True) - -# Workers coordinate automatically via database -# Each reserves different keys, no duplicates -``` - -## Check Progress - -```python -# What's left to compute -remaining = ProcessedData.key_source - ProcessedData -print(f"{len(remaining)} entries remaining") - -# View job status -ProcessedData.jobs -``` - -## The `make()` Method - -```python -@schema -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - result : float64 - """ - - def make(self, key): - # 1. Fetch input data - raw = (RawData & key).fetch1('data') - - # 2. Compute - result = process(raw) - - # 3. Insert - self.insert1({**key, 'result': result}) -``` - -## Three-Part Make for Long Computations - -For computations taking hours or days: - -```python -@schema -class LongComputation(dj.Computed): - definition = """ - -> RawData - --- - result : float64 - """ - - def make_fetch(self, key): - """Fetch input data (outside transaction)""" - data = (RawData & key).fetch1('data') - return (data,) - - def make_compute(self, key, fetched): - """Perform computation (outside transaction)""" - (data,) = fetched - result = expensive_computation(data) - return (result,) - - def make_insert(self, key, fetched, computed): - """Insert results (inside brief transaction)""" - (result,) = computed - self.insert1({**key, 'result': result}) -``` - -## Custom Key Source - -```python -@schema -class FilteredComputation(dj.Computed): - definition = """ - -> RawData - --- - result : float64 - """ - - @property - def key_source(self): - # Only compute for high-quality data - return (RawData & 'quality > 0.8') - self -``` - -## Populate Options - -| Option | Default | Description | -|--------|---------|-------------| -| `restriction` | `None` | Filter what to compute | -| `limit` | `None` | Max entries to process | -| `display_progress` | `False` | Show progress bar | -| `reserve_jobs` | `False` | Reserve jobs for distributed computing | -| `suppress_errors` | `False` | Continue on errors | - -## See Also - -- [Computation Model](../explanation/computation-model.md) — How computation works -- [Distributed Computing](distributed-computing.md) — Multi-worker setup -- [Handle Errors](handle-errors.md) — Error recovery - - ---- -## File: how-to/use-cli.md - -# Use the Command-Line Interface - -Start an interactive Python REPL with DataJoint pre-loaded. - -The `dj` command provides quick access to DataJoint for exploring schemas, running queries, and testing connections without writing scripts. - -## Start the REPL - -```bash -dj -``` - -This opens a Python REPL with `dj` (DataJoint) already imported: - -``` -DataJoint 2.0.0 REPL -Type 'dj.' and press Tab for available functions. - ->>> dj.conn() # Connect to database ->>> dj.list_schemas() # List available schemas -``` - -## Specify Database Credentials - -Override config file settings from the command line: - -```bash -dj --host localhost:3306 --user root --password secret -``` - -| Option | Description | -|--------|-------------| -| `--host HOST` | Database host as `host:port` | -| `-u`, `--user USER` | Database username | -| `-p`, `--password PASS` | Database password | - -Credentials from command-line arguments override values in config files. 
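For example, suppose your `datajoint.json` contains the following (a minimal sketch; hostnames and usernames are placeholders):

```json
{
  "database.host": "db1.example.com",
  "database.port": 3306,
  "database.user": "analyst"
}
```

Running `dj --host db2.example.com:3306 --user admin` then connects to `db2.example.com` as `admin`, while any settings not overridden on the command line still come from the file.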
- -## Load Schemas as Virtual Modules - -Load database schemas directly into the REPL namespace: - -```bash -dj -s my_lab:lab -s my_analysis:analysis -``` - -The format is `schema_name:alias` where: -- `schema_name` is the database schema name -- `alias` is the variable name in the REPL - -This outputs: - -``` -DataJoint 2.0.0 REPL -Type 'dj.' and press Tab for available functions. - -Loaded schemas: - lab -> my_lab - analysis -> my_analysis - ->>> lab.Subject.to_dicts() # Query Subject table ->>> dj.Diagram(lab.schema) # View schema diagram -``` - -## Common Workflows - -### Explore an Existing Schema - -```bash -dj -s production_db:db -``` - -```python ->>> list(db.schema) # List all tables ->>> db.Experiment().to_dicts()[:5] # Preview data ->>> dj.Diagram(db.schema) # Visualize structure -``` - -### Quick Data Check - -```bash -dj --host db.example.com -s my_lab:lab -``` - -```python ->>> len(lab.Session()) # Count sessions ->>> lab.Session.describe() # Show table definition -``` - -### Test Connection - -```bash -dj --host localhost:3306 --user testuser --password testpass -``` - -```python ->>> dj.conn() # Verify connection works ->>> dj.list_schemas() # Check accessible schemas -``` - -## Version Information - -Display DataJoint version: - -```bash -dj --version -``` - -## Help - -Display all options: - -```bash -dj --help -``` - -## Entry Points - -The CLI is available as both `dj` and `datajoint`: - -```bash -dj --version -datajoint --version # Same command -``` - -## Programmatic Usage - -The CLI function can also be called from Python: - -```python -from datajoint.cli import cli - -# Show version and exit -cli(["--version"]) - -# Start REPL with schemas -cli(["-s", "my_lab:lab"]) -``` - - ---- -## File: how-to/use-npy-codec.md - -# Use the `` Codec - -Store NumPy arrays with lazy loading and metadata access. - -## Overview - -The `` codec stores NumPy arrays as portable `.npy` files in object storage. On fetch, you get an `NpyRef` that provides metadata without downloading. - -**Key benefits:** -- Access shape, dtype, size without I/O -- Lazy loading - download only when needed -- Memory mapping - random access to large arrays -- Safe bulk fetch - inspect before downloading -- Portable `.npy` format - -## Quick Start - -### 1. Configure a Store - -```python -import datajoint as dj - -# Add store configuration -dj.config.object_storage.stores['mystore'] = { - 'protocol': 's3', - 'endpoint': 'localhost:9000', - 'bucket': 'my-bucket', - 'access_key': 'access_key', - 'secret_key': 'secret_key', - 'location': 'data', -} -``` - -Or in `datajoint.json`: -```json -{ - "object_storage": { - "stores": { - "mystore": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-bucket", - "location": "data" - } - } - } -} -``` - -### 2. Define Table with `` - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : int32 - --- - waveform : - """ -``` - -### 3. Insert Arrays - -```python -import numpy as np - -Recording.insert1({ - 'recording_id': 1, - 'waveform': np.random.randn(1000, 32), -}) -``` - -### 4. 
Fetch with Lazy Loading - -```python -# Returns NpyRef, not array -ref = (Recording & 'recording_id=1').fetch1('waveform') - -# Metadata without download -print(ref.shape) # (1000, 32) -print(ref.dtype) # float64 - -# Load when ready -arr = ref.load() -``` - -## NpyRef Reference - -### Metadata Properties (No I/O) - -```python -ref.shape # Tuple of dimensions -ref.dtype # NumPy dtype -ref.ndim # Number of dimensions -ref.size # Total elements -ref.nbytes # Total bytes -ref.path # Storage path -ref.store # Store name -ref.is_loaded # Whether data is cached -``` - -### Loading Methods - -```python -# Explicit load (recommended) -arr = ref.load() - -# Via NumPy functions (auto-loads) -mean = np.mean(ref) -std = np.std(ref, axis=0) - -# Via conversion (auto-loads) -arr = np.asarray(ref) - -# Indexing (loads then indexes) -first_row = ref[0] -snippet = ref[100:200, :] -``` - -### Memory Mapping - -For large arrays, use `mmap_mode` to access data without loading it all into memory: - -```python -# Memory-mapped loading (random access) -arr = ref.load(mmap_mode='r') - -# Only reads the portion you access -slice = arr[1000:2000, :] # Efficient for large arrays -``` - -**Modes:** -- `'r'` - Read-only (recommended) -- `'r+'` - Read-write -- `'c'` - Copy-on-write (changes not saved) - -**Performance:** -- Local filesystem stores: mmaps directly (no copy) -- Remote stores (S3): downloads to cache first, then mmaps - -## Common Patterns - -### Bulk Fetch with Filtering - -```python -# Fetch all - returns NpyRefs, not arrays -results = MyTable.to_dicts() - -# Filter by metadata (no downloads) -large = [r for r in results if r['data'].shape[0] > 1000] - -# Load only what you need -for rec in large: - arr = rec['data'].load() - process(arr) -``` - -### Computed Tables - -```python -@schema -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - result : - """ - - def make(self, key): - # Fetch lazy reference - ref = (RawData & key).fetch1('raw') - - # NumPy functions auto-load - result = np.fft.fft(ref, axis=1) - - self.insert1({**key, 'result': result}) -``` - -### Memory-Efficient Processing - -```python -# Process recordings one at a time -for key in Recording.keys(): - ref = (Recording & key).fetch1('data') - - # Check size before loading - if ref.nbytes > 1e9: # > 1 GB - print(f"Skipping large recording: {ref.nbytes/1e9:.1f} GB") - continue - - process(ref.load()) -``` - -## Comparison with `` - -| Aspect | `` | `` | -|--------|----------|----------| -| **On fetch** | NpyRef (lazy) | Array (eager) | -| **Metadata access** | Without download | Must download | -| **Memory mapping** | Yes, via `mmap_mode` | No | -| **Addressing** | Schema-addressed | Hash-addressed | -| **Deduplication** | No | Yes | -| **Format** | `.npy` (portable) | DJ blob (Python) | -| **Best for** | Large arrays, lazy loading | Small arrays, dedup | - -### When to Use Each - -**Use `` when:** -- Arrays are large (> 10 MB) -- You need to inspect shape/dtype before loading -- Fetching many rows but processing few -- Random access to slices of very large arrays (memory mapping) -- Interoperability matters (non-Python tools) - -**Use `` when:** -- Arrays are small (< 10 MB) -- Same arrays appear in multiple rows (deduplication) -- Storing non-array Python objects (dicts, lists) - -## Supported Array Types - -The `` codec supports any NumPy array except object dtype: - -```python -# Supported -np.array([1, 2, 3], dtype=np.int32) # Integer -np.array([1.0, 2.0], dtype=np.float64) # Float -np.array([True, False], 
dtype=np.bool_) # Boolean -np.array([1+2j, 3+4j], dtype=np.complex128) # Complex -np.zeros((10, 10, 10)) # N-dimensional -np.array(42) # 0-dimensional scalar - -# Structured arrays -dt = np.dtype([('x', np.float64), ('y', np.float64)]) -np.array([(1.0, 2.0), (3.0, 4.0)], dtype=dt) - -# NOT supported -np.array([{}, []], dtype=object) # Object dtype -``` - -## Troubleshooting - -### "Store not configured" - -Ensure your store is configured before using ``: - -```python -dj.config.object_storage.stores['store'] = {...} -``` - -### "requires @ (store only)" - -The `` codec requires the `@` modifier: - -```python -# Wrong -data : - -# Correct -data : -data : -``` - -### Memory issues with large arrays - -Use lazy loading or memory mapping to control memory: - -```python -# Check size before loading -if ref.nbytes > available_memory: - # Use memory mapping for random access - arr = ref.load(mmap_mode='r') - # Process in chunks - for i in range(0, len(arr), chunk_size): - process(arr[i:i+chunk_size]) -else: - arr = ref.load() -``` - -## See Also - -- [Use Object Storage](use-object-storage.md) - Complete storage guide -- [Configure Object Storage](configure-storage.md) - Store setup -- [`` Codec Specification](../reference/specs/npy-codec.md) - Full spec - - ---- -## File: how-to/use-object-storage.md - -# Use Object Storage - -Store large data objects as part of your Object-Augmented Schema. - -## Object-Augmented Schema (OAS) - -An **Object-Augmented Schema** extends relational tables with object storage as a unified system. The relational database stores metadata, references, and small values while large objects (arrays, files, datasets) are stored in object storage. DataJoint maintains referential integrity across both storage layers—when you delete a row, its associated objects are cleaned up automatically. - -OAS supports two addressing schemes: - -| Addressing | Location | Path Derived From | Object Type | Use Case | -|------------|----------|-------------------|-------------|----------| -| **Hash-addressed** | Object store | Content hash (MD5) | Individual/atomic | Single blobs, single files, attachments (with deduplication) | -| **Schema-addressed** | Object store | Schema structure | Complex/multi-part | Zarr arrays, HDF5 datasets, multi-file objects (browsable paths) | - -**Key distinction:** -- **Hash-addressed** (``, ``) stores individual, atomic objects - one object per field -- **Schema-addressed** (``, ``) can store complex, multi-part objects like Zarr (directory structures with multiple files) - -Data can also be stored **in-table** directly in the database column (no `@` modifier). - -For complete details, see the [Type System specification](../reference/specs/type-system.md). 
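As a conceptual sketch of that lifecycle guarantee, assume a `Recording` table with a `recording_id` key and an object-stored `raw_data` attribute (similar to the example in the next section); whether cleanup happens immediately or through a later garbage-collection pass depends on your store configuration:

```python
import uuid
import numpy as np

key = {'recording_id': uuid.uuid4()}

# The large array is routed to the configured object store;
# the database row holds only a reference to it
Recording.insert1({**key, 'raw_data': np.random.randn(32, 30000)})

# Deleting the row removes the reference and makes the stored object
# eligible for cleanup, keeping the database and the store consistent
(Recording & key).delete()
```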
- -## When to Use Object Storage - -Use the `@` modifier for: - -- Large arrays (images, videos, neural recordings) -- File attachments -- Zarr arrays and HDF5 files -- Any data too large for efficient database storage - -## In-Table vs Object Store - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid - --- - metadata : # In-table: stored in database column - raw_data : # Object store: hash-addressed - waveforms : # Object store: schema-addressed (lazy) - """ -``` - -| Syntax | Storage | Best For | -|--------|---------|----------| -| `` | Database | Small Python objects (typically < 1-10 MB) | -| `` | Database | Small files with filename (typically < 1-10 MB) | -| `` | Default store | Large Python objects (hash-addressed, with dedup) | -| `` | Default store | Large files with filename (hash-addressed, with dedup) | -| `` | Default store | NumPy arrays (schema-addressed, lazy, navigable) | -| `` | Named store | Specific storage tier | - -## Store Data - -Insert works the same regardless of storage location: - -```python -import numpy as np - -Recording.insert1({ - 'recording_id': uuid.uuid4(), - 'metadata': {'channels': 32, 'rate': 30000}, - 'raw_data': np.random.randn(32, 30000) # ~7.7 MB array -}) -``` - -DataJoint automatically routes to the configured store. - -## Retrieve Data - -Fetch works transparently: - -```python -data = (Recording & key).fetch1('raw_data') -# Returns the numpy array, regardless of where it was stored -``` - -## Named Stores - -Use different stores for different data types: - -```python -@schema -class Experiment(dj.Manual): - definition = """ - experiment_id : uuid - --- - raw_video : # Fast local storage - processed : # S3 for long-term - """ -``` - -Configure stores in `datajoint.json`: - -```json -{ - "stores": { - "default": "raw", - "raw": { - "protocol": "file", - "location": "/fast/storage" - }, - "archive": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive", - "location": "project-data" - } - } -} -``` - -## Hash-Addressed Storage - -`` and `` use **hash-addressed** storage: - -- Objects are stored by their content hash (MD5) -- Identical data is stored once (automatic deduplication) -- Multiple rows can reference the same object -- Immutable—changing data creates a new object - -```python -# These two inserts store the same array only once -data = np.zeros((1000, 1000)) -Table.insert1({'id': 1, 'array': data}) -Table.insert1({'id': 2, 'array': data}) # References same object -``` - -## Schema-Addressed Storage - -`` and `` use **schema-addressed** storage: - -- Objects stored at paths that mirror database schema: `{schema}/{table}/{pk}/{attribute}.npy` -- Browsable organization in object storage -- One object per entity (no deduplication) -- Supports lazy loading with metadata access - -```python -@schema -class Dataset(dj.Manual): - definition = """ - dataset_id : uuid - --- - zarr_array : # Zarr array stored by path - """ -``` - -Use path-addressed storage for: - -- Zarr arrays (chunked, appendable) -- HDF5 files -- Large datasets requiring streaming access - -## Write Directly to Object Storage - -For large datasets like multi-GB imaging recordings, avoid intermediate copies by writing directly to object storage with `staged_insert1`: - -```python -import zarr - -@schema -class ImagingSession(dj.Manual): - definition = """ - subject_id : int32 - session_id : int32 - --- - n_frames : int32 - frame_rate : float32 - frames : - """ - -# Write Zarr directly to object storage -with 
ImagingSession.staged_insert1 as staged: - # 1. Set primary key values first - staged.rec['subject_id'] = 1 - staged.rec['session_id'] = 1 - - # 2. Get storage handle - store = staged.store('frames', '.zarr') - - # 3. Write directly (no local copy) - z = zarr.open(store, mode='w', shape=(1000, 512, 512), - chunks=(10, 512, 512), dtype='uint16') - for i in range(1000): - z[i] = acquire_frame() # Write frame-by-frame - - # 4. Set remaining attributes - staged.rec['n_frames'] = 1000 - staged.rec['frame_rate'] = 30.0 - -# Record inserted with computed metadata on successful exit -``` - -The `staged_insert1` context manager: - -- Writes directly to the object store (no intermediate files) -- Computes metadata (size, manifest) automatically on exit -- Cleans up storage if an error occurs (atomic) -- Requires primary key values before calling `store()` or `open()` - -Use `staged.store(field, ext)` for FSMap access (Zarr), or `staged.open(field, ext)` for file-like access. - -## Attachments - -Preserve original filenames with ``: - -```python -@schema -class Document(dj.Manual): - definition = """ - doc_id : uuid - --- - report : # Preserves filename - """ - -# Insert with AttachFileType -from datajoint import AttachFileType -Document.insert1({ - 'doc_id': uuid.uuid4(), - 'report': AttachFileType('/path/to/report.pdf') -}) -``` - -## NumPy Arrays with `` - -The `` codec stores NumPy arrays as portable `.npy` files with lazy loading: - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : int32 - --- - waveform : # NumPy array, schema-addressed - """ - -# Insert - just pass the array -Recording.insert1({ - 'recording_id': 1, - 'waveform': np.random.randn(1000, 32), -}) - -# Fetch returns NpyRef (lazy) -ref = (Recording & 'recording_id=1').fetch1('waveform') -``` - -### NpyRef: Lazy Array Reference - -`NpyRef` provides metadata without downloading: - -```python -ref = (Recording & key).fetch1('waveform') - -# Metadata access - NO download -ref.shape # (1000, 32) -ref.dtype # float64 -ref.nbytes # 256000 -ref.is_loaded # False - -# Explicit loading -arr = ref.load() # Downloads and caches -ref.is_loaded # True - -# Numpy integration (triggers download) -result = np.mean(ref) # Uses __array__ protocol -result = np.asarray(ref) + 1 # Convert then operate -``` - -### Bulk Fetch Safety - -Fetching many rows doesn't download until you access each array: - -```python -# Fetch 1000 recordings - NO downloads yet -results = Recording.to_dicts() - -# Inspect metadata without downloading -for rec in results: - ref = rec['waveform'] - if ref.shape[0] > 500: # Check without download - process(ref.load()) # Download only what you need -``` - -## Lazy Loading with ObjectRef - -`` and `` return lazy references: - -```python -ref = (Dataset & key).fetch1('zarr_array') - -# Open for streaming access -with ref.open() as f: - data = zarr.open(f) - -# Or download to local path -local_path = ref.download('/tmp/data') -``` - -## Storage Best Practices - -### Choose the Right Codec - -| Data Type | Codec | Addressing | Lazy | Best For | -|-----------|-------|------------|------|----------| -| NumPy arrays | `` | Schema | Yes | Arrays needing lazy load, metadata inspection | -| Python objects | `` or `` | In-table or Hash | No | Dicts, lists, arrays (use `@` for large/dedup) | -| File attachments | `` or `` | In-table or Hash | No | Files with filename preserved (use `@` for large/dedup) | -| Zarr/HDF5 | `` | Schema | Yes | Chunked arrays, streaming access | -| File references | `` | External | 
Yes | References to external files | - -### Size Guidelines - -**Technical limits:** -- **MySQL**: In-table blobs up to 4 GiB (`LONGBLOB`) -- **PostgreSQL**: In-table blobs unlimited (`BYTEA`) - -**Practical recommendations** (consider accessibility, cost, performance): -- **< 1-10 MB**: In-table storage (``) often sufficient -- **10-100 MB**: Object store (`` with dedup, or `` for arrays) -- **> 100 MB**: Schema-addressed (``, ``) for streaming and lazy loading - -### Store Tiers - -Configure stores for different access patterns: - -```json -{ - "stores": { - "default": "hot", - "hot": { - "protocol": "file", - "location": "/ssd/data" - }, - "warm": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "project-data", - "location": "active" - }, - "cold": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive", - "location": "long-term" - } - } -} -``` - -## See Also - -- [Configure Object Storage](configure-storage.md) — Storage setup -- [Create Custom Codecs](create-custom-codec.md) — Domain-specific types -- [Manage Large Data](manage-large-data.md) — Working with blobs - - -============================================================ -# Reference -============================================================ - - ---- -## File: reference/configuration.md - -# Configuration Reference - -DataJoint configuration options and settings. - -## Configuration Sources - -Configuration is loaded in priority order: - -1. **Environment variables** (highest priority) -2. **Secrets directory** (`.secrets/`) -3. **Config file** (`datajoint.json`) -4. **Defaults** (lowest priority) - -## Database Settings - -| Setting | Environment | Default | Description | -|---------|-------------|---------|-------------| -| `database.host` | `DJ_HOST` | `localhost` | MySQL server hostname | -| `database.port` | `DJ_PORT` | `3306` | MySQL server port | -| `database.user` | `DJ_USER` | — | Database username | -| `database.password` | `DJ_PASS` | — | Database password | -| `database.reconnect` | — | `True` | Auto-reconnect on connection loss | -| `database.use_tls` | — | `None` | Enable TLS encryption | - -## Connection Settings - -| Setting | Default | Description | -|---------|---------|-------------| -| `connection.init_function` | `None` | SQL function to run on connect | -| `connection.charset` | `""` | Character set (pymysql default) | - -## Stores Configuration - -Unified storage configuration for all external storage types (``, ``, ``, ``, ``). - -**Default stores:** - -DataJoint uses two default settings to reflect the architectural distinction between integrated and reference storage: - -| Setting | Default | Description | -|---------|---------|-------------| -| `stores.default` | — | Default store for integrated storage (``, ``, ``, ``) | -| `stores.filepath_default` | — | Default store for filepath references (``) — often different from `stores.default` | - -**Why separate defaults?** Hash and schema-addressed storage are integrated into the Object-Augmented Schema (OAS)—DataJoint manages paths, lifecycle, and integrity. Filepath storage is user-managed references to existing files—DataJoint only stores the path. These are architecturally distinct and often use different storage locations. 
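A minimal sketch of this split (store names, bucket, and paths are placeholders; credentials go in `.secrets/` as in the full example below):

```json
{
  "stores": {
    "default": "main",
    "filepath_default": "acquisition",
    "main": {
      "protocol": "s3",
      "endpoint": "s3.amazonaws.com",
      "bucket": "lab-data",
      "location": "my-project"
    },
    "acquisition": {
      "protocol": "file",
      "location": "/mnt/acquisition"
    }
  }
}
```

Here managed objects land in the `main` S3 store, while filepath references resolve against the lab's acquisition filesystem.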
- -**Common settings (all protocols):** - -| Setting | Required | Description | -|---------|----------|-------------| -| `stores..protocol` | Yes | Storage protocol: `file`, `s3`, `gcs`, `azure` | -| `stores..location` | Yes | Base path or prefix (includes project context) | -| `stores..hash_prefix` | No | Path prefix for hash-addressed section (default: `"_hash"`) | -| `stores..schema_prefix` | No | Path prefix for schema-addressed section (default: `"_schema"`) | -| `stores..filepath_prefix` | No | Required path prefix for filepath section, or `null` for unrestricted (default: `null`) | -| `stores..subfolding` | No | Directory nesting for hash-addressed storage, e.g., `[2, 2]` (default: no subfolding) | -| `stores..partition_pattern` | No | Path partitioning for schema-addressed storage, e.g., `"subject_id/session_date"` (default: no partitioning) | -| `stores..token_length` | No | Random token length for schema-addressed filenames (default: `8`) | - -**Storage sections:** - -Each store is divided into sections defined by prefix configuration. The `*_prefix` parameters set the path prefix for each storage section: - -- **`hash_prefix`**: Defines the hash-addressed section for `` and `` (default: `"_hash"`) -- **`schema_prefix`**: Defines the schema-addressed section for `` and `` (default: `"_schema"`) -- **`filepath_prefix`**: Optionally restricts the filepath section for `` (default: `null` = unrestricted) - -Prefixes must be mutually exclusive (no prefix can be a parent/child of another). This allows mapping DataJoint to existing storage layouts: - -```json -{ - "stores": { - "legacy": { - "protocol": "file", - "location": "/data/existing_storage", - "hash_prefix": "content_addressed", // Path prefix for hash section - "schema_prefix": "structured_data", // Path prefix for schema section - "filepath_prefix": "raw_files" // Path prefix for filepath section - } - } -} -``` - -**S3-specific settings:** - -| Setting | Required | Description | -|---------|----------|-------------| -| `stores..endpoint` | Yes | S3 endpoint URL (e.g., `s3.amazonaws.com`) | -| `stores..bucket` | Yes | Bucket name | -| `stores..access_key` | Yes | S3 access key ID | -| `stores..secret_key` | Yes | S3 secret access key | -| `stores..secure` | No | Use HTTPS (default: `True`) | - -**GCS-specific settings:** - -| Setting | Required | Description | -|---------|----------|-------------| -| `stores..bucket` | Yes | GCS bucket name | -| `stores..token` | Yes | Authentication token path | -| `stores..project` | No | GCS project ID | - -**Azure-specific settings:** - -| Setting | Required | Description | -|---------|----------|-------------| -| `stores..container` | Yes | Azure container name | -| `stores..account_name` | Yes | Storage account name | -| `stores..account_key` | Yes | Storage account key | -| `stores..connection_string` | No | Alternative to account_name + account_key | - -**How storage methods use stores:** - -- **Hash-addressed** (``, ``): `{location}/{hash_prefix}/{schema}/{hash}` with optional subfolding -- **Schema-addressed** (``, ``): `{location}/{schema_prefix}/{partition}/{schema}/{table}/{key}/{field}.{token}.{ext}` with optional partitioning -- **Filepath** (``): `{location}/{filepath_prefix}/{user_path}` (user-managed, cannot use hash or schema prefixes) - -All storage methods share the same stores and default store. 
DataJoint reserves the configured `hash_prefix` and `schema_prefix` sections for managed storage; `` references can use any other paths (unless `filepath_prefix` is configured to restrict them). - -**Path structure examples:** - -Without partitioning: -``` -{location}/_hash/{schema}/ab/cd/abcd1234... # hash-addressed with subfolding -{location}/_schema/{schema}/{table}/{key}/data.x8f2a9b1.zarr # schema-addressed, no partitioning -``` - -With `partition_pattern: "subject_id/session_date"`: -``` -{location}/_schema/subject_id=042/session_date=2024-01-15/{schema}/{table}/{remaining_key}/data.x8f2a9b1.zarr -``` - -If table lacks partition attributes, it follows normal path structure. - -**Credentials should be stored in secrets:** - -``` -.secrets/ -├── stores.main.access_key -├── stores.main.secret_key -├── stores.archive.access_key -└── stores.archive.secret_key -``` - -## Jobs Settings - -| Setting | Default | Description | -|---------|---------|-------------| -| `jobs.auto_refresh` | `True` | Auto-refresh job queue on populate | -| `jobs.keep_completed` | `False` | Retain success records in jobs table | -| `jobs.stale_timeout` | `3600` | Seconds before stale job cleanup | -| `jobs.default_priority` | `5` | Default priority (0-255, lower = more urgent) | -| `jobs.version_method` | `None` | Version tracking: `git`, `none`, or `None` (disabled) | -| `jobs.add_job_metadata` | `False` | Add hidden metadata to computed tables | -| `jobs.allow_new_pk_fields_in_computed_tables` | `False` | Allow non-FK primary key fields | - -## Display Settings - -| Setting | Default | Description | -|---------|---------|-------------| -| `display.limit` | `12` | Max rows to display | -| `display.width` | `14` | Column width | -| `display.show_tuple_count` | `True` | Show row count in output | - -## Top-Level Settings - -| Setting | Environment | Default | Description | -|---------|-------------|---------|-------------| -| `loglevel` | `DJ_LOG_LEVEL` | `INFO` | Log level: `DEBUG`, `INFO`, `WARNING`, `ERROR`, `CRITICAL` | -| `safemode` | — | `True` | Require confirmation for destructive operations | -| `enable_python_native_blobs` | — | `True` | Allow Python-native blob serialization | -| `cache` | — | `None` | Path for query result cache | -| `query_cache` | — | `None` | Path for compiled query cache | -| `download_path` | — | `.` | Download location for attachments/filepaths | - -## Example Configuration - -### datajoint.json (Non-sensitive settings) - -```json -{ - "database.host": "mysql.example.com", - "database.port": 3306, - "stores": { - "default": "main", - "filepath_default": "raw_data", - "main": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "datajoint-bucket", - "location": "neuroscience-lab/production", - "partition_pattern": "subject_id/session_date", - "token_length": 8 - }, - "archive": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive-bucket", - "location": "neuroscience-lab/long-term", - "subfolding": [2, 2] - }, - "raw_data": { - "protocol": "file", - "location": "/mnt/acquisition", - "filepath_prefix": "recordings" - } - }, - "jobs": { - "add_job_metadata": true - } -} -``` - -### .secrets/ (Credentials - never commit!) - -``` -.secrets/ -├── database.user # analyst -├── database.password # dbpass123 -├── stores.main.access_key # AKIAIOSFODNN7EXAMPLE -├── stores.main.secret_key # wJalrXUtnFEMI/K7MDENG... -├── stores.archive.access_key # AKIAIOSFODNN8EXAMPLE -└── stores.archive.secret_key # xKbmsYVuoGFNJ/L8NEOH... 
-``` - -Add `.secrets/` to `.gitignore`: - -```bash -echo ".secrets/" >> .gitignore -``` - -### Environment Variables (Alternative to .secrets/) - -```bash -# Database -export DJ_HOST=mysql.example.com -export DJ_USER=analyst -export DJ_PASS=secret -``` - -**Note:** Per-store credentials must be configured in `datajoint.json` or `.secrets/` — environment variable overrides are not supported for nested store configurations. - -## API Reference - -See [Settings API](../api/datajoint/settings.md) for programmatic access. - - ---- -## File: reference/definition-syntax.md - -# Table Definition Syntax - -DataJoint's declarative table definition language. - -## Basic Structure - -```python -@schema -class TableName(dj.Manual): - definition = """ - # Table comment - primary_attr1 : type # comment - primary_attr2 : type # comment - --- - secondary_attr1 : type # comment - secondary_attr2 = default : type # comment with default - """ -``` - -## Grammar - -``` -definition = [comment] pk_section "---" secondary_section -pk_section = attribute_line+ -secondary_section = attribute_line* - -attribute_line = [foreign_key | attribute] -foreign_key = "->" table_reference [alias] -attribute = [default "="] name ":" type [# comment] - -default = NULL | literal | CURRENT_TIMESTAMP -type = core_type | codec_type | native_type -core_type = int32 | float64 | varchar(n) | ... -codec_type = "<" name ["@" [store]] ">" -``` - -## Foreign Keys - -```python --> ParentTable # Inherit all PK attributes --> ParentTable.proj(new='old') # Rename attributes -``` - -## Attribute Types - -### Core Types - -```python -mouse_id : int32 # 32-bit integer -weight : float64 # 64-bit float -name : varchar(100) # Variable string up to 100 chars -is_active : bool # Boolean -created : datetime # Date and time -data : json # JSON document -``` - -### Codec Types - -```python -image : # Serialized Python object (in DB) -large_array : # Serialized Python object (external) -config_file : # File attachment (in DB) -data_file : # File attachment (named store) -zarr_data : # Path-addressed folder -raw_path : # Portable file reference -``` - -## Defaults - -```python -status = "pending" : varchar(20) # String default -count = 0 : int32 # Numeric default -notes = '' : varchar(1000) # Empty string default (preferred for strings) -created = CURRENT_TIMESTAMP : datetime # Auto-timestamp -ratio = NULL : float64 # Nullable (only NULL can be default) -``` - -**Nullable attributes:** An attribute is nullable if and only if its default is `NULL`. -DataJoint does not allow other defaults for nullable attributes—this prevents ambiguity -about whether an attribute is optional. For strings, prefer empty string `''` as the -default rather than `NULL`. - -## Comments - -```python -# Table-level comment (first line) -mouse_id : int32 # Inline attribute comment -``` - -## Indexes - -```python -definition = """ - ... - --- - ... 
- INDEX (attr1) # Single-column index - INDEX (attr1, attr2) # Composite index - UNIQUE INDEX (email) # Unique constraint - """ -``` - -## Complete Example - -```python -@schema -class Session(dj.Manual): - definition = """ - # Experimental session - -> Subject - session_idx : int32 # Session number for this subject - --- - session_date : date # Date of session - -> [nullable] Experimenter # Optional experimenter - notes = '' : varchar(1000) # Session notes - start_time : datetime # Session start - duration : float64 # Duration in minutes - INDEX (session_date) - """ -``` - -## Validation - -DataJoint validates definitions at declaration time: - -- Primary key must have at least one attribute -- Attribute names must be valid identifiers -- Types must be recognized -- Foreign key references must exist -- No circular dependencies allowed - -## See Also - -- [Primary Keys](specs/primary-keys.md) — Key determination rules -- [Type System](specs/type-system.md) — Type architecture -- [Codec API](specs/codec-api.md) — Custom types - - ---- -## File: reference/errors.md - -# Error Reference - -DataJoint exception classes and their meanings. - -## Exception Hierarchy - -``` -Exception -└── DataJointError - ├── LostConnectionError - ├── QueryError - │ ├── QuerySyntaxError - │ ├── AccessError - │ ├── DuplicateError - │ ├── IntegrityError - │ ├── UnknownAttributeError - │ └── MissingAttributeError - ├── MissingTableError - ├── MissingExternalFile - └── BucketInaccessible -``` - -## Base Exception - -### DataJointError - -Base class for all DataJoint-specific errors. - -```python -try: - # DataJoint operation -except dj.DataJointError as e: - print(f"DataJoint error: {e}") -``` - -## Connection Errors - -### LostConnectionError - -Database connection was lost during operation. - -**Common causes:** -- Network interruption -- Server timeout -- Server restart - -**Resolution:** -- Check network connectivity -- Reconnect with `dj.conn().connect()` - -## Query Errors - -### QuerySyntaxError - -Invalid query syntax. - -**Common causes:** -- Malformed restriction string -- Invalid attribute reference -- SQL syntax error in projection - -### AccessError - -Insufficient database privileges. - -**Common causes:** -- User lacks SELECT/INSERT/DELETE privileges -- Schema access not granted - -**Resolution:** -- Contact database administrator -- Check user grants - -### DuplicateError - -Attempt to insert duplicate primary key. - -```python -try: - table.insert1({'id': 1, 'name': 'Alice'}) - table.insert1({'id': 1, 'name': 'Bob'}) # Raises DuplicateError -except dj.errors.DuplicateError: - print("Entry already exists") -``` - -**Resolution:** -- Use `insert(..., skip_duplicates=True)` -- Use `insert(..., replace=True)` to update -- Check if entry exists before inserting - -### IntegrityError - -Foreign key constraint violation. - -**Common causes:** -- Inserting row with non-existent parent -- Parent row deletion blocked by children - -**Resolution:** -- Insert parent rows first -- Use cascade delete for parent - -### UnknownAttributeError - -Referenced attribute doesn't exist. - -```python -# Raises UnknownAttributeError -table.to_arrays('nonexistent_column') -``` - -**Resolution:** -- Check `table.heading` for available attributes -- Verify spelling - -### MissingAttributeError - -Required attribute not provided in insert. 
- -```python -# Raises MissingAttributeError if 'name' is required -table.insert1({'id': 1}) # Missing 'name' -``` - -**Resolution:** -- Provide all required attributes -- Set default values in definition - -## Table Errors - -### MissingTableError - -Table not declared in database. - -**Common causes:** -- Schema not created -- Table class not instantiated -- Database dropped - -**Resolution:** -- Check schema exists: `schema.is_activated()` -- Verify table declaration - -## Storage Errors - -### MissingExternalFile - -External file managed by DataJoint is missing. - -**Common causes:** -- File manually deleted from store -- Store misconfigured -- Network/permission issues - -**Resolution:** -- Check store configuration -- Verify file exists at expected path -- Run garbage collection audit - -### BucketInaccessible - -S3 bucket cannot be accessed. - -**Common causes:** -- Invalid credentials -- Bucket doesn't exist -- Network/firewall issues - -**Resolution:** -- Verify AWS credentials -- Check bucket name and region -- Test with AWS CLI - -## Handling Errors - -### Catching Specific Errors - -```python -import datajoint as dj - -try: - table.insert1(data) -except dj.errors.DuplicateError: - print("Entry exists, skipping") -except dj.errors.IntegrityError: - print("Parent entry missing") -except dj.DataJointError as e: - print(f"Other DataJoint error: {e}") -``` - -### Error Information - -```python -try: - table.insert1(data) -except dj.DataJointError as e: - print(f"Error type: {type(e).__name__}") - print(f"Message: {e}") - print(f"Args: {e.args}") -``` - -## See Also - -- [API: errors module](../api/datajoint/errors.md) - - ---- -## File: reference/index.md - -# Reference - -Specifications, API documentation, and technical details. - -## Specifications - -Detailed specifications of DataJoint's behavior and semantics. - -- [Primary Key Rules](specs/primary-keys.md) — How primary keys are determined in query results -- [Semantic Matching](specs/semantic-matching.md) — Attribute lineage and homologous matching -- [Type System](specs/type-system.md) — Core types, codecs, and storage modes -- [Codec API](specs/codec-api.md) — Creating custom attribute types -- [Object Store Configuration](specs/object-store-configuration.md) — Store configuration, path generation, and integrated storage models -- [AutoPopulate](specs/autopopulate.md) — Jobs 2.0 specification -- [Fetch API](specs/fetch-api.md) — Data retrieval methods -- [Job Metadata](specs/job-metadata.md) — Hidden job tracking columns - -## Quick Reference - -- [Configuration](configuration.md) — All `dj.config` options -- [Definition Syntax](definition-syntax.md) — Table definition grammar -- [Operators](operators.md) — Query operator summary -- [Errors](errors.md) — Exception types and meanings - -## Elements - -Curated pipeline modules for neurophysiology experiments. - -- [DataJoint Elements](../elements/index.md) — Pre-built pipelines for calcium imaging, electrophysiology, behavior tracking, and more - -## API Documentation - -Auto-generated from source code docstrings. - -- [API Index](../api/index.md) - - ---- -## File: reference/operators.md - -# Query Operators Reference - -DataJoint provides a small set of operators for querying data. All operators return new query expressions without modifying the original—queries are immutable and composable. 
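Because each operator returns a new expression, queries can be built up step by step without touching the database. A small sketch using the `Subject` and `Session` tables from the tutorials (nothing executes until results are fetched, e.g. with `to_dicts()`):

```python
male = Subject & "sex = 'M'"                      # Subject itself is unchanged
recent = Session & "session_date > '2024-01-01'"

# Compose further: join, then project — still just a query expression
q = (male * recent).proj('session_date')

rows = q.to_dicts()   # only this line runs a query against the database
```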
- -## Operator Summary - -| Operator | Syntax | Description | -|----------|--------|-------------| -| Restriction | `A & condition` | Select rows matching condition | -| Anti-restriction | `A - condition` | Select rows NOT matching condition | -| Projection | `A.proj(...)` | Select, rename, or compute attributes | -| Join | `A * B` | Combine tables on matching attributes | -| Extension | `A.extend(B)` | Add attributes from B, keeping all rows of A | -| Aggregation | `A.aggr(B, ...)` | Group B by A's primary key and compute summaries | -| Union | `A + B` | Combine entity sets | - ---- - -## Restriction (`&`) - -Select rows that match a condition. - -```python -# String condition (SQL expression) -Session & "session_date > '2024-01-01'" -Session & "duration BETWEEN 30 AND 60" - -# Dictionary (exact match) -Session & {'subject_id': 'M001'} -Session & {'subject_id': 'M001', 'session_idx': 1} - -# Query expression (matching keys) -Session & Subject # Sessions for subjects in Subject table -Session & (Subject & "sex = 'M'") # Sessions for male subjects - -# List (OR of conditions) -Session & [{'subject_id': 'M001'}, {'subject_id': 'M002'}] -``` - -**Chaining**: Multiple restrictions combine with AND: -```python -Session & "duration > 30" & {'experimenter': 'alice'} -``` - -### Top N Rows (`dj.Top`) - -Restrict to the top N rows with optional ordering: - -```python -# First row by primary key -Session & dj.Top() - -# First 10 rows by primary key (ascending) -Session & dj.Top(10) - -# First 10 rows by primary key (descending) -Session & dj.Top(10, 'KEY DESC') - -# Top 5 by score descending -Result & dj.Top(5, 'score DESC') - -# Top 10 most recent sessions -Session & dj.Top(10, 'session_date DESC') - -# Pagination: skip 20, take 10 -Session & dj.Top(10, 'session_date DESC', offset=20) - -# All rows ordered (no limit) -Session & dj.Top(None, 'session_date DESC') -``` - -**Parameters**: -- `limit` (default=1): Maximum rows. Use `None` for no limit. -- `order_by` (default="KEY"): Attribute(s) to sort by. `"KEY"` expands to all primary key attributes. Add `DESC` for descending order (e.g., `"KEY DESC"`, `"score DESC"`). Use `None` to inherit existing order. -- `offset` (default=0): Rows to skip. - -**Chaining Tops**: When chaining multiple Top restrictions, the second Top can inherit the first's ordering by using `order_by=None`: - -```python -# First Top sets the order, second inherits it -(Session & dj.Top(100, 'date DESC')) & dj.Top(10, order_by=None) -# Result: top 10 of top 100 by date descending -``` - -**Note**: `dj.Top` can only be used with restriction (`&`), not with anti-restriction (`-`). - ---- - -## Anti-Restriction (`-`) - -Select rows that do NOT match a condition. - -```python -# Subjects without any sessions -Subject - Session - -# Sessions not from subject M001 -Session - {'subject_id': 'M001'} - -# Sessions without trials -Session - Trial -``` - ---- - -## Projection (`.proj()`) - -Select, rename, or compute attributes. Primary key is always included. - -```python -# Primary key only -Subject.proj() - -# Specific attributes -Subject.proj('species', 'sex') - -# All attributes -Subject.proj(...) - -# All except some -Subject.proj(..., '-notes', '-internal_id') - -# Rename attribute -Subject.proj(animal_species='species') - -# Computed attribute (SQL expression) -Subject.proj(weight_kg='weight / 1000') -Session.proj(year='YEAR(session_date)') -Trial.proj(is_correct='response = stimulus') -``` - ---- - -## Join (`*`) - -Combine tables on shared attributes. 
DataJoint matches attributes by **semantic matching**—only attributes with the same name AND same origin (through foreign keys) are matched. - -```python -# Join Subject and Session on subject_id -Subject * Session - -# Three-way join -Subject * Session * Experimenter - -# Join then restrict -(Subject * Session) & "sex = 'M'" - -# Restrict then join (equivalent) -(Subject & "sex = 'M'") * Session -``` - -**Primary key of result**: Determined by functional dependencies between operands. See [Query Algebra Specification](specs/query-algebra.md) for details. - ---- - -## Extension (`.extend()`) - -Add attributes from another table while preserving all rows. This is useful for adding optional attributes. - -```python -# Add experimenter info to sessions -# Sessions without an experimenter get NULL values -Session.extend(Experimenter) -``` - -**Requirement**: The left operand must "determine" the right operand—all of B's primary key attributes must exist in A. - ---- - -## Aggregation (`.aggr()`) - -Group one entity type by another and compute summary statistics. - -```python -# Count trials per session -Session.aggr(Session.Trial, n_trials='count(trial_idx)') - -# Multiple aggregates -Session.aggr( - Session.Trial, - n_trials='count(trial_idx)', - n_correct='sum(correct)', - avg_rt='avg(reaction_time)', - min_rt='min(reaction_time)', - max_rt='max(reaction_time)' -) - -# Count sessions per subject -Subject.aggr(Session, n_sessions='count(session_idx)') -``` - -**Default behavior**: Keeps all rows from the grouping table (left operand), even those without matches. Use `count(pk_attribute)` to get 0 for entities without matches. - -```python -# All subjects, including those with 0 sessions -Subject.aggr(Session, n_sessions='count(session_idx)') - -# Only subjects with at least one session -Subject.aggr(Session, n_sessions='count(session_idx)', exclude_nonmatching=True) -``` - -### Common Aggregate Functions - -| Function | Description | -|----------|-------------| -| `count(attr)` | Count non-NULL values | -| `count(*)` | Count all rows (including NULL) | -| `sum(attr)` | Sum of values | -| `avg(attr)` | Average | -| `min(attr)` | Minimum | -| `max(attr)` | Maximum | -| `std(attr)` | Standard deviation | -| `group_concat(attr)` | Concatenate values | - ---- - -## Union (`+`) - -Combine entity sets from two tables with the same primary key. - -```python -# All subjects that are either mice or rats -Mouse + Rat -``` - -**Requirements**: -- Same primary key attributes -- No overlapping secondary attributes - ---- - -## Universal Set (`dj.U()`) - -Create ad-hoc groupings or extract unique values. 
- -### Unique Values - -```python -# Unique species -dj.U('species') & Subject - -# Unique (year, month) combinations -dj.U('year', 'month') & Session.proj(year='YEAR(session_date)', month='MONTH(session_date)') -``` - -### Aggregation by Non-Primary-Key Attributes - -```python -# Count sessions by date (session_date is not a primary key) -dj.U('session_date').aggr(Session, n='count(session_idx)') - -# Count by experimenter -dj.U('experimenter_id').aggr(Session, n='count(session_idx)') -``` - -### Universal Aggregation (Single Row Result) - -```python -# Total count across all sessions -dj.U().aggr(Session, total='count(*)') - -# Global statistics -dj.U().aggr(Trial, - total='count(*)', - avg_rt='avg(reaction_time)', - std_rt='std(reaction_time)' -) -``` - ---- - -## Operator Precedence - -Python operator precedence applies: - -| Precedence | Operator | Operation | -|------------|----------|-----------| -| Highest | `*` | Join | -| | `+`, `-` | Union, Anti-restriction | -| Lowest | `&` | Restriction | - -Use parentheses to make intent clear: - -```python -# Join happens before restriction -Subject * Session & condition # Same as: (Subject * Session) & condition - -# Use parentheses to restrict first -(Subject & condition) * Session -``` - ---- - -## Semantic Matching - -DataJoint uses **semantic matching** for joins and restrictions by query expression. Attributes match only if they have: - -1. The same name -2. The same origin (traced through foreign key lineage) - -This prevents accidental matches on attributes that happen to share names but represent different things (like generic `id` columns in unrelated tables). - -```python -# These match on subject_id because Session references Subject -Subject * Session # Correct: subject_id has same lineage - -# These would error if both have 'name' from different origins -Student * Course # Error if both define their own 'name' attribute -``` - -**Resolution**: Rename attributes to avoid conflicts: -```python -Student * Course.proj(..., course_name='name') -``` - ---- - -## See Also - -- [Query Algebra Specification](specs/query-algebra.md) — Complete formal specification -- [Fetch API](specs/fetch-api.md) — Retrieving query results -- [Queries Tutorial](../tutorials/basics/04-queries.ipynb) — Hands-on examples - - ---- -## File: reference/specs/autopopulate.md - -# AutoPopulate Specification - -## Overview - -AutoPopulate is DataJoint's mechanism for automated computation. Tables that inherit from `dj.Computed` or `dj.Imported` automatically populate themselves by executing a `make()` method for each entry defined by their dependencies. - -This specification covers: -- The populate process and key source calculation -- Transaction management and atomicity -- The `make()` method and tripartite pattern -- Part tables in computed results -- Distributed computing with job reservation - ---- - -## 1. Auto-Populated Tables - -### 1.1 Table Types - -| Type | Base Class | Purpose | -|------|------------|---------| -| Computed | `dj.Computed` | Results derived from other DataJoint tables | -| Imported | `dj.Imported` | Data ingested from external sources (files, instruments) | - -Both types share the same AutoPopulate mechanism. The distinction is semantic—`Imported` indicates external data sources while `Computed` indicates derivation from existing tables. 
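For contrast with the `dj.Computed` example in the next section, a `dj.Imported` table has the same structure; only its `make()` reads from an external source. A sketch in which the file layout and the `read_ephys()` helper are hypothetical:

```python
@schema
class RawEphys(dj.Imported):
    definition = """
    -> Session
    ---
    n_samples     : int32    # number of samples read from the source file
    sampling_rate : float64  # sampling rate in Hz
    """

    def make(self, key):
        # External source: a file on the acquisition system (hypothetical layout)
        path = f"/mnt/acquisition/{key['subject_id']}/{key['session_idx']}.dat"
        data = read_ephys(path)  # hypothetical reader returning a NumPy array

        self.insert1({
            **key,
            'n_samples': len(data),
            'sampling_rate': 30000.0,
        })
```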
- -### 1.2 Basic Structure - -```python -@schema -class FilteredImage(dj.Computed): - definition = """ - -> RawImage - --- - filtered : - """ - - def make(self, key): - # Fetch source data - raw = (RawImage & key).fetch1('image') - - # Compute result - filtered = apply_filter(raw) - - # Insert result - self.insert1({**key, 'filtered': filtered}) -``` - -### 1.3 Primary Key Constraint - -Auto-populated tables must have primary keys composed entirely of foreign key references: - -```python -# Correct: all PK attributes from foreign keys -@schema -class Analysis(dj.Computed): - definition = """ - -> Session - -> AnalysisMethod - --- - result : float64 - """ - -# Error: non-FK primary key attribute -@schema -class Analysis(dj.Computed): - definition = """ - -> Session - method : varchar(32) # Not allowed - use FK to lookup table - --- - result : float64 - """ -``` - -**Rationale:** This ensures each computed entry is uniquely determined by its upstream dependencies, enabling automatic key source calculation and precise job tracking. - ---- - -## 2. Key Source Calculation - -### 2.1 Definition - -The `key_source` property defines which entries should exist in the table—the complete set of primary keys that `make()` should be called with. - -### 2.2 Automatic Key Source - -By default, DataJoint automatically calculates `key_source` as the join of all tables referenced by foreign keys in the primary key: - -```python -@schema -class SpikeDetection(dj.Computed): - definition = """ - -> Recording - -> DetectionMethod - --- - spike_times : - """ - # Automatic key_source = Recording * DetectionMethod -``` - -**Calculation rules:** -1. Identify all foreign keys in the primary key section -2. Join the referenced tables: `Parent1 * Parent2 * ...` -3. Project to primary key attributes only - -For a table with definition: -```python --> Session --> Probe --> SortingMethod ---- -units : -``` - -The automatic `key_source` is: -```python -Session * Probe * SortingMethod -``` - -This produces all valid combinations of (session, probe, method) that could be computed. - -### 2.3 Custom Key Source - -Override `key_source` to customize which entries to compute: - -```python -@schema -class QualityAnalysis(dj.Computed): - definition = """ - -> Session - --- - score : float64 - """ - - @property - def key_source(self): - # Only process sessions marked as 'good' - return Session & "quality = 'good'" -``` - -**Common customizations:** - -```python -# Filter by condition -@property -def key_source(self): - return Session & "status = 'complete'" - -# Restrict to specific combinations -@property -def key_source(self): - return Recording * Method & "method_name != 'deprecated'" - -# Add complex logic -@property -def key_source(self): - # Only sessions with enough trials - good_sessions = dj.U('session_id').aggr( - Trial, n='count(*)') & 'n >= 100' - return Session & good_sessions -``` - -### 2.4 Pending Entries - -Entries to be computed = `key_source - self`: - -```python -# Entries that should exist but don't yet -pending = table.key_source - table - -# Check how many entries need computing -n_pending = len(table.key_source - table) -``` - ---- - -## 3. The Populate Process - -### 3.1 Basic Populate - -The `populate()` method iterates through pending entries and calls `make()` for each: - -```python -# Populate all pending entries -FilteredImage.populate() -``` - -**Execution flow (direct mode):** - -``` -1. Calculate pending keys: key_source - self -2. Apply restrictions: pending & restrictions -3. 
For each key in pending: - a. Start transaction - b. Call make(key) - c. Commit transaction (or rollback on error) -4. Return summary -``` - -### 3.2 Method Signature - -```python -def populate( - self, - *restrictions, - suppress_errors: bool = False, - return_exception_objects: bool = False, - reserve_jobs: bool = False, - max_calls: int = None, - display_progress: bool = False, - processes: int = 1, - make_kwargs: dict = None, - priority: int = None, - refresh: bool = None, -) -> dict -``` - -### 3.3 Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `*restrictions` | — | Filter `key_source` to subset of entries | -| `suppress_errors` | `False` | Continue on errors instead of raising | -| `return_exception_objects` | `False` | Return exception objects vs strings | -| `reserve_jobs` | `False` | Enable job reservation for distributed computing | -| `max_calls` | `None` | Maximum number of `make()` calls | -| `display_progress` | `False` | Show progress bar | -| `processes` | `1` | Number of parallel worker processes | -| `make_kwargs` | `None` | Additional kwargs passed to `make()` | -| `priority` | `None` | Process only jobs at this priority or more urgent | -| `refresh` | `None` | Refresh jobs queue before processing | - -### 3.4 Common Usage Patterns - -```python -# Populate everything -Analysis.populate() - -# Populate specific subjects -Analysis.populate(Subject & "subject_id < 10") - -# Populate with progress bar -Analysis.populate(display_progress=True) - -# Populate limited batch -Analysis.populate(max_calls=100) - -# Populate with error collection -errors = Analysis.populate(suppress_errors=True) - -# Parallel populate (single machine) -Analysis.populate(processes=4) -``` - -### 3.5 Return Value - -```python -result = Analysis.populate() -# { -# 'success': 150, # Entries successfully computed -# 'error': 3, # Entries that failed -# 'skip': 0, # Entries skipped (already exist) -# } -``` - ---- - -## 4. The make() Method - -### 4.1 Basic Pattern - -The `make()` method computes and inserts one entry: - -```python -def make(self, key): - """ - Compute and insert one entry. - - Parameters - ---------- - key : dict - Primary key values identifying which entry to compute. - """ - # 1. Fetch source data - source_data = (SourceTable & key).fetch1() - - # 2. Compute result - result = compute(source_data) - - # 3. Insert result - self.insert1({**key, **result}) -``` - -### 4.2 Requirements - -- **Must insert**: `make()` must insert exactly one row matching the key -- **Idempotent**: Same input should produce same output -- **Atomic**: Runs within a transaction—all or nothing -- **Self-contained**: Should not depend on external state that changes - -### 4.3 Accessing Source Data - -```python -def make(self, key): - # Fetch single row - data = (SourceTable & key).fetch1() - - # Fetch specific attributes - image, timestamp = (Recording & key).fetch1('image', 'timestamp') - - # Fetch multiple rows (e.g., trials for a session) - trials = (Trial & key).to_dicts() - - # Join multiple sources - combined = (TableA * TableB & key).to_dicts() -``` - -**Upstream-only convention:** Inside `make()`, fetch only from tables that are strictly upstream in the pipeline—tables referenced by foreign keys in the definition, their ancestors, and their part tables. This ensures reproducibility: computed results depend only on their declared dependencies. - -This convention is not currently enforced programmatically but is critical for pipeline integrity. 
Some pipelines violate this rule for operational reasons, which makes them non-reproducible. A future release may programmatically enforce upstream-only fetches inside `make()`. - -### 4.4 Tripartite Make Pattern - -For long-running computations, use the tripartite pattern to separate fetch, compute, and insert phases. This enables better transaction management for jobs that take minutes or hours. - -**Method-based tripartite:** - -```python -@schema -class HeavyComputation(dj.Computed): - definition = """ - -> Recording - --- - result : - """ - - def make_fetch(self, key): - """Fetch all required data (runs in transaction).""" - return (Recording & key).fetch1('raw_data') - - def make_compute(self, key, data): - """Perform computation (runs outside transaction).""" - # Long-running computation - no database locks held - return heavy_algorithm(data) - - def make_insert(self, key, result): - """Insert results (runs in transaction).""" - self.insert1({**key, 'result': result}) -``` - -**Generator-based tripartite:** - -```python -def make(self, key): - # Phase 1: Fetch (in transaction) - data = (Recording & key).fetch1('raw_data') - - yield # Exit transaction, release locks - - # Phase 2: Compute (outside transaction) - result = heavy_algorithm(data) # May take hours - - yield # Re-enter transaction - - # Phase 3: Insert (in transaction) - self.insert1({**key, 'result': result}) -``` - -**When to use tripartite:** -- Computation takes more than a few seconds -- You want to avoid holding database locks during computation -- Working with external resources (files, APIs) that may be slow - -### 4.5 Additional make() Arguments - -Pass extra arguments via `make_kwargs`: - -```python -@schema -class ConfigurableAnalysis(dj.Computed): - definition = """ - -> Session - --- - result : float64 - """ - - def make(self, key, threshold=0.5, method='default'): - data = (Session & key).fetch1('data') - result = analyze(data, threshold=threshold, method=method) - self.insert1({**key, 'result': result}) - -# Call with custom parameters -ConfigurableAnalysis.populate(make_kwargs={'threshold': 0.8}) -``` - -**Anti-pattern warning:** Passing arguments that affect the computed result breaks reproducibility—all inputs should come from `fetch` calls inside `make()`. If a parameter affects results, it should be stored in a lookup table and referenced via foreign key. - -**Acceptable use:** Directives that don't affect results, such as: -- `verbose=True` for logging -- `gpu_id=0` for device selection -- `n_workers=4` for parallelization - ---- - -## 5. Transaction Management - -### 5.1 Automatic Transactions - -Each `make()` call runs within an automatic transaction: - -```python -# Pseudocode for populate loop -for key in pending_keys: - connection.start_transaction() - try: - self.make(key) - connection.commit() - except Exception: - connection.rollback() - raise # or log if suppress_errors=True -``` - -### 5.2 Atomicity Guarantees - -- **All or nothing**: If `make()` fails, no partial data is inserted -- **Isolation**: Concurrent workers see consistent state -- **Rollback on error**: Any exception rolls back the transaction - -```python -def make(self, key): - # If this succeeds... - self.insert1({**key, 'step1': result1}) - - # But this fails... 
- self.Part.insert(part_data) # Raises exception - - # Both inserts are rolled back - table unchanged -``` - -### 5.3 Transaction Scope - -**Simple make (single transaction):** -``` -BEGIN TRANSACTION - └── make(key) - ├── fetch source data - ├── compute - └── insert result -COMMIT -``` - -**Tripartite make (single transaction):** -``` -[No transaction] - ├── make_fetch(key) # Fetch source data - └── make_compute(key, data) # Long-running computation - -BEGIN TRANSACTION - ├── make_fetch(key) # Repeat fetch, verify unchanged - └── make_insert(key, result) # Insert computed result -COMMIT -``` - -This pattern allows long computations without holding database locks, while ensuring data consistency by verifying the source data hasn't changed before inserting. - -### 5.4 Nested Operations - -Inserts within `make()` share the same transaction: - -```python -def make(self, key): - # Main table insert - self.insert1({**key, 'summary': summary}) - - # Part table inserts - same transaction - self.Part1.insert(part1_data) - self.Part2.insert(part2_data) - - # All three inserts commit together or roll back together -``` - -### 5.5 Manual Transaction Control - -For complex scenarios, use explicit transactions: - -```python -def make(self, key): - # Fetch outside transaction - data = (Source & key).to_dicts() - - # Explicit transaction for insert - with dj.conn().transaction: - self.insert1({**key, 'result': compute(data)}) - self.Part.insert(parts) -``` - ---- - -## 6. Part Tables - -### 6.1 Part Tables in Computed Tables - -Computed tables can have Part tables for detailed results: - -```python -@schema -class SpikeSorting(dj.Computed): - definition = """ - -> Recording - --- - n_units : int - """ - - class Unit(dj.Part): - definition = """ - -> master - unit_id : int - --- - waveform : - spike_times : - """ - - def make(self, key): - # Compute spike sorting - units = sort_spikes((Recording & key).fetch1('data')) - - # Insert master entry - self.insert1({**key, 'n_units': len(units)}) - - # Insert part entries - self.Unit.insert([ - {**key, 'unit_id': i, **unit} - for i, unit in enumerate(units) - ]) -``` - -### 6.2 Transaction Behavior - -Master and part inserts share the same transaction: - -```python -def make(self, key): - self.insert1({**key, 'summary': s}) # Master - self.Part.insert(parts) # Parts - - # If Part.insert fails, master insert is also rolled back -``` - -### 6.3 Fetching Part Data - -```python -# Fetch master with parts -master = (SpikeSorting & key).fetch1() -parts = (SpikeSorting.Unit & key).to_dicts() - -# Join master and parts -combined = (SpikeSorting * SpikeSorting.Unit & key).to_dicts() -``` - -### 6.4 Key Source with Parts - -The key source is based on the master table's primary key only: - -```python -# key_source returns master keys, not part keys -SpikeSorting.key_source # Recording keys -``` - -### 6.5 Deleting Computed Parts - -Deleting master entries cascades to parts: - -```python -# Deletes SpikeSorting entry AND all SpikeSorting.Unit entries -(SpikeSorting & key).delete() -``` - ---- - -## 7. Progress and Monitoring - -### 7.1 Progress Method - -Check computation progress: - -```python -# Simple progress -remaining, total = Analysis.progress() -print(f"{remaining}/{total} entries remaining") - -# With display -Analysis.progress(display=True) -# Analysis: 150/200 (75%) [===========> ] -``` - -### 7.2 Display Progress During Populate - -```python -Analysis.populate(display_progress=True) -# [################----] 80% 160/200 [00:15<00:04] -``` - ---- - -## 8. 
Direct Mode vs Distributed Mode - -### 8.1 Direct Mode (Default) - -When `reserve_jobs=False` (default): - -```python -Analysis.populate() # Direct mode -``` - -**Characteristics:** -- Calculates `key_source - self` on each call -- No job tracking or status persistence -- Simple and efficient for single-worker scenarios -- No coordination overhead - -**Best for:** -- Interactive development -- Single-worker pipelines -- Small to medium datasets - -### 8.2 Distributed Mode - -When `reserve_jobs=True`: - -```python -Analysis.populate(reserve_jobs=True) # Distributed mode -``` - -**Characteristics:** -- Uses per-table jobs queue for coordination -- Workers reserve jobs before processing -- Full status tracking (pending, reserved, error, success) -- Enables monitoring and recovery - -**Best for:** -- Multi-worker distributed computing -- Long-running pipelines -- Production environments with monitoring needs - ---- - -## 9. Per-Table Jobs System - -### 9.1 Jobs Table - -Each auto-populated table has an associated jobs table: - -``` -Table: Analysis -Jobs: ~~analysis -``` - -Access via the `.jobs` property: - -```python -Analysis.jobs # Jobs table -Analysis.jobs.pending # Pending jobs -Analysis.jobs.errors # Failed jobs -Analysis.jobs.progress() # Status summary -``` - -### 9.2 Jobs Table Structure - -``` -# Job queue for Analysis - ---- -status : enum('pending', 'reserved', 'success', 'error', 'ignore') -priority : uint8 # Lower = more urgent (0 = highest) -created_time : timestamp -scheduled_time : timestamp # Process on or after this time -reserved_time : timestamp # When reserved -completed_time : timestamp # When completed -duration : float64 # Execution time in seconds -error_message : varchar(2047) # Truncated error -error_stack : # Full traceback -user : varchar(255) # Database user -host : varchar(255) # Worker hostname -pid : uint32 # Process ID -connection_id : uint64 # MySQL connection ID -version : varchar(255) # Code version -``` - -### 9.3 Job Statuses - -| Status | Description | -|--------|-------------| -| `pending` | Queued and ready to process | -| `reserved` | Currently being processed by a worker | -| `success` | Completed successfully (when `jobs.keep_completed=True`) | -| `error` | Failed with error details | -| `ignore` | Manually marked to skip | - -```mermaid -stateDiagram-v2 - state "(none)" as none1 - state "(none)" as none2 - none1 --> pending : refresh() - none1 --> ignore : ignore() - pending --> reserved : reserve() - reserved --> none2 : complete() - reserved --> success : complete()* - reserved --> error : error() - success --> pending : refresh()* - error --> none2 : delete() - success --> none2 : delete() - ignore --> none2 : delete() -``` - -**Transitions:** - -| Method | Description | -|--------|-------------| -| `refresh()` | Adds new jobs as `pending`; re-pends `success` jobs if key is in `key_source` but not in target | -| `ignore()` | Marks a key as `ignore` (can be called on keys not yet in jobs table) | -| `reserve()` | Marks a `pending` job as `reserved` before calling `make()` | -| `complete()` | Deletes job (default) or marks as `success` (when `jobs.keep_completed=True`) | -| `error()` | Marks `reserved` job as `error` with message and stack trace | -| `delete()` | Removes job entry; use `(jobs & condition).delete()` pattern | - -**Notes:** - -- `ignore` is set manually via `jobs.ignore(key)` and skipped by `populate()` and `refresh()` -- To reset an ignored job: `jobs.ignored.delete(); jobs.refresh()` - -### 9.4 Jobs API - -```python -# Refresh 
job queue (sync with key_source) -Analysis.jobs.refresh() - -# Status queries -Analysis.jobs.pending # Pending jobs -Analysis.jobs.reserved # Currently processing -Analysis.jobs.errors # Failed jobs -Analysis.jobs.ignored # Skipped jobs -Analysis.jobs.completed # Success jobs (if kept) - -# Progress summary -Analysis.jobs.progress() -# {'pending': 150, 'reserved': 3, 'success': 847, 'error': 12, 'total': 1012} - -# Manual control -Analysis.jobs.ignore(key) # Skip a job -(Analysis.jobs & condition).delete() # Remove jobs -Analysis.jobs.errors.delete() # Clear errors -``` - ---- - -## 10. Priority and Scheduling - -### 10.1 Priority - -Lower values = higher priority (0 is most urgent): - -```python -# Urgent jobs (priority 0) -Analysis.jobs.refresh(priority=0) - -# Normal jobs (default priority 5) -Analysis.jobs.refresh() - -# Background jobs (priority 10) -Analysis.jobs.refresh(priority=10) - -# Urgent jobs for specific data -Analysis.jobs.refresh(Subject & "priority='urgent'", priority=0) -``` - -### 10.2 Scheduling - -Delay job availability using server time: - -```python -# Available in 2 hours -Analysis.jobs.refresh(delay=2*60*60) - -# Available tomorrow -Analysis.jobs.refresh(delay=24*60*60) -``` - -Jobs with `scheduled_time > now` are not processed by `populate()`. - ---- - -## 11. Distributed Computing - -### 11.1 Basic Pattern - -Multiple workers can run simultaneously: - -```python -# Worker 1 -Analysis.populate(reserve_jobs=True) - -# Worker 2 (different machine/process) -Analysis.populate(reserve_jobs=True) - -# Worker 3 -Analysis.populate(reserve_jobs=True) -``` - -### 11.2 Execution Flow (Distributed) - -``` -1. Refresh jobs queue (if auto_refresh=True) -2. Fetch pending jobs ordered by (priority, scheduled_time) -3. For each job: - a. Mark as 'reserved' - b. Start transaction - c. Call make(key) - d. Commit transaction - e. Mark as 'success' or delete job - f. On error: mark as 'error' with details -``` - -### 11.3 Conflict Resolution - -When two workers reserve the same job simultaneously: - -1. Both reservations succeed (optimistic, no locking) -2. Both call `make()` for the same key -3. First worker's transaction commits -4. Second worker gets duplicate key error (silently ignored) -5. First worker marks job complete - -This is acceptable because: -- The `make()` transaction guarantees data integrity -- Conflicts are rare with job reservation -- Wasted computation is minimal vs locking overhead - ---- - -## 12. Error Handling - -### 12.1 Default Behavior - -Errors stop populate and raise the exception: - -```python -Analysis.populate() # Stops on first error -``` - -### 12.2 Suppressing Errors - -Continue processing despite errors: - -```python -errors = Analysis.populate( - suppress_errors=True, - return_exception_objects=True -) -# errors contains list of (key, exception) tuples -``` - -### 12.3 Error Recovery (Distributed Mode) - -```python -# View errors -for err in Analysis.jobs.errors.to_dicts(): - print(f"Key: {err}, Error: {err['error_message']}") - -# Clear and retry -Analysis.jobs.errors.delete() -Analysis.jobs.refresh() -Analysis.populate(reserve_jobs=True) -``` - -### 12.4 Stale and Orphaned Jobs - -**Stale jobs**: Keys no longer in `key_source` (upstream deleted) -```python -Analysis.jobs.refresh(stale_timeout=3600) # Clean up after 1 hour -``` - -**Orphaned jobs**: Reserved jobs whose worker crashed -```python -Analysis.jobs.refresh(orphan_timeout=3600) # Reset after 1 hour -``` - ---- - -## 13. 
Configuration - -```python -dj.config['jobs.auto_refresh'] = True # Auto-refresh on populate -dj.config['jobs.keep_completed'] = False # Retain success records -dj.config['jobs.stale_timeout'] = 3600 # Seconds before stale cleanup -dj.config['jobs.default_priority'] = 5 # Default priority (lower=urgent) -dj.config['jobs.version'] = None # Version string ('git' for auto) -dj.config['jobs.add_job_metadata'] = False # Add hidden metadata columns -``` - ---- - -## 14. Hidden Job Metadata - -When `config['jobs.add_job_metadata'] = True`, auto-populated tables receive hidden columns: - -| Column | Type | Description | -|--------|------|-------------| -| `_job_start_time` | `datetime(3)` | When computation began | -| `_job_duration` | `float64` | Duration in seconds | -| `_job_version` | `varchar(64)` | Code version | - -```python -# Fetch with job metadata -Analysis().to_arrays('result', '_job_duration') - -# Query slow computations -slow = Analysis & '_job_duration > 3600' -``` - ---- - -## 15. Migration from Legacy DataJoint - -DataJoint 2.0 replaces the schema-level `~jobs` table with per-table `~~table_name` jobs tables. See the [Migration Guide](../../how-to/migrate-to-v20.md) for details. - ---- - -## 16. Quick Reference - -### 16.1 Common Operations - -```python -# Basic populate (direct mode) -Table.populate() -Table.populate(restriction) -Table.populate(max_calls=100, display_progress=True) - -# Distributed populate -Table.populate(reserve_jobs=True) - -# Check progress -remaining, total = Table.progress() -Table.jobs.progress() # Detailed status - -# Error handling -Table.populate(suppress_errors=True) -Table.jobs.errors.to_dicts() -Table.jobs.errors.delete() - -# Priority control -Table.jobs.refresh(priority=0) # Urgent -Table.jobs.refresh(delay=3600) # Scheduled -``` - -### 16.2 make() Patterns - -```python -# Simple make -def make(self, key): - data = (Source & key).fetch1() - self.insert1({**key, 'result': compute(data)}) - -# With parts -def make(self, key): - self.insert1({**key, 'summary': s}) - self.Part.insert(parts) - -# Tripartite (generator) -def make(self, key): - data = (Source & key).fetch1() - yield # Release transaction - result = heavy_compute(data) - yield # Re-acquire transaction - self.insert1({**key, 'result': result}) - -# Tripartite (methods) -def make_fetch(self, key): return data -def make_compute(self, key, data): return result -def make_insert(self, key, result): self.insert1(...) -``` - - ---- -## File: reference/specs/codec-api.md - -# Codec API Specification - -This document specifies the DataJoint Codec API for creating custom attribute types. -For the complete type system architecture (core types, built-in codecs, storage modes), -see the [Type System Specification](type-system.md). - -## Overview - -Codecs define bidirectional conversion between Python objects and database storage. -They enable storing complex data types (graphs, models, custom formats) while -maintaining DataJoint's query capabilities. - -```mermaid -flowchart LR - A["Python Object
(e.g. Graph)"] -- encode --> B["Storage Type
(e.g. bytes)"] - B -- decode --> A -``` - -## Two Patterns for Custom Codecs - -There are two approaches for creating custom codecs: - -| Pattern | When to Use | Base Class | -|---------|-------------|------------| -| **Type Chaining** | Transform Python objects, use existing storage | `dj.Codec` | -| **SchemaCodec Subclassing** | Custom file formats with schema-addressed paths | `dj.SchemaCodec` | - -### Pattern 1: Type Chaining (Most Common) - -Chain to an existing codec for storage. Your codec transforms objects; the chained codec handles storage. - -```python -import datajoint as dj -import networkx as nx - -class GraphCodec(dj.Codec): - """Store NetworkX graphs.""" - - name = "graph" # Use as in definitions - - def get_dtype(self, is_store: bool) -> str: - return "" # Delegate to blob for serialization - - def encode(self, graph, *, key=None, store_name=None): - return { - 'nodes': list(graph.nodes(data=True)), - 'edges': list(graph.edges(data=True)), - } - - def decode(self, stored, *, key=None): - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G - -# Use in table definition -@schema -class Connectivity(dj.Manual): - definition = ''' - conn_id : int - --- - network : # in-table storage - network_ext : # object store - ''' -``` - -### Pattern 2: SchemaCodec Subclassing (File Formats) - -For custom file formats that need schema-addressed storage paths. - -```python -import datajoint as dj - -class ParquetCodec(dj.SchemaCodec): - """Store DataFrames as Parquet files.""" - - name = "parquet" - - # get_dtype inherited: returns "json", requires @ - - def encode(self, df, *, key=None, store_name=None): - import io - schema, table, field, pk = self._extract_context(key) - path, _ = self._build_path(schema, table, field, pk, ext=".parquet") - backend = self._get_backend(store_name) - - buffer = io.BytesIO() - df.to_parquet(buffer) - backend.put_buffer(buffer.getvalue(), path) - - return {"path": path, "store": store_name, "shape": list(df.shape)} - - def decode(self, stored, *, key=None): - return ParquetRef(stored, self._get_backend(stored.get("store"))) - -# Use in table definition (store only) -@schema -class Results(dj.Manual): - definition = ''' - result_id : int - --- - data : - ''' -``` - -## The Codec Base Class - -All custom codecs inherit from `dj.Codec`: - -```python -class Codec(ABC): - """Base class for codec types.""" - - name: str | None = None # Required: unique identifier - - def get_dtype(self, is_store: bool) -> str: - """Return the storage dtype.""" - raise NotImplementedError - - @abstractmethod - def encode(self, value, *, key=None, store_name=None) -> Any: - """Encode Python value for storage.""" - ... - - @abstractmethod - def decode(self, stored, *, key=None) -> Any: - """Decode stored value back to Python.""" - ... - - def validate(self, value) -> None: - """Optional: validate value before encoding.""" - pass -``` - -## The SchemaCodec Base Class - -For schema-addressed storage (file formats), inherit from `dj.SchemaCodec`: - -```python -class SchemaCodec(Codec, register=False): - """Base class for schema-addressed codecs.""" - - def get_dtype(self, is_store: bool) -> str: - """Store only, returns 'json'.""" - if not is_store: - raise DataJointError(f"<{self.name}> requires @ (store only)") - return "json" - - def _extract_context(self, key: dict) -> tuple[str, str, str, dict]: - """Parse key into (schema, table, field, primary_key).""" - ... 
- - def _build_path(self, schema, table, field, pk, ext=None) -> tuple[str, str]: - """Build schema-addressed path: {schema}/{table}/{pk}/{field}{ext}""" - ... - - def _get_backend(self, store_name: str = None): - """Get storage backend by name.""" - ... -``` - -## Required Components - -### 1. The `name` Attribute - -The `name` class attribute is a unique identifier used in table definitions with -`` syntax: - -```python -class MyCodec(dj.Codec): - name = "mycodec" # Use as in definitions -``` - -Naming conventions: -- Use lowercase with underscores: `spike_train`, `graph_embedding` -- Avoid generic names that might conflict: prefer `lab_model` over `model` -- Names must be unique across all registered codecs - -### 2. The `get_dtype()` Method - -Returns the underlying storage type. The `is_store` parameter indicates whether -the `@` modifier is present in the table definition: - -```python -def get_dtype(self, is_store: bool) -> str: - """ - Args: - is_store: True if @ modifier present (e.g., ) - - Returns: - - A core type: "bytes", "json", "varchar(N)", "int32", etc. - - Another codec: "", "", etc. - - Raises: - DataJointError: If store not supported but @ is present - """ -``` - -Examples: - -```python -# Simple: always store as bytes -def get_dtype(self, is_store: bool) -> str: - return "bytes" - -# Different behavior for in-table/store -def get_dtype(self, is_store: bool) -> str: - return "" if is_store else "bytes" - -# Store-only codec -def get_dtype(self, is_store: bool) -> str: - if not is_store: - raise DataJointError(" requires @ (store only)") - return "json" -``` - -### 3. The `encode()` Method - -Converts Python objects to the format expected by `get_dtype()`: - -```python -def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> Any: - """ - Args: - value: The Python object to store - key: Primary key values (for context-dependent encoding) - store_name: Target store name (for external storage) - - Returns: - Value in the format expected by get_dtype() - """ -``` - -### 4. The `decode()` Method - -Converts stored values back to Python objects: - -```python -def decode(self, stored: Any, *, key: dict | None = None) -> Any: - """ - Args: - stored: Data retrieved from storage - key: Primary key values (for context-dependent decoding) - - Returns: - The reconstructed Python object - """ -``` - -### 5. The `validate()` Method (Optional) - -Called automatically before `encode()` during INSERT operations: - -```python -def validate(self, value: Any) -> None: - """ - Args: - value: The value to validate - - Raises: - TypeError: If the value has an incompatible type - ValueError: If the value fails domain validation - """ - if not isinstance(value, ExpectedType): - raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") -``` - -## Auto-Registration - -Codecs automatically register when their class is defined. No decorator needed: - -```python -# This codec is registered automatically when the class is defined -class MyCodec(dj.Codec): - name = "mycodec" - # ... -``` - -### Skipping Registration - -For abstract base classes that shouldn't be registered: - -```python -class BaseCodec(dj.Codec, register=False): - """Abstract base - not registered.""" - name = None # Or omit entirely - -class ConcreteCodec(BaseCodec): - name = "concrete" # This one IS registered - # ... -``` - -### Registration Timing - -Codecs are registered at class definition time. 
Ensure your codec classes are -imported before any table definitions that use them: - -```python -# myproject/codecs.py -class GraphCodec(dj.Codec): - name = "graph" - ... - -# myproject/tables.py -import myproject.codecs # Ensure codecs are registered - -@schema -class Networks(dj.Manual): - definition = ''' - id : int - --- - network : - ''' -``` - -## Codec Composition (Chaining) - -Codecs can delegate to other codecs by returning `` from `get_dtype()`. -This enables layered functionality: - -```python -class CompressedJsonCodec(dj.Codec): - """Compress JSON data with zlib.""" - - name = "zjson" - - def get_dtype(self, is_store: bool) -> str: - return "" # Delegate serialization to blob codec - - def encode(self, value, *, key=None, store_name=None): - import json, zlib - json_bytes = json.dumps(value).encode('utf-8') - return zlib.compress(json_bytes) - - def decode(self, stored, *, key=None): - import json, zlib - json_bytes = zlib.decompress(stored) - return json.loads(json_bytes.decode('utf-8')) -``` - -### How Chaining Works - -When DataJoint encounters ``: - -1. Calls `ZjsonCodec.get_dtype(is_store=False)` → returns `""` -2. Calls `BlobCodec.get_dtype(is_store=False)` → returns `"bytes"` -3. Final storage type is `bytes` (LONGBLOB in MySQL) - -During INSERT: -1. `ZjsonCodec.encode()` converts Python dict → compressed bytes -2. `BlobCodec.encode()` packs bytes → DJ blob format -3. Stored in database - -During FETCH: -1. Read from database -2. `BlobCodec.decode()` unpacks DJ blob → compressed bytes -3. `ZjsonCodec.decode()` decompresses → Python dict - -### Built-in Codec Chains - -DataJoint's built-in codecs form these chains: - -| Codec | Chain | Final Storage | -|-------|-------|---------------| -| `` | `` → `bytes` | Inline | -| `` | `` → `` → `json` | Store (hash-addressed) | -| `` | `` → `bytes` | Inline | -| `` | `` → `` → `json` | Store (hash-addressed) | -| `` | `` → `json` | Store only (hash-addressed) | -| `` | `` → `json` | Store only (schema-addressed) | -| `` | `` → `json` | Store only (schema-addressed) | -| `` | `` → `json` | Store only (external ref) | - -### Store Name Propagation - -When using object storage (`@`), the store name propagates through the chain: - -```python -# Table definition -data : - -# Resolution: -# 1. MyCodec.get_dtype(is_store=True) → "" -# 2. BlobCodec.get_dtype(is_store=True) → "" -# 3. HashCodec.get_dtype(is_store=True) → "json" -# 4. store_name="coldstore" passed to HashCodec.encode() -``` - -## Plugin System (Entry Points) - -Codecs can be distributed as installable packages using Python entry points. 
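Once such a package is installed, the codec can be exercised directly, independent of any table definition. A minimal sanity check, assuming the example `dj-graph-codecs` package developed below and its `graph` codec:

```python
import datajoint as dj
import networkx as nx

# Requesting the codec by name triggers lazy entry-point discovery.
codec = dj.get_codec("graph")

# Round-trip check at the codec level: decode(encode(g)) restores the graph.
g = nx.karate_club_graph()
restored = codec.decode(codec.encode(g))
assert set(restored.edges) == set(g.edges)
```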
- -### Package Structure - -``` -dj-graph-codecs/ -├── pyproject.toml -└── src/ - └── dj_graph_codecs/ - ├── __init__.py - └── codecs.py -``` - -### pyproject.toml - -```toml -[project] -name = "dj-graph-codecs" -version = "1.0.0" -dependencies = ["datajoint>=2.0", "networkx"] - -[project.entry-points."datajoint.codecs"] -graph = "dj_graph_codecs.codecs:GraphCodec" -weighted_graph = "dj_graph_codecs.codecs:WeightedGraphCodec" -``` - -### Codec Implementation - -```python -# src/dj_graph_codecs/codecs.py -import datajoint as dj -import networkx as nx - -class GraphCodec(dj.Codec): - name = "graph" - - def get_dtype(self, is_store: bool) -> str: - return "" - - def encode(self, graph, *, key=None, store_name=None): - return { - 'nodes': list(graph.nodes(data=True)), - 'edges': list(graph.edges(data=True)), - } - - def decode(self, stored, *, key=None): - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G - -class WeightedGraphCodec(dj.Codec): - name = "weighted_graph" - - def get_dtype(self, is_store: bool) -> str: - return "" - - def encode(self, graph, *, key=None, store_name=None): - return { - 'nodes': list(graph.nodes(data=True)), - 'edges': [(u, v, d) for u, v, d in graph.edges(data=True)], - } - - def decode(self, stored, *, key=None): - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - for u, v, d in stored['edges']: - G.add_edge(u, v, **d) - return G -``` - -### Usage After Installation - -```bash -pip install dj-graph-codecs -``` - -```python -# Codecs are automatically discovered and available -@schema -class Networks(dj.Manual): - definition = ''' - network_id : int - --- - topology : - weights : - ''' -``` - -### Entry Point Discovery - -DataJoint loads entry points lazily when a codec is first requested: - -1. Check explicit registry (codecs defined in current process) -2. Load entry points from `datajoint.codecs` group -3. Also checks legacy `datajoint.types` group for compatibility - -## API Reference - -### Module Functions - -```python -import datajoint as dj - -# List all registered codec names -dj.list_codecs() # Returns: ['blob', 'hash', 'object', 'attach', 'filepath', ...] - -# Get a codec instance by name -codec = dj.get_codec("blob") -codec = dj.get_codec("") # Angle brackets are optional -codec = dj.get_codec("") # Store parameter is stripped -``` - -### Internal Functions (for advanced use) - -```python -from datajoint.codecs import ( - is_codec_registered, # Check if codec exists - unregister_codec, # Remove codec (testing only) - resolve_dtype, # Resolve codec chain - parse_type_spec, # Parse "" syntax -) -``` - -## Built-in Codecs - -DataJoint provides these built-in codecs. See the [Type System Specification](type-system.md) for detailed behavior and implementation. - -| Codec | Inline | Store | Addressing | Description | -|-------|--------|-------|------------|-------------| -| `` | `bytes` | `` | Hash | DataJoint serialization for Python objects | -| `` | `bytes` | `` | Hash | File attachments with filename preserved | -| `` | N/A | `json` | Hash | Hash-addressed storage with MD5 deduplication | -| `` | N/A | `json` | Schema | Schema-addressed storage for files/folders | -| `` | N/A | `json` | Schema | Schema-addressed storage for numpy arrays | -| `` | N/A | `json` | External | Reference to existing files in store | - -**Addressing schemes:** -- **Hash-addressed**: Path from content hash. Automatic deduplication. -- **Schema-addressed**: Path mirrors database structure. One location per entity. 
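To make the contrast concrete, a sketch of a table that stores one attribute through the hash-addressed chain and another through a schema-addressed codec (the store name `coldstore` and the attribute names are assumptions for illustration):

```python
@schema
class Recording(dj.Manual):
    definition = '''
    recording_id : int
    ---
    features : <blob@coldstore>    # hash-addressed: deduplicated, path derived from content hash
    raw_data : <object@coldstore>  # schema-addressed: one browsable path per entity
    '''
```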
- -## Complete Examples - -### Example 1: Simple Serialization - -```python -import datajoint as dj -import numpy as np - -class SpikeTrainCodec(dj.Codec): - """Efficient storage for sparse spike timing data.""" - - name = "spike_train" - - def get_dtype(self, is_store: bool) -> str: - return "" - - def validate(self, value): - if not isinstance(value, np.ndarray): - raise TypeError("Expected numpy array of spike times") - if value.ndim != 1: - raise ValueError("Spike train must be 1-dimensional") - if len(value) > 1 and not np.all(np.diff(value) >= 0): - raise ValueError("Spike times must be sorted") - - def encode(self, spike_times, *, key=None, store_name=None): - # Store as differences (smaller values, better compression) - return np.diff(spike_times, prepend=0).astype(np.float32) - - def decode(self, stored, *, key=None): - # Reconstruct original spike times - return np.cumsum(stored).astype(np.float64) -``` - -### Example 2: External Storage - -```python -import datajoint as dj -import pickle - -class ModelCodec(dj.Codec): - """Store ML models with optional external storage.""" - - name = "model" - - def get_dtype(self, is_store: bool) -> str: - # Use hash-addressed storage for large models - return "" if is_store else "" - - def encode(self, model, *, key=None, store_name=None): - return pickle.dumps(model, protocol=pickle.HIGHEST_PROTOCOL) - - def decode(self, stored, *, key=None): - return pickle.loads(stored) - - def validate(self, value): - # Check that model has required interface - if not hasattr(value, 'predict'): - raise TypeError("Model must have a predict() method") -``` - -Usage: -```python -@schema -class Models(dj.Manual): - definition = ''' - model_id : int - --- - small_model : # Internal storage - large_model : # External (default store) - archive_model : # External (specific store) - ''' -``` - -### Example 3: JSON with Schema Validation - -```python -import datajoint as dj -import jsonschema - -class ConfigCodec(dj.Codec): - """Store validated JSON configuration.""" - - name = "config" - - SCHEMA = { - "type": "object", - "properties": { - "version": {"type": "integer", "minimum": 1}, - "settings": {"type": "object"}, - }, - "required": ["version", "settings"], - } - - def get_dtype(self, is_store: bool) -> str: - return "json" - - def validate(self, value): - jsonschema.validate(value, self.SCHEMA) - - def encode(self, config, *, key=None, store_name=None): - return config # JSON type handles serialization - - def decode(self, stored, *, key=None): - return stored -``` - -### Example 4: Context-Dependent Encoding - -```python -import datajoint as dj - -class VersionedDataCodec(dj.Codec): - """Handle different encoding versions based on primary key.""" - - name = "versioned" - - def get_dtype(self, is_store: bool) -> str: - return "" - - def encode(self, value, *, key=None, store_name=None): - version = key.get("schema_version", 1) if key else 1 - if version >= 2: - return {"v": 2, "data": self._encode_v2(value)} - return {"v": 1, "data": self._encode_v1(value)} - - def decode(self, stored, *, key=None): - version = stored.get("v", 1) - if version >= 2: - return self._decode_v2(stored["data"]) - return self._decode_v1(stored["data"]) - - def _encode_v1(self, value): - return value - - def _decode_v1(self, data): - return data - - def _encode_v2(self, value): - # New encoding format - return {"optimized": True, "payload": value} - - def _decode_v2(self, data): - return data["payload"] -``` - -### Example 5: External-Only Codec - -```python -import datajoint as dj 
-from pathlib import Path - -class ZarrCodec(dj.Codec): - """Store Zarr arrays in object storage.""" - - name = "zarr" - - def get_dtype(self, is_store: bool) -> str: - if not is_store: - raise dj.DataJointError(" requires @ (external storage only)") - return "" # Delegate to object storage - - def encode(self, value, *, key=None, store_name=None): - import zarr - import tempfile - - # If already a path, pass through - if isinstance(value, (str, Path)): - return str(value) - - # If zarr array, save to temp and return path - if isinstance(value, zarr.Array): - tmpdir = tempfile.mkdtemp() - path = Path(tmpdir) / "data.zarr" - zarr.save(path, value) - return str(path) - - raise TypeError(f"Expected zarr.Array or path, got {type(value)}") - - def decode(self, stored, *, key=None): - # ObjectCodec returns ObjectRef, use its fsmap for zarr - import zarr - return zarr.open(stored.fsmap, mode='r') -``` - -## Best Practices - -### 1. Choose Appropriate Storage Types - -| Data Type | Recommended `get_dtype()` | -|-----------|---------------------------| -| Python objects (dicts, arrays) | `""` | -| Large binary data | `""` (external) | -| Files/folders (Zarr, HDF5) | `""` (external) | -| Simple JSON-serializable | `"json"` | -| Short strings | `"varchar(N)"` | -| Numeric identifiers | `"int32"`, `"int64"` | - -### 2. Handle None Values - -Nullable columns may pass `None` to your codec: - -```python -def encode(self, value, *, key=None, store_name=None): - if value is None: - return None # Pass through for nullable columns - return self._actual_encode(value) - -def decode(self, stored, *, key=None): - if stored is None: - return None - return self._actual_decode(stored) -``` - -### 3. Test Round-Trips - -Always verify that `decode(encode(x)) == x`: - -```python -def test_codec_roundtrip(): - codec = MyCodec() - - test_values = [ - {"key": "value"}, - [1, 2, 3], - np.array([1.0, 2.0]), - ] - - for original in test_values: - encoded = codec.encode(original) - decoded = codec.decode(encoded) - assert decoded == original or np.array_equal(decoded, original) -``` - -### 4. Include Validation - -Catch errors early with `validate()`: - -```python -def validate(self, value): - if not isinstance(value, ExpectedType): - raise TypeError(f"Expected ExpectedType, got {type(value).__name__}") - - if not self._is_valid(value): - raise ValueError("Value fails validation constraints") -``` - -### 5. Document Expected Formats - -Include docstrings explaining input/output formats: - -```python -class MyCodec(dj.Codec): - """ - Store MyType objects. - - Input format (encode): - MyType instance with attributes: x, y, z - - Storage format: - Dict with keys: 'x', 'y', 'z' - - Output format (decode): - MyType instance reconstructed from storage - """ -``` - -### 6. 
Consider Versioning - -If your encoding format might change: - -```python -def encode(self, value, *, key=None, store_name=None): - return { - "_version": 2, - "_data": self._encode_v2(value), - } - -def decode(self, stored, *, key=None): - version = stored.get("_version", 1) - data = stored.get("_data", stored) - - if version == 1: - return self._decode_v1(data) - return self._decode_v2(data) -``` - -## Error Handling - -### Common Errors - -| Error | Cause | Solution | -|-------|-------|----------| -| `Unknown codec: ` | Codec not registered | Import module defining codec before table definition | -| `Codec already registered` | Duplicate name | Use unique names; check for conflicts | -| ` requires @` | External-only codec used without @ | Add `@` or `@store` to attribute type | -| `Circular codec reference` | Codec chain forms a loop | Check `get_dtype()` return values | - -### Debugging - -```python -# Check what codecs are registered -print(dj.list_codecs()) - -# Inspect a codec -codec = dj.get_codec("mycodec") -print(f"Name: {codec.name}") -print(f"Internal dtype: {codec.get_dtype(is_store=False)}") -print(f"External dtype: {codec.get_dtype(is_store=True)}") - -# Resolve full chain -from datajoint.codecs import resolve_dtype -final_type, chain, store = resolve_dtype("") -print(f"Final storage type: {final_type}") -print(f"Codec chain: {[c.name for c in chain]}") -print(f"Store: {store}") -``` - - ---- -## File: reference/specs/data-manipulation.md - -# DataJoint Data Manipulation Specification - -## Overview - -This document specifies data manipulation operations in DataJoint Python: insert, update, and delete. These operations maintain referential integrity across the pipeline while supporting the **workflow normalization** paradigm. - -## 1. Workflow Normalization Philosophy - -### 1.1 Insert and Delete as Primary Operations - -DataJoint pipelines are designed around **insert** and **delete** as the primary data manipulation operations: - -``` -Insert: Add complete entities (rows) to tables -Delete: Remove entities and all dependent data (cascading) -``` - -This design maintains referential integrity at the **entity level**—each row represents a complete, self-consistent unit of data. - -### 1.2 Updates as Surgical Corrections - -**Updates are intentionally limited** to the `update1()` method, which modifies a single row at a time. This is by design: - -- Updates bypass the normal workflow -- They can create inconsistencies with derived data -- They should be used sparingly for **corrective operations** - -**Appropriate uses of update1():** -- Fixing data entry errors -- Correcting metadata after the fact -- Administrative annotations - -**Inappropriate uses:** -- Regular workflow operations -- Batch modifications -- Anything that should trigger recomputation - -### 1.3 The Recomputation Pattern - -When source data changes, the correct pattern is: - -```python -# 1. Delete the incorrect data (cascades to all derived tables) -(SourceTable & {"key": value}).delete() - -# 2. Insert the corrected data -SourceTable.insert1(corrected_row) - -# 3. Recompute derived tables -DerivedTable.populate() -``` - -This ensures all derived data remains consistent with its sources. - ---- - -## 2. 
Insert Operations - -### 2.1 `insert()` Method - -**Signature:** -```python -def insert( - self, - rows, - replace=False, - skip_duplicates=False, - ignore_extra_fields=False, - allow_direct_insert=None, - chunk_size=None, -) -``` - -**Parameters:** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `rows` | iterable | — | Data to insert | -| `replace` | bool | `False` | Replace existing rows with matching PK | -| `skip_duplicates` | bool | `False` | Silently skip duplicate keys | -| `ignore_extra_fields` | bool | `False` | Ignore fields not in table | -| `allow_direct_insert` | bool | `None` | Allow insert into auto-populated tables | -| `chunk_size` | int | `None` | Insert in batches of this size | - -### 2.2 Accepted Input Formats - -| Format | Example | -|--------|---------| -| List of dicts | `[{"id": 1, "name": "Alice"}, ...]` | -| pandas DataFrame | `pd.DataFrame({"id": [1, 2], "name": ["A", "B"]})` | -| polars DataFrame | `pl.DataFrame({"id": [1, 2], "name": ["A", "B"]})` | -| numpy structured array | `np.array([(1, "A")], dtype=[("id", int), ("name", "U10")])` | -| QueryExpression | `OtherTable.proj(...)` (INSERT...SELECT) | -| Path to CSV | `Path("data.csv")` | - -### 2.3 Basic Usage - -```python -# Single row -Subject.insert1({"subject_id": 1, "name": "Mouse001", "dob": "2024-01-15"}) - -# Multiple rows -Subject.insert([ - {"subject_id": 1, "name": "Mouse001", "dob": "2024-01-15"}, - {"subject_id": 2, "name": "Mouse002", "dob": "2024-01-16"}, -]) - -# From DataFrame -df = pd.DataFrame({"subject_id": [1, 2], "name": ["M1", "M2"], "dob": ["2024-01-15", "2024-01-16"]}) -Subject.insert(df) - -# From query (INSERT...SELECT) -ActiveSubjects.insert(Subject & "status = 'active'") -``` - -### 2.4 Handling Duplicates - -```python -# Error on duplicate (default) -Subject.insert1({"subject_id": 1, ...}) # Raises DuplicateError if exists - -# Skip duplicates silently -Subject.insert(rows, skip_duplicates=True) - -# Replace existing rows -Subject.insert(rows, replace=True) -``` - -**Difference between skip and replace:** -- `skip_duplicates`: Keeps existing row unchanged -- `replace`: Overwrites existing row with new values - -### 2.5 Extra Fields - -```python -# Error on extra fields (default) -Subject.insert1({"subject_id": 1, "unknown_field": "x"}) # Raises error - -# Ignore extra fields -Subject.insert1({"subject_id": 1, "unknown_field": "x"}, ignore_extra_fields=True) -``` - -### 2.6 Auto-Populated Tables - -Computed and Imported tables normally only accept inserts from their `make()` method: - -```python -# Raises DataJointError by default -ComputedTable.insert1({"key": 1, "result": 42}) - -# Explicit override -ComputedTable.insert1({"key": 1, "result": 42}, allow_direct_insert=True) -``` - -### 2.7 Chunked Insertion - -For large datasets, insert in batches: - -```python -# Insert 10,000 rows at a time -Subject.insert(large_dataset, chunk_size=10000) -``` - -Each chunk is a separate transaction. If interrupted, completed chunks persist. - -### 2.8 `insert1()` Method - -Convenience wrapper for single-row inserts: - -```python -def insert1(self, row, **kwargs) -``` - -Equivalent to `insert((row,), **kwargs)`. 
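Because `insert()` accepts any iterable of row dictionaries (2.2), a large load can be streamed from a generator and batched with `chunk_size` (2.7) so the full dataset is never materialized in memory. A sketch, assuming a generator is acceptable wherever an iterable of rows is, and a CSV file whose columns match the table:

```python
import csv

def subject_rows(path):
    """Yield one row dict per CSV record instead of loading the whole file."""
    with open(path, newline="") as f:
        for rec in csv.DictReader(f):
            yield {
                "subject_id": int(rec["subject_id"]),
                "name": rec["name"],
                "dob": rec["dob"],
            }

# Each batch of 5,000 rows commits as its own transaction;
# completed chunks persist if the load is interrupted.
Subject.insert(subject_rows("subjects.csv"), chunk_size=5000)
```

Passing a `Path` to a CSV directly is also supported (2.2); a generator is useful when rows need parsing or type conversion on the way in.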
- -### 2.9 Staged Insert for Large Objects - -For large objects (Zarr arrays, HDF5 files), use staged insert to write directly to object storage: - -```python -with table.staged_insert1 as staged: - # Set primary key and metadata - staged.rec["session_id"] = 123 - staged.rec["timestamp"] = datetime.now() - - # Write large data directly to storage - zarr_path = staged.store("raw_data", ".zarr") - z = zarr.open(zarr_path, mode="w") - z[:] = large_array - staged.rec["raw_data"] = z - -# Row automatically inserted on successful exit -# Storage cleaned up if exception occurs -``` - ---- - -## 3. Update Operations - -### 3.1 `update1()` Method - -**Signature:** -```python -def update1(self, row: dict) -> None -``` - -**Parameters:** -- `row`: Dictionary containing all primary key values plus attributes to update - -### 3.2 Basic Usage - -```python -# Update a single attribute -Subject.update1({"subject_id": 1, "name": "NewName"}) - -# Update multiple attributes -Subject.update1({ - "subject_id": 1, - "name": "NewName", - "notes": "Updated on 2024-01-15" -}) -``` - -### 3.3 Requirements - -1. **Complete primary key**: All PK attributes must be provided -2. **Exactly one match**: Must match exactly one existing row -3. **No restrictions**: Cannot call on restricted table - -```python -# Error: incomplete primary key -Subject.update1({"name": "NewName"}) - -# Error: row doesn't exist -Subject.update1({"subject_id": 999, "name": "Ghost"}) - -# Error: cannot update restricted table -(Subject & "subject_id > 10").update1({...}) -``` - -### 3.4 Resetting to Default - -Setting an attribute to `None` resets it to its default value: - -```python -# Reset 'notes' to its default (NULL if nullable) -Subject.update1({"subject_id": 1, "notes": None}) -``` - -### 3.5 When to Use Updates - -**Appropriate:** -```python -# Fix a typo in metadata -Subject.update1({"subject_id": 1, "name": "Mouse001"}) # Was "Mous001" - -# Add a note to an existing record -Session.update1({"session_id": 5, "notes": "Excluded from analysis"}) -``` - -**Inappropriate (use delete + insert + populate instead):** -```python -# DON'T: Update source data that affects computed results -Trial.update1({"trial_id": 1, "stimulus": "new_stim"}) # Computed tables now stale! - -# DO: Delete and recompute -(Trial & {"trial_id": 1}).delete() # Cascades to computed tables -Trial.insert1({"trial_id": 1, "stimulus": "new_stim"}) -ComputedResults.populate() -``` - -### 3.6 Why No Bulk Update? - -DataJoint intentionally does not provide `update()` for multiple rows: - -1. **Consistency**: Bulk updates easily create inconsistencies with derived data -2. **Auditability**: Single-row updates are explicit and traceable -3. **Workflow**: The insert/delete pattern maintains referential integrity - -If you need to update many rows, iterate explicitly: - -```python -for key in (Subject & condition).keys(): - Subject.update1({**key, "status": "archived"}) -``` - ---- - -## 4. 
Delete Operations - -### 4.1 `delete()` Method - -**Signature:** -```python -def delete( - self, - transaction: bool = True, - prompt: bool | None = None, - part_integrity: str = "enforce", -) -> int -``` - -**Parameters:** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `transaction` | bool | `True` | Wrap in atomic transaction | -| `prompt` | bool | `None` | Prompt for confirmation (default: config setting) | -| `part_integrity` | str | `"enforce"` | Master-part integrity policy (see below) | - -**`part_integrity` values:** - -| Value | Behavior | -|-------|----------| -| `"enforce"` | Error if parts would be deleted without masters | -| `"ignore"` | Allow deleting parts without masters (breaks integrity) | -| `"cascade"` | Also delete masters when parts are deleted | - -**Returns:** Number of deleted rows from the primary table. - -### 4.2 Cascade Behavior - -Delete automatically cascades to all dependent tables: - -```python -# Deleting a subject deletes all their sessions, trials, and computed results -(Subject & {"subject_id": 1}).delete() -``` - -**Cascade order:** -1. Identify all tables with foreign keys referencing target -2. Recursively delete matching rows in child tables -3. Delete rows in target table - -### 4.3 Basic Usage - -```python -# Delete specific rows -(Subject & {"subject_id": 1}).delete() - -# Delete matching a condition -(Session & "session_date < '2024-01-01'").delete() - -# Delete all rows (use with caution!) -Subject.delete() -``` - -### 4.4 Safe Mode - -When `prompt=True` (default from config): - -``` -About to delete: - Subject: 1 rows - Session: 5 rows - Trial: 150 rows - ProcessedData: 150 rows - -Commit deletes? [yes, No]: -``` - -Disable for automated scripts: - -```python -Subject.delete(prompt=False) -``` - -### 4.5 Transaction Control - -```python -# Atomic delete (default) - all or nothing -(Subject & condition).delete(transaction=True) - -# Non-transactional (for nested transactions) -(Subject & condition).delete(transaction=False) -``` - -### 4.6 Part Table Constraints - -Cannot delete from part tables without deleting from master (by default): - -```python -# Error: cannot delete part without master -Session.Recording.delete() - -# Allow breaking master-part integrity -Session.Recording.delete(part_integrity="ignore") - -# Delete parts AND cascade up to delete master -Session.Recording.delete(part_integrity="cascade") -``` - -**`part_integrity` parameter:** - -| Value | Behavior | -|-------|----------| -| `"enforce"` | (default) Error if parts would be deleted without masters | -| `"ignore"` | Allow deleting parts without masters (breaks integrity) | -| `"cascade"` | Also delete masters when parts are deleted (maintains integrity) | - -### 4.7 `delete_quick()` Method - -Fast delete without cascade or confirmation: - -```python -def delete_quick(self, get_count: bool = False) -> int | None -``` - -**Use cases:** -- Internal cleanup -- Tables with no dependents -- When you've already handled dependencies - -**Behavior:** -- No cascade to child tables -- No user confirmation -- Fails on FK constraint violation - -```python -# Quick delete (fails if has dependents) -(TempTable & condition).delete_quick() - -# Get count of deleted rows -n = (TempTable & condition).delete_quick(get_count=True) -``` - ---- - -## 5. 
Validation - -### 5.1 `validate()` Method - -Pre-validate rows before insertion: - -```python -def validate(self, rows, *, ignore_extra_fields=False) -> ValidationResult -``` - -**Returns:** `ValidationResult` with: -- `is_valid`: Boolean indicating all rows passed -- `errors`: List of (row_idx, field_name, error_message) -- `rows_checked`: Number of rows validated - -### 5.2 Usage - -```python -result = Subject.validate(rows) - -if result: - Subject.insert(rows) -else: - print(result.summary()) - # Row 3, field 'dob': Invalid date format - # Row 7, field 'subject_id': Missing required field -``` - -### 5.3 Validations Performed - -| Check | Description | -|-------|-------------| -| Field existence | All fields must exist in table | -| NULL constraints | Required fields must have values | -| Primary key completeness | All PK fields must be present | -| UUID format | Valid UUID string or object | -| JSON serializability | JSON fields must be serializable | -| Codec validation | Custom type validation via codecs | - -### 5.4 Limitations - -These constraints are only checked at database level: -- Foreign key references -- Unique constraints (beyond PK) -- Custom CHECK constraints - ---- - -## 6. Part Tables - -### 6.1 Inserting into Part Tables - -Part tables are inserted via their master: - -```python -@schema -class Session(dj.Manual): - definition = """ - session_id : int - --- - date : date - """ - - class Recording(dj.Part): - definition = """ - -> master - recording_id : int - --- - duration : float - """ - -# Insert master with parts -Session.insert1({"session_id": 1, "date": "2024-01-15"}) -Session.Recording.insert([ - {"session_id": 1, "recording_id": 1, "duration": 60.0}, - {"session_id": 1, "recording_id": 2, "duration": 45.5}, -]) -``` - -### 6.2 Deleting with Part Tables - -Deleting master cascades to parts: - -```python -# Deletes session AND all its recordings -(Session & {"session_id": 1}).delete() -``` - -Cannot delete parts independently (by default): - -```python -# Error -Session.Recording.delete() - -# Allow breaking master-part integrity -Session.Recording.delete(part_integrity="ignore") - -# Or cascade up to also delete master -Session.Recording.delete(part_integrity="cascade") -``` - ---- - -## 7. Transaction Handling - -### 7.1 Implicit Transactions - -Single operations are atomic: - -```python -Subject.insert1(row) # Atomic -Subject.update1(row) # Atomic -Subject.delete() # Atomic (by default) -``` - -### 7.2 Explicit Transactions - -For multi-table operations: - -```python -with dj.conn().transaction: - Parent.insert1(parent_row) - Child.insert(child_rows) - # Commits on successful exit - # Rolls back on exception -``` - -### 7.3 Chunked Inserts and Transactions - -With `chunk_size`, each chunk is a separate transaction: - -```python -# Each chunk of 1000 rows commits independently -Subject.insert(large_dataset, chunk_size=1000) -``` - -If interrupted, completed chunks persist. - ---- - -## 8. 
Error Handling - -### 8.1 Common Errors - -| Error | Cause | Resolution | -|-------|-------|------------| -| `DuplicateError` | Primary key already exists | Use `skip_duplicates=True` or `replace=True` | -| `IntegrityError` | Foreign key constraint violated | Insert parent rows first | -| `MissingAttributeError` | Required field not provided | Include all required fields | -| `UnknownAttributeError` | Field not in table | Use `ignore_extra_fields=True` or fix field name | -| `DataJointError` | Various validation failures | Check error message for details | - -### 8.2 Error Recovery Pattern - -```python -try: - Subject.insert(rows) -except dj.errors.DuplicateError as e: - # Handle specific duplicate - print(f"Duplicate: {e}") -except dj.errors.IntegrityError as e: - # Missing parent reference - print(f"Missing parent: {e}") -except dj.DataJointError as e: - # Other DataJoint errors - print(f"Error: {e}") -``` - ---- - -## 9. Best Practices - -### 9.1 Prefer Insert/Delete Over Update - -```python -# Good: Delete and reinsert -(Trial & key).delete() -Trial.insert1(corrected_trial) -DerivedTable.populate() - -# Avoid: Update that creates stale derived data -Trial.update1({**key, "value": new_value}) # Derived tables now inconsistent! -``` - -### 9.2 Validate Before Insert - -```python -result = Subject.validate(rows) -if not result: - raise ValueError(result.summary()) -Subject.insert(rows) -``` - -### 9.3 Use Transactions for Related Inserts - -```python -with dj.conn().transaction: - session_key = Session.insert1(session_data, skip_duplicates=True) - Session.Recording.insert(recordings) - Session.Stimulus.insert(stimuli) -``` - -### 9.4 Batch Inserts for Performance - -```python -# Good: Single insert call -Subject.insert(all_rows) - -# Avoid: Loop of insert1 calls -for row in all_rows: - Subject.insert1(row) # Slow! -``` - -### 9.5 Safe Deletion in Production - -```python -# Always use prompt in interactive sessions -(Subject & condition).delete(prompt=True) - -# Disable only in tested automated scripts -(Subject & condition).delete(prompt=False) -``` - ---- - -## 10. Quick Reference - -| Operation | Method | Cascades | Transaction | Typical Use | -|-----------|--------|----------|-------------|-------------| -| Insert one | `insert1()` | — | Implicit | Adding single entity | -| Insert many | `insert()` | — | Per-chunk | Bulk data loading | -| Insert large object | `staged_insert1` | — | On exit | Zarr, HDF5 files | -| Update one | `update1()` | — | Implicit | Surgical corrections | -| Delete | `delete()` | Yes | Optional | Removing entities | -| Delete quick | `delete_quick()` | No | No | Internal cleanup | -| Validate | `validate()` | — | — | Pre-insert check | - - ---- -## File: reference/specs/fetch-api.md - -# DataJoint 2.0 Fetch API Specification - -## Overview - -DataJoint 2.0 replaces the complex `fetch()` method with a set of explicit, composable output methods. This provides better discoverability, clearer intent, and more efficient iteration. - -## Design Principles - -1. **Explicit over implicit**: Each output format has its own method -2. **Composable**: Use existing `.proj()` for column selection -3. **Lazy iteration**: Single cursor streaming instead of fetch-all-keys -4. 
**Modern formats**: First-class support for polars and Arrow - ---- - -## New API Reference - -### Output Methods - -| Method | Returns | Description | -|--------|---------|-------------| -| `to_dicts()` | `list[dict]` | All rows as list of dictionaries | -| `to_pandas()` | `DataFrame` | pandas DataFrame with primary key as index | -| `to_polars()` | `polars.DataFrame` | polars DataFrame (requires `datajoint[polars]`) | -| `to_arrow()` | `pyarrow.Table` | PyArrow Table (requires `datajoint[arrow]`) | -| `to_arrays()` | `np.ndarray` | numpy structured array (recarray) | -| `to_arrays('a', 'b')` | `tuple[array, array]` | Tuple of arrays for specific columns | -| `keys()` | `list[dict]` | Primary key values only | -| `fetch1()` | `dict` | Single row as dict (raises if not exactly 1) | -| `fetch1('a', 'b')` | `tuple` | Single row attribute values | -| `head(limit=25)` | `list[dict]` | Preview first N entries | -| `tail(limit=25)` | `list[dict]` | Preview last N entries | -| `cursor(as_dict=False)` | `cursor` | Raw database cursor for manual iteration | - -### Common Parameters - -All output methods accept these optional parameters: - -```python -table.to_dicts( - order_by=None, # str or list: column(s) to sort by, e.g. "KEY", "name DESC" - limit=None, # int: maximum rows to return - offset=None, # int: rows to skip - squeeze=False, # bool: remove singleton dimensions from arrays -) -``` - -For external storage types (attachments, filepaths), files are downloaded to `config["download_path"]`. Use `config.override()` to change: - -```python -with dj.config.override(download_path="/data"): - data = table.to_dicts() -``` - -### Iteration - -```python -# Lazy streaming - yields one dict per row from database cursor -for row in table: - process(row) # row is a dict -``` - ---- - -## Migration Guide - -### Basic Fetch Operations - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `table.fetch()` | `table.to_arrays()` or `table.to_dicts()` | -| `table.fetch(format="array")` | `table.to_arrays()` | -| `table.fetch(format="frame")` | `table.to_pandas()` | -| `table.fetch(as_dict=True)` | `table.to_dicts()` | - -### Attribute Fetching - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `table.fetch('a')` | `table.to_arrays('a')` | -| `a, b = table.fetch('a', 'b')` | `a, b = table.to_arrays('a', 'b')` | -| `table.fetch('a', 'b', as_dict=True)` | `table.proj('a', 'b').to_dicts()` | - -### Primary Key Fetching - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `table.fetch('KEY')` | `table.keys()` | -| `table.fetch(dj.key)` | `table.keys()` | -| `keys, a = table.fetch('KEY', 'a')` | See note below | - -For mixed KEY + attribute fetch: -```python -# Old: keys, a = table.fetch('KEY', 'a') -# New: Combine keys() with to_arrays() -keys = table.keys() -a = table.to_arrays('a') -# Or use to_dicts() which includes all columns -``` - -### Ordering, Limiting, Offset - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `table.fetch(order_by='name')` | `table.to_arrays(order_by='name')` | -| `table.fetch(limit=10)` | `table.to_arrays(limit=10)` | -| `table.fetch(order_by='KEY', limit=10, offset=5)` | `table.to_arrays(order_by='KEY', limit=10, offset=5)` | - -### Single Row Fetch (fetch1) - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `table.fetch1()` | `table.fetch1()` (unchanged) | -| `a, b = table.fetch1('a', 'b')` | 
`a, b = table.fetch1('a', 'b')` (unchanged) | -| `table.fetch1('KEY')` | `table.fetch1()` then extract pk columns | - -### Configuration - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `dj.config['fetch_format'] = 'frame'` | Use `.to_pandas()` explicitly | -| `with dj.config.override(fetch_format='frame'):` | Use `.to_pandas()` in the block | - -### Iteration - -| Old Pattern (1.x) | New Pattern (2.0) | -|-------------------|-------------------| -| `for row in table:` | `for row in table:` (same syntax, now lazy!) | -| `list(table)` | `table.to_dicts()` | - -### Column Selection with proj() - -Use `.proj()` for column selection, then apply output method: - -```python -# Select specific columns -table.proj('col1', 'col2').to_pandas() -table.proj('col1', 'col2').to_dicts() - -# Computed columns -table.proj(total='price * quantity').to_pandas() -``` - ---- - -## Removed Features - -### Removed Methods and Parameters - -- `fetch()` method - use explicit output methods -- `fetch('KEY')` - use `keys()` -- `dj.key` class - use `keys()` method -- `format=` parameter - use explicit methods -- `as_dict=` parameter - use `to_dicts()` -- `config['fetch_format']` setting - use explicit methods - -### Removed Imports - -```python -# Old (removed) -from datajoint import key -result = table.fetch(dj.key) - -# New -result = table.keys() -``` - ---- - -## Examples - -### Example 1: Basic Data Retrieval - -```python -# Get all data as DataFrame -df = Experiment().to_pandas() - -# Get all data as list of dicts -rows = Experiment().to_dicts() - -# Get all data as numpy array -arr = Experiment().to_arrays() -``` - -### Example 2: Filtered and Sorted Query - -```python -# Get recent experiments, sorted by date -recent = (Experiment() & 'date > "2024-01-01"').to_pandas( - order_by='date DESC', - limit=100 -) -``` - -### Example 3: Specific Columns - -```python -# Fetch specific columns as arrays -names, dates = Experiment().to_arrays('name', 'date') - -# Or with primary key included -names, dates = Experiment().to_arrays('name', 'date', include_key=True) -``` - -### Example 4: Primary Keys for Iteration - -```python -# Get keys for restriction -keys = Experiment().keys() -for key in keys: - process(Session() & key) -``` - -### Example 5: Single Row - -```python -# Get one row as dict -row = (Experiment() & key).fetch1() - -# Get specific attributes -name, date = (Experiment() & key).fetch1('name', 'date') -``` - -### Example 6: Lazy Iteration - -```python -# Stream rows efficiently (single database cursor) -for row in Experiment(): - if should_process(row): - process(row) - if done: - break # Early termination - no wasted fetches -``` - -### Example 7: Modern DataFrame Libraries - -```python -# Polars (fast, modern) -import polars as pl -df = Experiment().to_polars() -result = df.filter(pl.col('value') > 100).group_by('category').agg(pl.mean('value')) - -# PyArrow (zero-copy interop) -table = Experiment().to_arrow() -# Can convert to pandas or polars with zero copy -``` - ---- - -## Performance Considerations - -### Lazy Iteration - -The new iteration is significantly more efficient: - -```python -# Old (1.x): N+1 queries -# 1. fetch("KEY") gets ALL keys -# 2. fetch1() for EACH key - -# New (2.0): Single query -# Streams rows from one cursor -for row in table: - ... 
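    # Breaking out of the loop ends the stream early;
    # rows that were never reached are never fetched from the server.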
-``` - -### Memory Efficiency - -- `to_dicts()`: Returns full list in memory -- `for row in table:`: Streams one row at a time -- `to_arrays(limit=N)`: Fetches only N rows - -### Format Selection - -| Use Case | Recommended Method | -|----------|-------------------| -| Data analysis | `to_pandas()` or `to_polars()` | -| JSON API responses | `to_dicts()` | -| Numeric computation | `to_arrays()` | -| Large datasets | `for row in table:` (streaming) | -| Interop with other tools | `to_arrow()` | - ---- - -## Error Messages - -When attempting to use removed methods, users see helpful error messages: - -```python ->>> table.fetch() -AttributeError: fetch() has been removed in DataJoint 2.0. -Use to_dicts(), to_pandas(), to_arrays(), or keys() instead. -See table.fetch.__doc__ for details. -``` - ---- - -## Optional Dependencies - -Install optional dependencies for additional output formats: - -```bash -# For polars support -pip install datajoint[polars] - -# For PyArrow support -pip install datajoint[arrow] - -# For both -pip install datajoint[polars,arrow] -``` - - ---- -## File: reference/specs/index.md - -# Specifications - -Formal specifications of DataJoint's data model and behavior. - -These documents define how DataJoint works at a detailed level. They serve as -authoritative references for: - -- Understanding exact behavior of operations -- Implementing compatible tools and extensions -- Debugging complex scenarios - -## How to Use These Specifications - -**If you're new to DataJoint:** -Start with the [tutorials](../../tutorials/index.md) and [how-to guides](../../how-to/index.md) before diving into specifications. Specs are technical references, not learning materials. - -**If you're implementing features:** -Use specs as authoritative sources for behavior. Start with dependencies (see below) and work up to your target specification. - -**If you're debugging:** -Specs clarify exact behavior when documentation or examples are ambiguous. - -## Reading Order - -### Foundation (Start Here) - -1. [Table Declaration](table-declaration.md) — How to define tables -2. [Primary Keys](primary-keys.md) — Key propagation rules -3. [Type System](type-system.md) — Three-layer type architecture - -**Next:** Choose based on your needs: -- **Working with data?** → Data Operations -- **Building queries?** → Query Algebra -- **Using large data?** → Object Storage - -### Query Algebra - -**Prerequisites:** Table Declaration, Primary Keys - -1. [Query Operators](query-algebra.md) — Restrict, proj, join, aggr, union -2. [Semantic Matching](semantic-matching.md) — Attribute lineage -3. [Fetch API](fetch-api.md) — Data retrieval - -### Data Operations - -**Prerequisites:** Table Declaration - -1. [Data Manipulation](data-manipulation.md) — Insert, update, delete -2. [AutoPopulate](autopopulate.md) — Jobs 2.0 system -3. [Job Metadata](job-metadata.md) — Hidden job tracking columns - -### Object Storage - -**Prerequisites:** Type System - -1. [Object Store Configuration](object-store-configuration.md) — Store setup -2. [Codec API](codec-api.md) — Custom type implementation -3. [`` Codec](npy-codec.md) — NumPy array storage - -### Advanced Topics - -1. [Master-Part Relationships](master-part.md) — Compositional modeling -2. [Virtual Schemas](virtual-schemas.md) — Schema introspection without source - -## Document Structure - -Each specification follows a consistent structure: - -1. **Overview** — What this specifies -2. **User Guide** — Practical usage -3. **API Reference** — Methods and signatures -4. 
**Concepts** — Definitions and rules -5. **Implementation Details** — Internal behavior -6. **Examples** — Concrete code samples -7. **Best Practices** — Recommendations - -## Specifications by Topic - -### Schema Definition - -| Specification | Prerequisites | Related How-To | Related Explanation | -|---------------|---------------|----------------|---------------------| -| [Table Declaration](table-declaration.md) | None | [Define Tables](../../how-to/define-tables.md) | [Relational Workflow Model](../../explanation/relational-workflow-model.md) | -| [Master-Part Relationships](master-part.md) | Table Declaration | [Model Relationships](../../how-to/model-relationships.ipynb) | [Data Pipelines](../../explanation/data-pipelines.md) | -| [Virtual Schemas](virtual-schemas.md) | Table Declaration | — | — | - -**Key concepts:** Table tiers (Manual, Lookup, Imported, Computed, Part), foreign keys, dependency graphs, compositional modeling - ---- - -### Query Algebra - -| Specification | Prerequisites | Related How-To | Related Explanation | -|---------------|---------------|----------------|---------------------| -| [Query Operators](query-algebra.md) | Table Declaration, Primary Keys | [Query Data](../../how-to/query-data.md) | [Query Algebra](../../explanation/query-algebra.md) | -| [Semantic Matching](semantic-matching.md) | Query Operators | [Model Relationships](../../how-to/model-relationships.ipynb) | [Query Algebra](../../explanation/query-algebra.md) | -| [Primary Keys](primary-keys.md) | Table Declaration | [Design Primary Keys](../../how-to/design-primary-keys.md) | [Entity Integrity](../../explanation/entity-integrity.md) | -| [Fetch API](fetch-api.md) | Query Operators | [Fetch Results](../../how-to/fetch-results.md) | — | - -**Key concepts:** Restriction (`&`, `-`), projection (`.proj()`), join (`*`), aggregation (`.aggr()`), union, universal set (`U()`), attribute lineage - ---- - -### Type System - -| Specification | Prerequisites | Related How-To | Related Explanation | -|---------------|---------------|----------------|---------------------| -| [Type System](type-system.md) | None | [Choose a Storage Type](../../how-to/choose-storage-type.md) | [Type System](../../explanation/type-system.md) | -| [Codec API](codec-api.md) | Type System | [Create Custom Codec](../../how-to/create-custom-codec.md) | [Custom Codecs](../../explanation/custom-codecs.md) | -| [`` Codec](npy-codec.md) | Type System | [Use Object Storage](../../how-to/use-object-storage.md) | — | - -**Key concepts:** Native types (MySQL), core types (portable), codec types (Python objects), in-table vs object storage, addressing schemes - ---- - -### Object Storage - -| Specification | Prerequisites | Related How-To | Related Explanation | -|---------------|---------------|----------------|---------------------| -| [Object Store Configuration](object-store-configuration.md) | Type System | [Configure Object Storage](../../how-to/configure-storage.md) | [Data Pipelines (OAS)](../../explanation/data-pipelines.md#object-augmented-schemas) | - -**Key concepts:** Hash-addressed storage (deduplication), schema-addressed storage (browsable paths), filepath storage (user-managed), store configuration, path generation - ---- - -### Data Operations - -| Specification | Prerequisites | Related How-To | Related Explanation | -|---------------|---------------|----------------|---------------------| -| [Data Manipulation](data-manipulation.md) | Table Declaration | [Insert Data](../../how-to/insert-data.md) | 
[Normalization](../../explanation/normalization.md) | -| [AutoPopulate](autopopulate.md) | Table Declaration, Data Manipulation | [Run Computations](../../how-to/run-computations.md), [Distributed Computing](../../how-to/distributed-computing.md) | [Computation Model](../../explanation/computation-model.md) | -| [Job Metadata](job-metadata.md) | AutoPopulate | [Handle Errors](../../how-to/handle-errors.md) | [Computation Model](../../explanation/computation-model.md) | - -**Key concepts:** Insert patterns, transactional integrity, workflow normalization, Jobs 2.0, job coordination, populate(), make() method, job states - - - ---- -## File: reference/specs/job-metadata.md - -# Hidden Job Metadata in Computed Tables - -## Overview - -Job execution metadata (start time, duration, code version) should be persisted in computed tables themselves, not just in ephemeral job entries. This is accomplished using hidden attributes. - -## Motivation - -The current job table (`~~table_name`) tracks execution metadata, but: -1. Job entries are deleted after completion (unless `keep_completed=True`) -2. Users often need to know when and with what code version each row was computed -3. This metadata should be transparent - not cluttering the user-facing schema - -Hidden attributes (prefixed with `_`) provide the solution: stored in the database but filtered from user-facing APIs. - -## Hidden Job Metadata Attributes - -| Attribute | Type | Description | -|-----------|------|-------------| -| `_job_start_time` | datetime(3) | When computation began | -| `_job_duration` | float32 | Computation duration in seconds | -| `_job_version` | varchar(64) | Code version (e.g., git commit hash) | - -**Design notes:** -- `_job_duration` (elapsed time) rather than `_job_completed_time` because duration is more informative for performance analysis -- `varchar(64)` for version is sufficient for git hashes (40 chars for SHA-1, 7-8 for short hash) -- `datetime(3)` provides millisecond precision - -## Configuration - -### Settings Structure - -Job metadata is controlled via `config.jobs` settings: - -```python -class JobsSettings(BaseSettings): - """Job queue configuration for AutoPopulate 2.0.""" - - model_config = SettingsConfigDict( - env_prefix="DJ_JOBS_", - case_sensitive=False, - extra="forbid", - validate_assignment=True, - ) - - # Existing settings - auto_refresh: bool = Field(default=True, ...) - keep_completed: bool = Field(default=False, ...) - stale_timeout: int = Field(default=3600, ...) - default_priority: int = Field(default=5, ...) - version_method: Literal["git", "none"] | None = Field(default=None, ...) - allow_new_pk_fields_in_computed_tables: bool = Field(default=False, ...) - - # New setting for hidden job metadata - add_job_metadata: bool = Field( - default=False, - description="Add hidden job metadata attributes (_job_start_time, _job_duration, _job_version) " - "to Computed and Imported tables during declaration. Tables created without this setting " - "will not receive metadata updates during populate." 
- ) -``` - -### Access Patterns - -```python -import datajoint as dj - -# Read setting -dj.config.jobs.add_job_metadata # False (default) - -# Enable programmatically -dj.config.jobs.add_job_metadata = True - -# Enable via environment variable -# DJ_JOBS_ADD_JOB_METADATA=true - -# Enable in config file (dj_config.yaml) -# jobs: -# add_job_metadata: true - -# Temporary override -with dj.config.override(jobs={"add_job_metadata": True}): - schema(MyComputedTable) # Declared with metadata columns -``` - -### Setting Interactions - -| Setting | Effect on Job Metadata | -|---------|----------------------| -| `add_job_metadata=True` | New Computed/Imported tables get hidden metadata columns | -| `add_job_metadata=False` | Tables declared without metadata columns (default) | -| `version_method="git"` | `_job_version` populated with git short hash | -| `version_method="none"` | `_job_version` left empty | -| `version_method=None` | `_job_version` left empty (same as "none") | - -### Behavior at Declaration vs Populate - -| `add_job_metadata` at declare | `add_job_metadata` at populate | Result | -|------------------------------|-------------------------------|--------| -| True | True | Metadata columns created and populated | -| True | False | Metadata columns exist but not populated | -| False | True | No metadata columns, populate skips silently | -| False | False | No metadata columns, normal behavior | - -### Retrofitting Existing Tables - -Tables created before enabling `add_job_metadata` do not have the hidden metadata columns. -To add metadata columns to existing tables, use the migration utility (not automatic): - -```python -from datajoint.migrate import add_job_metadata_columns - -# Add hidden metadata columns to specific table -add_job_metadata_columns(MyComputedTable) - -# Add to all Computed/Imported tables in a schema -add_job_metadata_columns(schema) -``` - -This utility: -- ALTERs the table to add the three hidden columns -- Does NOT populate existing rows (metadata remains NULL) -- Future `populate()` calls will populate metadata for new rows - -## Behavior - -### Declaration-time - -When `config.jobs.add_job_metadata=True` and a Computed/Imported table is declared: -- Hidden metadata columns are added to the table definition -- Only master tables receive metadata columns; Part tables never get them - -### Population-time - -After `make()` completes successfully: -1. Check if the table has hidden metadata columns -2. If yes: UPDATE the just-inserted rows with start_time, duration, version -3. If no: Silently skip (no error, no ALTER) - -This applies to both: -- **Direct mode** (`reserve_jobs=False`): Single-process populate -- **Distributed mode** (`reserve_jobs=True`): Multi-worker with job table coordination - -## Excluding Hidden Attributes from Binary Operators - -### Problem Statement - -If two tables have hidden attributes with the same name (e.g., both have `_job_start_time`), SQL's NATURAL JOIN would incorrectly match on them: - -```sql --- NATURAL JOIN matches ALL common attributes including hidden -SELECT * FROM table_a NATURAL JOIN table_b --- Would incorrectly match on _job_start_time! -``` - -### Solution: Replace NATURAL JOIN with USING Clause - -Hidden attributes must be excluded from all binary operator considerations. The result of a join does not preserve hidden attributes from its operands. 
- -**Current implementation:** -```python -def from_clause(self): - clause = next(support) - for s, left in zip(support, self._left): - clause += " NATURAL{left} JOIN {clause}".format(...) -``` - -**Proposed implementation:** -```python -def from_clause(self): - clause = next(support) - for s, (left, using_attrs) in zip(support, self._joins): - if using_attrs: - using = "USING ({})".format(", ".join(f"`{a}`" for a in using_attrs)) - clause += " {left}JOIN {s} {using}".format( - left="LEFT " if left else "", - s=s, - using=using - ) - else: - # Cross join (no common non-hidden attributes) - clause += " CROSS JOIN " + s if not left else " LEFT JOIN " + s + " ON TRUE" - return clause -``` - -### Changes Required - -#### 1. `QueryExpression._left` → `QueryExpression._joins` - -Replace `_left: List[bool]` with `_joins: List[Tuple[bool, List[str]]]` - -Each join stores: -- `left`: Whether it's a left join -- `using_attrs`: Non-hidden common attributes to join on - -```python -# Before -result._left = self._left + [left] + other._left - -# After -join_attributes = [n for n in self.heading.names if n in other.heading.names] -result._joins = self._joins + [(left, join_attributes)] + other._joins -``` - -#### 2. `heading.names` (existing behavior) - -Already filters out hidden attributes: -```python -@property -def names(self): - return [k for k in self.attributes] # attributes excludes is_hidden=True -``` - -This ensures join attribute computation automatically excludes hidden attributes. - -### Behavior Summary - -| Scenario | Hidden Attributes | Result | -|----------|-------------------|--------| -| `A * B` (join) | Same hidden attr in both | NOT matched - excluded from USING | -| `A & B` (restriction) | Same hidden attr in both | NOT matched | -| `A - B` (anti-restriction) | Same hidden attr in both | NOT matched | -| `A.proj()` | Hidden attrs in A | NOT projected (unless explicitly named) | -| `A.to_dicts()` | Hidden attrs in A | NOT returned by default | - -## Implementation Details - -### 1. Declaration (declare.py) - -```python -def declare(full_table_name, definition, context): - # ... existing code ... - - # Add hidden job metadata for auto-populated tables - if config.jobs.add_job_metadata and table_tier in (TableTier.COMPUTED, TableTier.IMPORTED): - # Only for master tables, not parts - if not is_part_table: - job_metadata_sql = [ - "`_job_start_time` datetime(3) DEFAULT NULL", - "`_job_duration` float DEFAULT NULL", - "`_job_version` varchar(64) DEFAULT ''", - ] - attribute_sql.extend(job_metadata_sql) -``` - -### 2. Population (autopopulate.py) - -```python -def _populate1(self, key, callback, use_jobs, jobs): - start_time = datetime.now() - version = _get_job_version() - - # ... call make() ... 
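    # (hedged note: if make() raises, the enclosing transaction rolls back,
    #  so no rows were inserted and no metadata update is attempted)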
- - duration = time.time() - start_time.timestamp() - - # Update job metadata if table has the hidden attributes - if self._has_job_metadata_attrs(): - self._update_job_metadata( - key, - start_time=start_time, - duration=duration, - version=version - ) - -def _has_job_metadata_attrs(self): - """Check if table has hidden job metadata columns.""" - hidden_attrs = self.heading._attributes # includes hidden - return '_job_start_time' in hidden_attrs - -def _update_job_metadata(self, key, start_time, duration, version): - """Update hidden job metadata for the given key.""" - # UPDATE using primary key - pk_condition = make_condition(self, key, set()) - self.connection.query( - f"UPDATE {self.full_table_name} SET " - f"`_job_start_time`=%s, `_job_duration`=%s, `_job_version`=%s " - f"WHERE {pk_condition}", - args=(start_time, duration, version[:64]) - ) -``` - -### 3. Job table (jobs.py) - -Update version field length: -```python -version="" : varchar(64) -``` - -### 4. Version helper - -```python -def _get_job_version() -> str: - """Get version string, truncated to 64 chars.""" - from .settings import config - - method = config.jobs.version_method - if method is None or method == "none": - return "" - elif method == "git": - try: - result = subprocess.run( - ["git", "rev-parse", "--short", "HEAD"], - capture_output=True, - text=True, - timeout=5, - ) - return result.stdout.strip()[:64] if result.returncode == 0 else "" - except Exception: - return "" - return "" -``` - -## Example Usage - -```python -# Enable job metadata for new tables -dj.config.jobs.add_job_metadata = True - -@schema -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - result : float - """ - - def make(self, key): - # User code - unaware of hidden attributes - self.insert1({**key, 'result': compute(key)}) - -# Job metadata automatically added and populated: -# _job_start_time, _job_duration, _job_version - -# User-facing API unaffected: -ProcessedData().heading.names # ['raw_data_id', 'result'] -ProcessedData().to_dicts() # Returns only visible attributes - -# Access hidden attributes explicitly if needed: -ProcessedData().to_arrays('_job_start_time', '_job_duration', '_job_version') -``` - -## Summary of Design Decisions - -| Decision | Resolution | -|----------|------------| -| Configuration | `config.jobs.add_job_metadata` (default False) | -| Environment variable | `DJ_JOBS_ADD_JOB_METADATA` | -| Existing tables | No automatic ALTER - silently skip metadata if columns absent | -| Retrofitting | Manual via `datajoint.migrate.add_job_metadata_columns()` utility | -| Populate modes | Record metadata in both direct and distributed modes | -| Part tables | No metadata columns - only master tables | -| Version length | varchar(64) in both jobs table and computed tables | -| Binary operators | Hidden attributes excluded via USING clause instead of NATURAL JOIN | -| Failed makes | N/A - transaction rolls back, no rows to update | - - ---- -## File: reference/specs/master-part.md - -# Master-Part Relationships Specification - -## Overview - -Master-Part relationships model compositional data where a master entity contains multiple detail records. Part tables provide a way to store variable-length, structured data associated with each master entity while maintaining strict referential integrity. - ---- - -## 1. 
Definition - -### 1.1 Master Table - -Any table class (`Manual`, `Lookup`, `Imported`, `Computed`) can serve as a master: - -```python -@schema -class Session(dj.Manual): - definition = """ - subject_id : varchar(16) - session_idx : uint8 - --- - session_date : date - """ -``` - -### 1.2 Part Table - -Part tables are nested classes inheriting from `dj.Part`: - -```python -@schema -class Session(dj.Manual): - definition = """ - subject_id : varchar(16) - session_idx : uint8 - --- - session_date : date - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 - --- - stimulus : varchar(32) - response : varchar(32) - """ -``` - -### 1.3 SQL Naming - -| Python | SQL Table Name | -|--------|----------------| -| `Session` | `schema`.`session` | -| `Session.Trial` | `schema`.`session__trial` | - -Part tables use double underscore (`__`) separator in SQL. - -### 1.4 Master Reference - -Within a Part definition, reference the master using: - -```python --> master # lowercase keyword (preferred) --> Session # explicit class name -``` - -The `-> master` reference: -- Automatically inherits master's primary key -- Creates foreign key constraint to master -- Enforces ON DELETE RESTRICT (by default) - ---- - -## 2. Integrity Constraints - -### 2.1 Compositional Integrity - -Master-Part relationships enforce **compositional integrity**: - -1. **Existence**: Parts cannot exist without their master -2. **Cohesion**: Parts should be deleted/dropped with their master -3. **Atomicity**: Master and parts form a logical unit - -### 2.2 Foreign Key Behavior - -Part tables have implicit foreign key to master: - -```sql -FOREIGN KEY (master_pk) REFERENCES master_table (master_pk) -ON UPDATE CASCADE -ON DELETE RESTRICT -``` - -The `ON DELETE RESTRICT` prevents orphaned parts at the database level. - ---- - -## 3. Insert Operations - -### 3.1 Master-First Insertion - -Master must exist before inserting parts: - -```python -# Insert master -Session.insert1({ - 'subject_id': 'M001', - 'session_idx': 1, - 'session_date': '2026-01-08' -}) - -# Insert parts -Session.Trial.insert([ - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 1, 'stimulus': 'A', 'response': 'left'}, - {'subject_id': 'M001', 'session_idx': 1, 'trial_idx': 2, 'stimulus': 'B', 'response': 'right'}, -]) -``` - -### 3.2 Atomic Insertion - -For atomic master+parts insertion, use transactions: - -```python -with dj.conn().transaction: - Session.insert1(master_data) - Session.Trial.insert(trials_data) -``` - -### 3.3 Computed Tables with Parts - -In `make()` methods, use `self.insert1()` for master and `self.PartName.insert()` for parts: - -```python -class ProcessedSession(dj.Computed): - definition = """ - -> Session - --- - n_trials : uint16 - """ - - class TrialResult(dj.Part): - definition = """ - -> master - -> Session.Trial - --- - score : float32 - """ - - def make(self, key): - trials = (Session.Trial & key).fetch() - results = process(trials) - - self.insert1({**key, 'n_trials': len(trials)}) - self.TrialResult.insert(results) -``` - ---- - -## 4. Delete Operations - -### 4.1 Cascade from Master - -Deleting from master cascades to parts: - -```python -# Deletes session AND all its trials -(Session & {'subject_id': 'M001', 'session_idx': 1}).delete() -``` - -### 4.2 Part Integrity Parameter - -Direct deletion from Part tables is controlled by `part_integrity`: - -```python -def delete(self, part_integrity: str = "enforce", ...) 
-> int -``` - -| Value | Behavior | -|-------|----------| -| `"enforce"` | (default) Error if parts deleted without masters | -| `"ignore"` | Allow deleting parts without masters (breaks integrity) | -| `"cascade"` | Also delete masters when parts are deleted | - -### 4.3 Default Behavior (enforce) - -```python -# Error: Cannot delete from Part directly -Session.Trial.delete() -# DataJointError: Cannot delete from a Part directly. -# Delete from master instead, or use part_integrity='ignore' -# to break integrity, or part_integrity='cascade' to also delete master. -``` - -### 4.4 Breaking Integrity (ignore) - -```python -# Allow direct part deletion (master retains incomplete parts) -(Session.Trial & {'trial_idx': 1}).delete(part_integrity="ignore") -``` - -**Use cases:** -- Removing specific invalid trials -- Partial data cleanup -- Testing/debugging - -**Warning:** This leaves masters with incomplete part data. - -### 4.5 Cascade to Master (cascade) - -```python -# Delete parts AND their masters -(Session.Trial & condition).delete(part_integrity="cascade") -``` - -**Behavior:** -- Identifies affected masters -- Deletes masters (which cascades to ALL their parts) -- Maintains compositional integrity - -### 4.6 Behavior Matrix - -| Operation | Result | -|-----------|--------| -| `Master.delete()` | Deletes master + all parts | -| `Part.delete()` | Error (default) | -| `Part.delete(part_integrity="ignore")` | Deletes parts only | -| `Part.delete(part_integrity="cascade")` | Deletes parts + masters | - ---- - -## 5. Drop Operations - -### 5.1 Drop Master - -Dropping a master table also drops all its part tables: - -```python -Session.drop() # Drops Session AND Session.Trial -``` - -### 5.2 Drop Part Directly - -Part tables cannot be dropped directly by default: - -```python -Session.Trial.drop() -# DataJointError: Cannot drop a Part directly. Drop master instead, -# or use part_integrity='ignore' to force. - -# Override with part_integrity="ignore" -Session.Trial.drop(part_integrity="ignore") -``` - -**Note:** `part_integrity="cascade"` is not supported for drop (too destructive). - -### 5.3 Schema Drop - -Dropping schema drops all tables including masters and parts: - -```python -schema.drop(prompt=False) -``` - ---- - -## 6. Query Operations - -### 6.1 Accessing Parts - -```python -# From master class -Session.Trial - -# From master instance -session = Session() -session.Trial -``` - -### 6.2 Joining Master and Parts - -```python -# All trials with session info -Session * Session.Trial - -# Filtered -(Session & {'subject_id': 'M001'}) * Session.Trial -``` - -### 6.3 Aggregating Parts - -```python -# Count trials per session -Session.aggr(Session.Trial, n_trials='count(trial_idx)') - -# Statistics -Session.aggr( - Session.Trial, - n_trials='count(trial_idx)', - n_correct='sum(response = stimulus)' -) -``` - ---- - -## 7. Best Practices - -### 7.1 When to Use Part Tables - -**Good use cases:** -- Trials within sessions -- Electrodes within probes -- Cells within imaging fields -- Frames within videos -- Rows within files - -**Avoid when:** -- Parts have independent meaning (use regular FK instead) -- Need to query parts without master context -- Parts reference multiple masters - -### 7.2 Naming Conventions - -```python -class Master(dj.Manual): - class Detail(dj.Part): # Singular, descriptive - ... - class Items(dj.Part): # Or plural for collections - ... 
-``` - -### 7.3 Part Primary Keys - -Include minimal additional keys beyond master reference: - -```python -class Session(dj.Manual): - definition = """ - session_id : uint32 - --- - ... - """ - - class Trial(dj.Part): - definition = """ - -> master - trial_idx : uint16 # Only trial-specific key - --- - ... - """ -``` - -### 7.4 Avoiding Deep Nesting - -Part tables cannot have their own parts. For hierarchical data: - -```python -# Instead of nested parts, use separate tables with FKs -@schema -class Session(dj.Manual): - definition = """...""" - class Trial(dj.Part): - definition = """...""" - -@schema -class TrialEvent(dj.Manual): # Not a Part, but references Trial - definition = """ - -> Session.Trial - event_idx : uint8 - --- - event_time : float32 - """ -``` - ---- - -## 8. Implementation Reference - -| File | Purpose | -|------|---------| -| `user_tables.py` | Part class definition | -| `table.py` | delete() with part_integrity | -| `schemas.py` | Part table decoration | -| `declare.py` | Part table SQL generation | - ---- - -## 9. Error Messages - -| Error | Cause | Solution | -|-------|-------|----------| -| "Cannot delete from Part directly" | Called Part.delete() with part_integrity="enforce" | Delete from master, or use part_integrity="ignore" or "cascade" | -| "Cannot drop Part directly" | Called Part.drop() with part_integrity="enforce" | Drop master table, or use part_integrity="ignore" | -| "Attempt to delete part before master" | Cascade would delete part without master | Use part_integrity="ignore" or "cascade" | - - ---- -## File: reference/specs/npy-codec.md - -# `` Codec Specification - -Schema-addressed storage for numpy arrays as portable `.npy` files. - -## Overview - -The `` codec stores numpy arrays as standard `.npy` files using -schema-addressed paths that mirror the database structure. On fetch, it returns -`NpyRef`—a lazy reference that provides metadata access without downloading, -and transparent numpy integration via the `__array__` protocol. - -**Key characteristics:** - -- **Store only**: Requires `@` modifier (`` or ``) -- **Schema-addressed**: Paths mirror database structure (`{schema}/{table}/{pk}/{attr}.npy`) -- **Lazy loading**: Shape/dtype available without download -- **Transparent**: Use directly in numpy operations -- **Portable**: Standard `.npy` format readable by numpy, MATLAB, etc. 
- -## Quick Start - -```python -import datajoint as dj -import numpy as np - -@schema -class Recording(dj.Manual): - definition = """ - recording_id : int - --- - waveform : - """ - -# Insert - just pass the array -Recording.insert1({ - 'recording_id': 1, - 'waveform': np.random.randn(1000, 32), -}) - -# Fetch - returns NpyRef (lazy) -ref = (Recording & 'recording_id=1').fetch1('waveform') - -# Metadata without download -ref.shape # (1000, 32) -ref.dtype # float64 - -# Use in numpy ops - downloads automatically -mean = np.mean(ref, axis=0) - -# Or load explicitly -arr = ref.load() -``` - -## NpyRef: Lazy Array Reference - -When you fetch an `` attribute, you get an `NpyRef` object: - -```python -ref = (Recording & key).fetch1('waveform') -type(ref) # -``` - -### Metadata Access (No I/O) - -```python -ref.shape # tuple: (1000, 32) -ref.dtype # numpy.dtype: float64 -ref.ndim # int: 2 -ref.size # int: 32000 -ref.nbytes # int: 256000 (estimated) -ref.path # str: "my_schema/recording/recording_id=1/waveform.npy" -ref.store # str or None: store name -ref.is_loaded # bool: False (until loaded) -``` - -### Loading Data - -**Explicit loading:** -```python -arr = ref.load() # Downloads, caches, returns np.ndarray -arr = ref.load() # Returns cached copy (no re-download) -``` - -**Implicit loading via `__array__`:** -```python -# These all trigger automatic download -result = ref + 1 -result = np.mean(ref) -result = np.dot(ref, weights) -arr = np.asarray(ref) -``` - -**Indexing/slicing:** -```python -first_row = ref[0] # Loads then indexes -subset = ref[100:200] # Loads then slices -``` - -### Memory Mapping - -For very large arrays, use `mmap_mode` to access data without loading it all: - -```python -# Memory-mapped loading (random access) -arr = ref.load(mmap_mode='r') - -# Efficient random access - only reads needed portions -slice = arr[1000:2000, :] -chunk = arr[::100] -``` - -**Modes:** -- `'r'` - Read-only (recommended) -- `'r+'` - Read-write (modifications persist) -- `'c'` - Copy-on-write (changes not saved) - -**Performance characteristics:** -- Local filesystem stores: memory-maps the file directly (zero-copy) -- Remote stores (S3, GCS): downloads to local cache first, then memory-maps - -**When to use:** -- Arrays too large to fit in memory -- Only need random access to portions of the array -- Processing data in chunks - -### Safe Bulk Fetch - -The lazy design protects against accidental mass downloads: - -```python -# Fetch 10,000 recordings - NO downloads happen yet -recs = Recording.fetch() - -# Inspect without downloading -for rec in recs: - ref = rec['waveform'] - print(f"Shape: {ref.shape}, dtype: {ref.dtype}") # No I/O - -# Download only what you need -large_arrays = [rec['waveform'] for rec in recs if rec['waveform'].shape[0] > 1000] -for ref in large_arrays: - process(ref.load()) # Downloads here -``` - -### Repr for Debugging - -```python ->>> ref -NpyRef(shape=(1000, 32), dtype=float64, not loaded) - ->>> ref.load() ->>> ref -NpyRef(shape=(1000, 32), dtype=float64, loaded) -``` - -## Table Definition - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : int - --- - waveform : # default store - spectrogram : # specific store - """ -``` - -## Storage Details - -### Addressing Scheme - -The `` codec uses **schema-addressed** storage, where paths mirror the -database schema structure. This creates a browsable organization in object -storage that reflects your data model. 
- -### Type Chain - -``` - → "json" (metadata stored in JSON column) -``` - -### File Format - -- Format: NumPy `.npy` (version 1.0 or 2.0 depending on array size) -- Encoding: `numpy.save()` with `allow_pickle=False` -- Extension: `.npy` - -### Schema-Addressed Path Construction - -``` -{schema}/{table}/{primary_key_values}/{attribute}.npy -``` - -Example: `lab_ephys/recording/recording_id=1/waveform.npy` - -This schema-addressed layout means you can browse the object store and understand -the organization because it mirrors your database schema. - -### JSON Metadata - -The database column stores: - -```json -{ - "path": "lab_ephys/recording/recording_id=1/waveform.npy", - "store": "main", - "dtype": "float64", - "shape": [1000, 32] -} -``` - -## Validation - -The codec validates on insert: - -- Value must be `numpy.ndarray` -- Array must not have `object` dtype - -```python -# Valid -Recording.insert1({'recording_id': 1, 'waveform': np.array([1, 2, 3])}) - -# Invalid - not an array -Recording.insert1({'recording_id': 1, 'waveform': [1, 2, 3]}) -# DataJointError: requires numpy.ndarray, got list - -# Invalid - object dtype -Recording.insert1({'recording_id': 1, 'waveform': np.array([{}, []])}) -# DataJointError: does not support object dtype arrays -``` - -## Direct File Access - -Files are stored at predictable paths and can be accessed directly: - -```python -# Get the storage path -ref = (Recording & 'recording_id=1').fetch1('waveform') -print(ref.path) # "my_schema/recording/recording_id=1/waveform.npy" - -# Load directly with numpy (if you have store access) -arr = np.load('/path/to/store/my_schema/recording/recording_id=1/waveform.npy') -``` - -## Comparison with Other Codecs - -| Codec | Format | Addressing | Lazy | Memory Map | Portability | -|-------|--------|------------|------|------------|-------------| -| `` | `.npy` | Schema | Yes (NpyRef) | Yes | High (numpy, MATLAB) | -| `` | varies | Schema | Yes (ObjectRef) | No | Depends on content | -| `` | pickle | Hash | No | No | Python only | -| `` | raw bytes | Hash | No | No | N/A | - -**Addressing schemes:** -- **Schema-addressed**: Path mirrors database structure. Browsable, one location per entity. -- **Hash-addressed**: Path from content hash. Automatic deduplication. - -## When to Use `` - -**Use `` when:** -- Storing single numpy arrays -- Interoperability matters (non-Python tools) -- You want lazy loading with metadata inspection -- Fetching many rows where not all arrays are needed -- Random access to large arrays via memory mapping -- Browsable object store organization is valuable - -**Use `` when:** -- Storing arbitrary Python objects (dicts, lists, mixed types) -- Arrays are small and eager loading is fine -- MATLAB compatibility with DataJoint's mYm format is needed -- Deduplication is beneficial (hash-addressed) - -**Use `` when:** -- Storing files/folders (Zarr, HDF5, multi-file outputs) -- Content is not a single numpy array - -## Limitations - -1. **Single array only**: For multiple arrays, use separate attributes or `` with `.npz` -2. **No compression**: For compressed storage, use a custom codec with `numpy.savez_compressed` -3. **No object dtype**: Arrays containing arbitrary Python objects are not supported -4. 
**Store only**: Cannot store in-table (database column) - -## See Also - -- [Type System Specification](type-system.md) - Complete type system overview -- [Codec API](codec-api.md) - Creating custom codecs -- [Object Storage](type-system.md#object--path-addressed-storage) - Path-addressed storage details - - ---- -## File: reference/specs/object-store-configuration.md - -# Object Store Configuration Specification - -This specification defines DataJoint's unified object store system, including store configuration, path generation algorithms, and storage models. - -## Overview - -DataJoint's Object-Augmented Schema (OAS) integrates relational tables with object storage as a single coherent system. Large data objects are stored in file systems or cloud storage while maintaining full referential integrity with the relational database. - -### Storage Models - -DataJoint 2.0 supports three storage models, all sharing the same store configuration: - -| Model | Data Types | Path Structure | Integration | Use Case | -|-------|------------|----------------|-------------|----------| -| **Hash-addressed** | ``, `` | Content-addressed by hash | **Integrated** (OAS) | Immutable data, automatic deduplication | -| **Schema-addressed** | ``, `` | Key-based hierarchical paths | **Integrated** (OAS) | Mutable data, streaming access, arrays | -| **Filepath** | `` | User-managed paths | **Reference** | User-managed files (no lifecycle management) | - -**Key distinction:** -- **Hash-addressed** and **schema-addressed** storage are **integrated** into the Object-Augmented Schema. DataJoint manages their lifecycle, paths, integrity, garbage collection, transaction safety, and deduplication. -- **Filepath** storage stores only the path string. DataJoint provides no lifecycle management, garbage collection, transaction safety, or deduplication. Users control file creation, organization, and lifecycle. - -**Legacy note:** DataJoint 0.14.x only supported hash-addressed (called "external") and filepath storage. Schema-addressed storage is new in 2.0. - -## Store Configuration - -### Minimal Configuration - -Every store requires two fields: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "file", - "location": "/data/my-project" - } - } -} -``` - -This creates a store named `main` and designates it as the default. 
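As a quick sanity check, the resolved spec for this minimal configuration can be inspected with `get_store_spec()` (the accessor shown under Validation and Testing below); the expected values are illustrative:

```python
import datajoint as dj

# Resolve the default store ("main") declared in the minimal configuration above
spec = dj.config.get_store_spec()
assert spec["protocol"] == "file"
assert spec["location"] == "/data/my-project"
```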
- -### Default Store - -DataJoint uses two default settings to reflect the architectural distinction between integrated and reference storage: - -#### stores.default — Integrated Storage (OAS) - -The `stores.default` setting determines which store is used for **integrated storage** (hash-addressed and schema-addressed) when no store is specified: - -```python -# These are equivalent when stores.default = "main" -signal : # Uses stores.default -signal : # Explicitly names store - -arrays : # Uses stores.default -arrays : # Explicitly names store -``` - -**Rules:** -- `stores.default` must be a string naming a configured store -- Required for ``, ``, ``, `` without explicit `@store` -- Each project typically uses one primary store for integrated data - -#### stores.filepath_default — Filepath References - -The `stores.filepath_default` setting determines which store is used for **filepath references** when no store is specified: - -```python -# These are equivalent when stores.filepath_default = "raw_data" -recording : # Uses stores.filepath_default -recording : # Explicitly names store -``` - -**Rules:** -- `stores.filepath_default` must be a string naming a configured store -- Required for `` without explicit store name -- Often configured differently from `stores.default` because filepath references are not part of OAS -- Users manage file lifecycle and organization - -**Why separate defaults?** - -Integrated storage (hash, schema) is managed by DataJoint as part of the Object-Augmented Schema—DataJoint controls paths, lifecycle, integrity, garbage collection, transaction safety, and deduplication. Filepath storage is user-managed—DataJoint only stores the path string and provides no lifecycle management, garbage collection, transaction safety, or deduplication. These are architecturally distinct, so they often use different storage locations and require separate defaults. - -### Complete Store Configuration - -A fully configured store specifying all sections: - -```json -{ - "stores": { - "default": "main", - "main": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "neuroscience-data", - "location": "lab-project-2024", - - "hash_prefix": "blobs", - "schema_prefix": "arrays", - "filepath_prefix": "imported", - - "subfolding": [2, 2], - "partition_pattern": "subject_id/session_date", - "token_length": 8 - } - } -} -``` - -### Section Prefixes - -Each store is divided into sections controlled by prefix configuration. The `*_prefix` parameters define the path prefix for each storage section: - -| Configuration Parameter | Default | Storage Section | Used By | -|------------------------|---------|-----------------|---------| -| `hash_prefix` | `"_hash"` | Hash-addressed section | ``, `` | -| `schema_prefix` | `"_schema"` | Schema-addressed section | ``, `` | -| `filepath_prefix` | `null` | Filepath section (optional) | `` | - -**Validation rules:** -1. All prefixes must be mutually exclusive (no nesting) -2. `hash_prefix` and `schema_prefix` are reserved for DataJoint -3. 
`filepath_prefix` is optional: - - `null` (default): filepaths can use any path except reserved sections - - `"some/prefix"`: all filepaths must start with this prefix - -**Example with custom prefixes:** - -```json -{ - "hash_prefix": "content_addressed", - "schema_prefix": "structured_data", - "filepath_prefix": "user_files" -} -``` - -Results in these sections: -- `{location}/content_addressed/{schema}/{hash}` — hash-addressed -- `{location}/structured_data/{schema}/{table}/{key}/` — schema-addressed -- `{location}/user_files/{user_path}` — filepath (required prefix) - -### Multiple Stores - -Configure multiple stores for different data types or storage tiers: - -```json -{ - "stores": { - "default": "main", - "filepath_default": "raw_data", - "main": { - "protocol": "file", - "location": "/data/fast-storage", - "hash_prefix": "blobs", - "schema_prefix": "arrays" - }, - "archive": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "archive-bucket", - "location": "long-term-storage", - "hash_prefix": "archived_blobs", - "schema_prefix": "archived_arrays", - "subfolding": [2, 2] - }, - "raw_data": { - "protocol": "file", - "location": "/data/acquisition", - "filepath_prefix": "recordings" - } - } -} -``` - -Use named stores in table definitions: - -```python -@schema -class Recording(dj.Manual): - definition = """ - recording_id : uuid - --- - metadata : # Fast storage, hash-addressed - raw_file : # Reference existing acquisition file - processed : # Fast storage, schema-addressed - backup : # Long-term storage - """ -``` - -## Secret Management - -Store credentials separately from configuration files using the `.secrets/` directory. - -### Secrets Directory Structure - -``` -project/ -├── datajoint.json # Non-sensitive configuration -└── .secrets/ # Credentials (gitignored) - ├── .gitignore # Ensures secrets aren't committed - ├── database.user - ├── database.password - ├── stores.main.access_key - ├── stores.main.secret_key - ├── stores.archive.access_key - └── stores.archive.secret_key -``` - -### Configuration Priority - -DataJoint loads configuration in this order (highest priority first): - -1. **Environment variables**: `DJ_HOST`, `DJ_USER`, `DJ_PASS` -2. **Secrets directory**: `.secrets/database.user`, `.secrets/stores.main.access_key` -3. **Config file**: `datajoint.json` -4. 
**Defaults**: Built-in defaults - -### Secrets File Format - -Each secret file contains a single value (no quotes, no JSON): - -```bash -# .secrets/database.password -my_secure_password -``` - -```bash -# .secrets/stores.main.access_key -AKIAIOSFODNN7EXAMPLE -``` - -### Per-Store Credentials - -Store credentials use the naming pattern: `stores..` - -**S3 stores:** -``` -.secrets/stores.main.access_key -.secrets/stores.main.secret_key -``` - -**GCS stores:** -``` -.secrets/stores.gcs_store.token -``` - -**Azure stores:** -``` -.secrets/stores.azure_store.account_key -``` - -### Setting Up Secrets - -```bash -# Create secrets directory -mkdir .secrets -echo "*" > .secrets/.gitignore - -# Add credentials (no quotes) -echo "analyst" > .secrets/database.user -echo "dbpass123" > .secrets/database.password -echo "AKIAIOSFODNN7EXAMPLE" > .secrets/stores.main.access_key -echo "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" > .secrets/stores.main.secret_key - -# Verify .secrets/ is gitignored -git check-ignore .secrets/database.password # Should output the path -``` - -### Template Generation - -Generate configuration templates: - -```python -import datajoint as dj - -# Create config file -dj.config.save_template('datajoint.json') - -# Create config + secrets directory with placeholders -dj.config.save_template('datajoint.json', create_secrets_dir=True) -``` - -## Path Generation - -### Hash-Addressed Storage - -**Data types:** ``, `` - -**Path structure:** -``` -{location}/{hash_prefix}/{schema_name}/{hash}[.ext] -``` - -**With subfolding `[2, 2]`:** -``` -{location}/{hash_prefix}/{schema_name}/{h1}{h2}/{h3}{h4}/{hash}[.ext] -``` - -**Algorithm:** - -1. Serialize value using codec-specific format -2. Compute Blake2b hash of serialized data -3. Encode hash as base32 (lowercase, no padding) -4. Apply subfolding if configured -5. Construct path: `{hash_prefix}/{schema}/{subfolded_hash}` -6. Store metadata in relational database as JSON - -**Properties:** -- **Immutable**: Content defines path, cannot be changed -- **Deduplicated**: Identical content stored once -- **Integrity**: Hash validates content on retrieval - -**Example:** - -```python -# Table definition -@schema -class Experiment(dj.Manual): - definition = """ - experiment_id : int - --- - data : - """ - -# With config: -# hash_prefix = "blobs" -# location = "/data/store" -# subfolding = [2, 2] - -# Insert -Experiment.insert1({'experiment_id': 1, 'data': my_data}) - -# Resulting path: -# /data/store/blobs/my_schema/ab/cd/abcdef123456... -``` - -### Schema-Addressed Storage - -**Data types:** ``, `` - -**Path structure (no partitioning):** -``` -{location}/{schema_prefix}/{schema_name}/{table_name}/{key_string}/{field_name}.{token}.{ext} -``` - -**With partitioning:** -``` -{location}/{schema_prefix}/{partition_path}/{schema_name}/{table_name}/{remaining_key}/{field_name}.{token}.{ext} -``` - -**Algorithm:** - -1. Extract primary key values from the row -2. If partition pattern configured, extract partition attributes -3. Build partition path from partition attributes (if any) -4. Build remaining key string from non-partition primary key attributes -5. Generate random token (default 8 characters) -6. Construct full path -7. 
Store path metadata in relational database as JSON - -**Partition pattern format:** - -```json -{ - "partition_pattern": "subject_id/session_date" -} -``` - -This creates paths like: -``` -{schema_prefix}/subject_id=042/session_date=2024-01-15/{schema}/{table}/{remaining_key}/ -``` - -**Key string encoding:** - -Primary key values are encoded as: `{attr}={value}` - -- Multiple attributes joined with `/` -- Values URL-encoded if necessary -- Order matches table definition - -**Properties:** -- **Mutable**: Can overwrite by writing to same path -- **Streaming**: fsspec integration for lazy loading -- **Organized**: Hierarchical structure mirrors data relationships - -**Example without partitioning:** - -```python -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_id : int - --- - neural_data : - """ - -# With config: -# schema_prefix = "arrays" -# location = "/data/store" -# token_length = 8 - -Recording.insert1({ - 'subject_id': 42, - 'session_id': 100, - 'neural_data': zarr_array -}) - -# Resulting path: -# /data/store/arrays/neuroscience/Recording/subject_id=42/session_id=100/neural_data.x8a7b2c4.zarr -``` - -**Example with partitioning:** - -```python -# Same table, but with partition configuration: -# partition_pattern = "subject_id/session_date" - -@schema -class Recording(dj.Manual): - definition = """ - subject_id : int - session_date : date - session_id : int - --- - neural_data : - """ - -Recording.insert1({ - 'subject_id': 42, - 'session_date': '2024-01-15', - 'session_id': 100, - 'neural_data': zarr_array -}) - -# Resulting path: -# /data/store/arrays/subject_id=42/session_date=2024-01-15/neuroscience/Recording/session_id=100/neural_data.x8a7b2c4.zarr -``` - -**Partition extraction:** - -When a partition pattern is configured: - -1. Check if table has all partition attributes in primary key -2. If yes: extract those attributes to partition path, remaining attributes to key path -3. If no: use normal structure (no partitioning for this table) - -This allows a single `partition_pattern` to apply to multiple tables, with automatic fallback for tables lacking partition attributes. - -**Path collision prevention:** - -The random token ensures uniqueness: -- 8 characters (default): 62^8 = ~218 trillion combinations -- Collision probability negligible for typical table sizes -- Token regenerated on each write - -### Filepath Storage - -**Data type:** `` - -**Path structure:** -``` -{location}/{filepath_prefix}/{user_path} -``` - -Or if `filepath_prefix = null`: -``` -{location}/{user_path} -``` - -**Algorithm:** - -1. User provides relative path within store -2. Validate path doesn't use reserved sections (`hash_prefix`, `schema_prefix`) -3. If `filepath_prefix` configured, validate path starts with it -4. Check file exists at `{location}/{user_path}` -5. Record path, size, and timestamp in JSON metadata -6. No file copying occurs - -**Properties:** -- **Path-only storage**: DataJoint stores path string, no file management -- **No lifecycle management**: No garbage collection, transaction safety, or deduplication -- **User-managed**: User controls file creation, organization, and lifecycle -- **Collision-prone**: **User responsible for avoiding name collisions** -- **Flexible**: Can reference existing files or create new ones - -**Collision handling:** - -DataJoint does **not** prevent filename collisions for filepath storage. Users must ensure: - -1. Unique paths for each referenced file -2. No overwrites of files still referenced by database -3. 
Coordination if multiple processes write to same store - -**Strategies for avoiding collisions:** - -```python -# Strategy 1: Include primary key in path -recording_path = f"subject_{subject_id}/session_{session_id}/data.bin" - -# Strategy 2: Use UUIDs -import uuid -recording_path = f"recordings/{uuid.uuid4()}.nwb" - -# Strategy 3: Timestamps -from datetime import datetime -recording_path = f"data_{datetime.now().isoformat()}.dat" - -# Strategy 4: Enforce via filepath_prefix -# Config: "filepath_prefix": "recordings" -# All paths must start with recordings/, organize within that namespace -``` - -**Reserved sections:** - -Filepath storage cannot use paths starting with configured `hash_prefix` or `schema_prefix`: - -```python -# Invalid (default prefixes) -table.insert1({'id': 1, 'file': '_hash/data.bin'}) # ERROR -table.insert1({'id': 2, 'file': '_schema/data.zarr'}) # ERROR - -# Invalid (custom prefixes: hash_prefix="blobs") -table.insert1({'id': 3, 'file': 'blobs/data.bin'}) # ERROR - -# Valid -table.insert1({'id': 4, 'file': 'raw/subject01/rec.bin'}) # OK -``` - -**Example:** - -```python -@schema -class RawRecording(dj.Manual): - definition = """ - recording_id : uuid - --- - acquisition_file : - """ - -# With config: -# filepath_prefix = "imported" -# location = "/data/acquisition" - -# File already exists at: /data/acquisition/imported/subject01/session001/data.nwb - -RawRecording.insert1({ - 'recording_id': my_uuid, - 'acquisition_file': 'imported/subject01/session001/data.nwb' -}) - -# DataJoint validates file exists, stores reference -# User responsible for ensuring path uniqueness across recordings -``` - -## Storage Type Comparison - -| Feature | Hash-addressed | Schema-addressed | Filepath | -|---------|----------------|------------------|----------| -| **Mutability** | Immutable | Mutable | User-managed | -| **Deduplication** | Automatic | None | None | -| **Streaming** | No (load full) | Yes (fsspec) | Yes (fsspec) | -| **Organization** | Flat (by hash) | Hierarchical (by key) | User-defined | -| **Collision handling** | Automatic (by content) | Automatic (token) | **User responsibility** | -| **DataJoint manages lifecycle** | Yes | Yes | **No** | -| **Suitable for** | Immutable blobs | Large mutable arrays | Existing files | - -## Protocol-Specific Configuration - -### File Protocol - -```json -{ - "protocol": "file", - "location": "/data/my-project", - "hash_prefix": "blobs", - "schema_prefix": "arrays", - "filepath_prefix": null -} -``` - -**Required:** `protocol`, `location` - -### S3 Protocol - -```json -{ - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-bucket", - "location": "my-project/production", - "secure": true, - "hash_prefix": "blobs", - "schema_prefix": "arrays" -} -``` - -**Required:** `protocol`, `endpoint`, `bucket`, `location`, `access_key`, `secret_key` - -**Credentials:** Store in `.secrets/stores..access_key` and `.secrets/stores..secret_key` - -### GCS Protocol - -```json -{ - "protocol": "gcs", - "bucket": "my-gcs-bucket", - "location": "my-project", - "project": "my-gcp-project", - "hash_prefix": "blobs", - "schema_prefix": "arrays" -} -``` - -**Required:** `protocol`, `bucket`, `location`, `token` - -**Credentials:** Store in `.secrets/stores..token` (path to service account JSON) - -### Azure Protocol - -```json -{ - "protocol": "azure", - "container": "my-container", - "location": "my-project", - "hash_prefix": "blobs", - "schema_prefix": "arrays" -} -``` - -**Required:** `protocol`, `container`, `location`, `account_name`, 
`account_key` - -**Credentials:** Store in `.secrets/stores..account_key` - -## Migration from Legacy Storage - -DataJoint 0.14.x used separate configuration systems: - -### Legacy "External" Storage (Hash-addressed Integrated) - -```python -# 0.14.x config -dj.config['stores'] = { - 'my_store': { - 'protocol': 's3', - 'endpoint': 's3.amazonaws.com', - 'bucket': 'my-bucket', - 'location': 'my-project', - 'access_key': 'XXX', - 'secret_key': 'YYY' - } -} - -# 0.14.x usage -data : external-my_store -``` - -### 2.0 Equivalent - -```json -{ - "stores": { - "default": "my_store", - "my_store": { - "protocol": "s3", - "endpoint": "s3.amazonaws.com", - "bucket": "my-bucket", - "location": "my-project", - "hash_prefix": "_hash" - } - } -} -``` - -Credentials moved to `.secrets/`: -``` -.secrets/stores.my_store.access_key -.secrets/stores.my_store.secret_key -``` - -```python -# 2.0 usage (equivalent) -data : -``` - -### New in 2.0: Schema-addressed Storage - -Schema-addressed storage (``, ``) is entirely new in DataJoint 2.0. No migration needed as this feature didn't exist in 0.14.x. - -## Validation and Testing - -### Verify Store Configuration - -```python -import datajoint as dj - -# Check default store -spec = dj.config.get_store_spec() -print(f"Default store: {dj.config['stores']['default']}") -print(f"Protocol: {spec['protocol']}") -print(f"Location: {spec['location']}") -print(f"Hash prefix: {spec['hash_prefix']}") -print(f"Schema prefix: {spec['schema_prefix']}") -print(f"Filepath prefix: {spec['filepath_prefix']}") - -# Check named store -spec = dj.config.get_store_spec('archive') -print(f"Archive location: {spec['location']}") - -# List all stores -print(f"Configured stores: {list(dj.config['stores'].keys())}") -``` - -### Test Storage Access - -```python -from datajoint.hash_registry import get_store_backend - -# Test backend connectivity -backend = get_store_backend('main') -print(f"Backend type: {type(backend)}") - -# For file protocol, check paths exist -if spec['protocol'] == 'file': - import os - assert os.path.exists(spec['location']), f"Location not found: {spec['location']}" -``` - -## Best Practices - -### Store Organization - -1. **Use one default store** for most data -2. **Add specialized stores** for specific needs: - - `archive` — long-term cold storage - - `fast` — high-performance tier - - `shared` — cross-project data - - `raw` — acquisition files (filepath only) - -### Prefix Configuration - -1. **Use defaults** unless integrating with existing storage -2. **Choose meaningful names** if customizing: `blobs`, `arrays`, `user_files` -3. **Keep prefixes short** to minimize path length - -### Secret Management - -1. **Never commit credentials** to version control -2. **Use `.secrets/` directory** for all credentials -3. **Set restrictive permissions**: `chmod 700 .secrets` -4. **Document required secrets** in project README - -### Partitioning Strategy - -1. **Choose partition attributes carefully:** - - High cardinality (many unique values) - - Natural data organization (subject, date) - - Query patterns (often filtered by these attributes) - -2. **Example patterns:** - - Neuroscience: `subject_id/session_date` - - Genomics: `sample_id/sequencing_run` - - Microscopy: `experiment_id/imaging_session` - -3. **Avoid over-partitioning:** - - Don't partition by high-cardinality unique IDs - - Limit to 2-3 partition levels - -### Filepath Usage - -1. **Design naming conventions** before inserting data -2. **Include unique identifiers** in paths -3. 
**Document collision prevention strategy** for the team -4. **Consider using `filepath_prefix`** to enforce structure - -## See Also - -- [Configuration Reference](../configuration.md) — All configuration options -- [Configure Object Stores](../../how-to/configure-storage.md) — Setup guide -- [Type System Specification](type-system.md) — Data type definitions -- [Codec API Specification](codec-api.md) — Codec implementation details - - ---- -## File: reference/specs/primary-keys.md - -# Primary Key Rules in Relational Operators - -In DataJoint, the result of each query operator produces a valid **entity set** with a well-defined **entity type** and **primary key**. This section specifies how the primary key is determined for each relational operator. - -## General Principle - -The primary key of a query result identifies unique entities in that result. For most operators, the primary key is preserved from the left operand. For joins, the primary key depends on the functional dependencies between the operands. - -## Integration with Semantic Matching - -Primary key determination is applied **after** semantic compatibility is verified. The evaluation order is: - -1. **Semantic Check**: `assert_join_compatibility()` ensures all namesakes are homologous (same lineage) -2. **PK Determination**: The "determines" relationship is computed using attribute names -3. **Left Join Validation**: If `left=True`, verify A → B - -This ordering is important because: -- After semantic matching passes, namesakes represent semantically equivalent attributes -- The name-based "determines" check is therefore semantically valid -- Attribute names in the context of a semantically-valid join represent the same entity - -The "determines" relationship uses attribute **names** (not lineages directly) because: -- Lineage ensures namesakes are homologous -- Once verified, checking by name is equivalent to checking by semantic identity -- Aliased attributes (same lineage, different names) don't participate in natural joins anyway - -## Notation - -In the examples below, `*` marks primary key attributes: -- `A(x*, y*, z)` means A has primary key `{x, y}` and secondary attribute `z` -- `A → B` means "A determines B" (defined below) - -### Rules by Operator - -| Operator | Primary Key Rule | -|----------|------------------| -| `A & B` (restriction) | PK(A) — preserved from left operand | -| `A - B` (anti-restriction) | PK(A) — preserved from left operand | -| `A.proj(...)` (projection) | PK(A) — preserved from left operand | -| `A.aggr(B, ...)` (aggregation) | PK(A) — preserved from left operand | -| `A.extend(B)` (extension) | PK(A) — requires A → B | -| `A * B` (join) | Depends on functional dependencies (see below) | - -### Join Primary Key Rule - -The join operator requires special handling because it combines two entity sets. The primary key of `A * B` depends on the **functional dependency relationship** between the operands. - -#### Definitions - -**A determines B** (written `A → B`): Every attribute in PK(B) is in A. - -``` -A → B iff ∀b ∈ PK(B): b ∈ A -``` - -Since `PK(A) ∪ secondary(A) = all attributes in A`, this is equivalent to saying every attribute in B's primary key exists somewhere in A (as either a primary key or secondary attribute). - -Intuitively, `A → B` means that knowing A's primary key is sufficient to determine B's primary key through the functional dependencies implied by A's structure. - -**B determines A** (written `B → A`): Every attribute in PK(A) is in B. 
-
-```
-B → A iff ∀a ∈ PK(A): a ∈ B
-```
-
-#### Join Primary Key Algorithm
-
-For `A * B`:
-
-| Condition | PK(A * B) | Attribute Order |
-|-----------|-----------|-----------------|
-| A → B | PK(A) | A's attributes first |
-| B → A (and not A → B) | PK(B) | B's attributes first |
-| Neither | PK(A) ∪ PK(B) | PK(A) first, then PK(B) − PK(A) |
-
-When both `A → B` and `B → A` hold, the left operand takes precedence (use PK(A)).
-
-#### Examples
-
-**Example 1: B → A**
-```
-A: x*, y*
-B: x*, z*, y (y is secondary in B, so z → y)
-```
-- A → B? PK(B) = {x, z}. Is z in PK(A) or secondary in A? No (z not in A). **No.**
-- B → A? PK(A) = {x, y}. Is y in PK(B) or secondary in B? Yes (secondary). **Yes.**
-- Result: **PK(A * B) = {x, z}** with B's attributes first.
-
-**Example 2: Both directions (bijection-like)**
-```
-A: x*, y*, z (z is secondary in A)
-B: y*, z*, x (x is secondary in B)
-```
-- A → B? PK(B) = {y, z}. Is z in PK(A) or secondary in A? Yes (secondary). **Yes.**
-- B → A? PK(A) = {x, y}. Is x in PK(B) or secondary in B? Yes (secondary). **Yes.**
-- Both hold, prefer left operand: **PK(A * B) = {x, y}** with A's attributes first.
-
-**Example 3: Neither direction**
-```
-A: x*, y*
-B: z*, x (x is secondary in B)
-```
-- A → B? PK(B) = {z}. Is z in PK(A) or secondary in A? No. **No.**
-- B → A? PK(A) = {x, y}. Is y in PK(B) or secondary in B? No (y not in B). **No.**
-- Result: **PK(A * B) = {x, y, z}** (union) with A's attributes first.
-
-**Example 4: B → A (subordinate relationship)**
-```
-Session: session_id*
-Trial: session_id*, trial_num* (references Session)
-```
-- A → B? PK(Trial) = {session_id, trial_num}. Is trial_num in PK(Session) or secondary? No. **No.**
-- B → A? PK(Session) = {session_id}. Is session_id in PK(Trial)? Yes. **Yes.**
-- Result: **PK(Session * Trial) = {session_id, trial_num}** with Trial's attributes first.
-
-**Join primary key determination**:
-  - `A * B` where `A → B`: result has PK(A)
-  - `A * B` where `B → A` (not `A → B`): result has PK(B), B's attributes first
-  - `A * B` where both `A → B` and `B → A`: result has PK(A) (left preference)
-  - `A * B` where neither direction: result has PK(A) ∪ PK(B)
-  - Verify attribute ordering matches primary key source
-  - Verify non-commutativity: `A * B` vs `B * A` may differ in PK and order
-
-### Design Tradeoff: Predictability vs. Minimality
-
-The join primary key rule prioritizes **predictability** over **minimality**. In some cases, the resulting primary key may not be minimal (i.e., it may contain functionally redundant attributes).
-
-**Example of non-minimal result:**
-```
-A: x*, y*
-B: z*, x (x is secondary in B, so z → x)
-```
-
-The mathematically minimal primary key for `A * B` would be `{y, z}` because:
-- `z → x` (from B's structure)
-- `{y, z} → {x, y, z}` (z gives us x, and we have y)
-
-However, `{y, z}` is problematic:
-- It is **not the primary key of either operand** (A has `{x, y}`, B has `{z}`)
-- It is **not the union** of the primary keys
-- It represents a **novel entity type** that doesn't correspond to A, B, or their natural pairing
-
-This creates confusion: what kind of entity does `{y, z}` identify?
-
-**The simplified rule produces `{x, y, z}`** (the union), which:
-- Is immediately recognizable as "one A entity paired with one B entity"
-- Contains A's full primary key and B's full primary key
-- May have redundancy (`x` is determined by `z`) but is semantically clear
-
-**Rationale:** Users can always project away redundant attributes if they need the minimal key. 
But starting with a predictable, interpretable primary key reduces confusion and errors. - -### Attribute Ordering - -The primary key attributes always appear **first** in the result's attribute list, followed by secondary attributes. When `B → A` (and not `A → B`), the join is conceptually reordered as `B * A` to maintain this invariant: - -- If PK = PK(A): A's attributes appear first -- If PK = PK(B): B's attributes appear first -- If PK = PK(A) ∪ PK(B): PK(A) attributes first, then PK(B) − PK(A), then secondaries - -### Non-Commutativity - -With these rules, join is **not commutative** in terms of: -1. **Primary key selection**: `A * B` may have a different PK than `B * A` when one direction determines but not the other -2. **Attribute ordering**: The left operand's attributes appear first (unless B → A) - -The **result set** (the actual rows returned) remains the same regardless of order, but the **schema** (primary key and attribute order) may differ. - -### Left Join Constraint - -For left joins (`A.join(B, left=True)`), the functional dependency **A → B is required**. - -**Why this constraint exists:** - -In a left join, all rows from A are retained even if there's no matching row in B. For unmatched rows, B's attributes are NULL. This creates a problem for primary key validity: - -| Scenario | PK by inner join rule | Left join problem | -|----------|----------------------|-------------------| -| A → B | PK(A) | ✅ Safe — A's attrs always present | -| B → A | PK(B) | ❌ B's PK attrs could be NULL | -| Neither | PK(A) ∪ PK(B) | ❌ B's PK attrs could be NULL | - -**Example of invalid left join:** -``` -A: x*, y* PK(A) = {x, y} -B: x*, z*, y PK(B) = {x, z}, y is secondary - -Inner join: PK = {x, z} (B → A rule) -Left join attempt: FAILS because z could be NULL for unmatched A rows -``` - -**Valid left join example:** -``` -Session: session_id*, date -Trial: session_id*, trial_num*, stimulus (references Session) - -Session.join(Trial, left=True) # OK: Session → Trial -# PK = {session_id}, all sessions retained even without trials -``` - -**Error message:** -``` -DataJointError: Left join requires the left operand to determine the right operand (A → B). -The following attributes from the right operand's primary key are not determined by -the left operand: ['z']. Use an inner join or restructure the query. -``` - -### Conceptual Note: Left Join as Extension - -When `A → B`, the left join `A.join(B, left=True)` is conceptually distinct from the general join operator `A * B`. 
It is better understood as an **extension** operation rather than a join: - -| Aspect | General Join (A * B) | Left Join when A → B | -|--------|---------------------|----------------------| -| Conceptual model | Cartesian product restricted to matching rows | Extend A with attributes from B | -| Row count | May increase, decrease, or stay same | Always equals len(A) | -| Primary key | Depends on functional dependencies | Always PK(A) | -| Relation to projection | Different operation | Variation of projection | - -**The extension perspective:** - -The operation `A.join(B, left=True)` when `A → B` is closer to **projection** than to **join**: -- It adds new attributes to A (like `A.proj(..., new_attr=...)`) -- It preserves all rows of A -- It preserves A's primary key -- It lacks the Cartesian product aspect that defines joins - -DataJoint provides an explicit `extend()` method for this pattern: - -```python -# These are equivalent when A → B: -A.join(B, left=True) -A.extend(B) # clearer intent: extend A with B's attributes -``` - -The `extend()` method: -- Requires `A → B` (raises `DataJointError` otherwise) -- Does not expose `allow_nullable_pk` (that's an internal mechanism) -- Expresses the semantic intent: "add B's attributes to A's entities" - -**Relationship to aggregation:** - -A similar argument applies to `A.aggr(B, ...)`: -- It preserves A's primary key -- It adds computed attributes derived from B -- It's conceptually a variation of projection with grouping - -Both `A.join(B, left=True)` (when A → B) and `A.aggr(B, ...)` can be viewed as **projection-like operations** that extend A's attributes while preserving its entity identity. - -### Bypassing the Left Join Constraint - -For special cases where the user takes responsibility for handling the potentially nullable primary key, the constraint can be bypassed using `allow_nullable_pk=True`: - -```python -# Normally blocked - A does not determine B -A.join(B, left=True) # Error: A → B not satisfied - -# Bypass the constraint - user takes responsibility -A.join(B, left=True, allow_nullable_pk=True) # Allowed, PK = PK(A) ∪ PK(B) -``` - -When bypassed, the resulting primary key is the union of both operands' primary keys (PK(A) ∪ PK(B)). The user must ensure that subsequent operations (such as `GROUP BY` or projection) establish a valid primary key. The parameter name `allow_nullable_pk` reflects the specific issue: primary key attributes from the right operand could be NULL for unmatched rows. - -This mechanism is used internally by aggregation (`aggr`) when `exclude_nonmatching=False` (the default), which resets the primary key via the `GROUP BY` clause. - -### Aggregation Exception - -`A.aggr(B)` (with default `exclude_nonmatching=False`) uses a left join internally but has the **opposite requirement**: **B → A** (the group expression B must have all of A's primary key attributes). - -This apparent contradiction is resolved by the `GROUP BY` clause: - -1. Aggregation requires B → A so that B can be grouped by A's primary key -2. The intermediate left join `A LEFT JOIN B` would have an invalid PK under the normal left join rules -3. Aggregation internally allows the invalid PK, producing PK(A) ∪ PK(B) -4. The `GROUP BY PK(A)` clause then **resets** the primary key to PK(A) -5. The final result has PK(A), which consists entirely of non-NULL values from A - -Note: The semantic check (homologous namesake validation) is still performed for aggregation's internal join. Only the primary key validity constraint is bypassed. 
- -**Example:** -``` -Session: session_id*, date -Trial: session_id*, trial_num*, response_time (references Session) - -# Aggregation (default keeps all rows) -Session.aggr(Trial, avg_rt='avg(response_time)') - -# Internally: Session LEFT JOIN Trial (with invalid PK allowed) -# Intermediate PK would be {session_id} ∪ {session_id, trial_num} = {session_id, trial_num} -# But GROUP BY session_id resets PK to {session_id} -# Result: All sessions, with avg_rt=NULL for sessions without trials -``` - -## Universal Set `dj.U` - -`dj.U()` or `dj.U('attr1', 'attr2', ...)` represents the universal set of all possible values and lineages. - -### Homology with `dj.U` -Since `dj.U` conceptually contains all possible lineages, its attributes are **homologous to any namesake attribute** in other expressions. - -### Valid Operations - -```python -# Restriction: promotes a, b to PK; lineage transferred from A -dj.U('a', 'b') & A - -# Aggregation: groups by a, b -dj.U('a', 'b').aggr(A, count='count(*)') -``` - -### Invalid Operations - -```python -# Anti-restriction: produces infinite set -dj.U('a', 'b') - A # DataJointError - -# Join: deprecated, use & instead -dj.U('a', 'b') * A # DataJointError with migration guidance -``` - - - ---- -## File: reference/specs/query-algebra.md - -# DataJoint Query Algebra Specification - -## Overview - -This document specifies the query algebra in DataJoint Python. Query expressions are composable objects that represent database queries. All operators return new QueryExpression objects without modifying the original—expressions are immutable. - -## 1. Query Expression Fundamentals - -### 1.1 Immutability - -All query expressions are immutable. Every operator creates a new expression: - -```python -original = Session() -restricted = original & "session_date > '2024-01-01'" # New object -# original is unchanged -``` - -### 1.2 Primary Key Preservation - -Most operators preserve the primary key of their input. The exceptions are: - -- **Join**: May expand or contract PK based on functional dependencies -- **U & table**: Sets PK to U's attributes - -### 1.3 Lazy Evaluation - -Expressions are not executed until data is fetched: - -```python -expr = (Session * Trial) & "trial_type = 'test'" # No database query yet -data = expr.to_dicts() # Query executed here -``` - ---- - -## 2. Restriction (`&` and `-`) - -### 2.1 Syntax - -```python -result = expression & condition # Select matching rows -result = expression - condition # Select non-matching rows (anti-restriction) -result = expression.restrict(condition, semantic_check=True) -``` - -### 2.2 Condition Types - -| Type | Example | Behavior | -|------|---------|----------| -| String | `"x > 5"` | SQL WHERE condition | -| Dict | `{"status": "active"}` | Equality on attributes | -| QueryExpression | `OtherTable` | Rows with matching keys in other table | -| List/Tuple/Set | `[cond1, cond2]` | OR of conditions | -| Boolean | `True` / `False` | No effect / empty result | -| pandas.DataFrame | `df` | OR of row conditions | -| numpy.void | `record` | Treated as dict | - -### 2.3 String Conditions - -SQL expressions using attribute names: - -```python -Session & "session_date > '2024-01-01'" -Session & "subject_id IN (1, 2, 3)" -Session & "notes LIKE '%test%'" -Session & "(x > 0) AND (y < 100)" -``` - -### 2.4 Dictionary Conditions - -Attribute-value equality: - -```python -Session & {"subject_id": 1} -Session & {"subject_id": 1, "session_type": "training"} -``` - -Multiple key-value pairs are combined with AND. 
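-
-To make the AND semantics concrete, here is a small sketch using the same illustrative `Session` attributes as above; both forms should select the same entity set:
-
-```python
-# Multi-key dict restriction: the conditions are combined with AND
-by_dict = Session & {"subject_id": 1, "session_type": "training"}
-
-# The same restriction spelled out as a string condition
-by_string = Session & "subject_id = 1 AND session_type = 'training'"
-```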
- -### 2.5 Restriction by Query Expression - -Restrict to rows with matching primary keys in another expression: - -```python -# Sessions that have at least one trial -Session & Trial - -# Sessions for active subjects only -Session & (Subject & "status = 'active'") -``` - -### 2.6 Collection Conditions (OR) - -Lists, tuples, and sets create OR conditions: - -```python -# Either condition matches -Session & [{"subject_id": 1}, {"subject_id": 2}] - -# Equivalent to -Session & "subject_id IN (1, 2)" -``` - -### 2.7 Anti-Restriction - -The `-` operator selects rows that do NOT match: - -```python -# Sessions without any trials -Session - Trial - -# Sessions not from subject 1 -Session - {"subject_id": 1} -``` - -### 2.8 Chaining Restrictions - -Sequential restrictions combine with AND: - -```python -(Session & cond1) & cond2 -# Equivalent to -Session & cond1 & cond2 -``` - -### 2.9 Semantic Matching - -With `semantic_check=True` (default), expression conditions match only on homologous namesakes—attributes with the same name AND same lineage. - -```python -# Default: semantic matching -Session & Trial - -# Disable semantic check (natural join on all namesakes) -Session.restrict(Trial, semantic_check=False) -``` - -### 2.10 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | Preserved: PK(result) = PK(input) | -| Attributes | Preserved: all attributes retained | -| Entity Type | Preserved | - -### 2.11 Error Conditions - -| Condition | Error | -|-----------|-------| -| Unknown attribute in string | `UnknownAttributeError` | -| Non-homologous namesakes | `DataJointError` (semantic mismatch) | - ---- - -## 3. Projection (`.proj()`) - -### 3.1 Syntax - -```python -result = expression.proj() # Primary key only -result = expression.proj(...) # All attributes -result = expression.proj('attr1', 'attr2') # PK + specified -result = expression.proj(..., '-secret') # All except secret -result = expression.proj(new_name='old_name') # Rename -result = expression.proj(computed='x + y') # Computed attribute -``` - -### 3.2 Attribute Selection - -| Syntax | Meaning | -|--------|---------| -| `'attr'` | Include attribute | -| `...` (Ellipsis) | Include all secondary attributes | -| `'-attr'` | Exclude attribute (use with `...`) | - -Primary key attributes are always included, even if not specified. - -### 3.3 Renaming Attributes - -```python -# Rename 'name' to 'subject_name' -Subject.proj(subject_name='name') - -# Duplicate attribute with new name (parentheses preserve original) -Subject.proj('name', subject_name='(name)') -``` - -### 3.4 Computed Attributes - -Create new attributes from SQL expressions: - -```python -# Arithmetic -Trial.proj(speed='distance / duration') - -# Functions -Session.proj(year='YEAR(session_date)') - -# Aggregation-like (per row) -Trial.proj(centered='value - mean_value') -``` - -### 3.5 Primary Key Renaming - -Primary key attributes CAN be renamed: - -```python -Subject.proj(mouse_id='subject_id') -# Result PK: (mouse_id,) instead of (subject_id,) -``` - -### 3.6 Excluding Attributes - -Use `-` prefix with ellipsis to exclude: - -```python -# All attributes except 'internal_notes' -Session.proj(..., '-internal_notes') - -# Multiple exclusions -Session.proj(..., '-notes', '-metadata') -``` - -Cannot exclude primary key attributes. 
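-
-The projection features above compose in a single call. A minimal sketch, assuming a hypothetical `Trial` table with `distance`, `duration`, and `notes` secondary attributes:
-
-```python
-# Keep the primary key and all secondary attributes except 'notes',
-# and add a computed 'speed' attribute in the same projection.
-Trial.proj(..., '-notes', speed='distance / duration')
-```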
- -### 3.7 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | Preserved (may be renamed) | -| Attributes | Selected/computed subset | -| Entity Type | Preserved | - -### 3.8 Error Conditions - -| Condition | Error | -|-----------|-------| -| Attribute not found | `UnknownAttributeError` | -| Excluding PK attribute | `DataJointError` | -| Duplicate attribute name | `DataJointError` | - ---- - -## 4. Join (`*`) - -### 4.1 Syntax - -```python -result = A * B # Inner join -result = A.join(B, semantic_check=True, left=False) -``` - -### 4.2 Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `semantic_check` | `True` | Match only homologous namesakes | -| `left` | `False` | LEFT JOIN (preserve all rows from A) | - -### 4.3 Join Condition - -Joins match on all shared non-hidden attributes (namesakes): - -```python -# If Session has (subject_id, session_id) and Trial has (subject_id, session_id, trial_id) -# Join matches on (subject_id, session_id) -Session * Trial -``` - -### 4.4 Primary Key Determination - -The result's primary key depends on functional dependencies: - -| Condition | Result PK | Attribute Order | -|-----------|-----------|-----------------| -| A → B | PK(A) | A's attributes first | -| B → A | PK(B) | B's attributes first | -| Both | PK(A) | A's attributes first | -| Neither | PK(A) ∪ PK(B) | A's PK, then B's additional PK | - -**A → B** means: All of B's primary key attributes exist in A (as PK or secondary). - -### 4.5 Examples - -```python -# Session → Trial (Session's PK is subset of Trial's PK) -Session * Trial -# Result PK: (subject_id, session_id) — same as Session - -# Neither determines the other -Subject * Experimenter -# Result PK: (subject_id, experimenter_id) — union of PKs -``` - -### 4.6 Left Join - -Preserve all rows from left operand: - -```python -# All sessions, with trial data where available -Session.join(Trial, left=True) -``` - -**Constraint**: Left join requires A → B to prevent NULL values in result's primary key. - -### 4.7 Semantic Matching - -With `semantic_check=True`, only homologous namesakes are matched: - -```python -# Semantic join (default) -TableA * TableB - -# Natural join (match all namesakes regardless of lineage) -TableA.join(TableB, semantic_check=False) -``` - -### 4.8 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | Depends on functional dependencies | -| Attributes | Union of both operands' attributes | -| Commutativity | Result rows same, but PK/order may differ | - -### 4.9 Error Conditions - -| Condition | Error | -|-----------|-------| -| Different database connections | `DataJointError` | -| Non-homologous namesakes (semantic mode) | `DataJointError` | -| Left join without A → B | `DataJointError` | - ---- - -## 5. Aggregation (`.aggr()`) - -### 5.1 Syntax - -```python -result = A.aggr(B, ...) # All A attributes -result = A.aggr(B, 'attr1', 'attr2') # PK + specified from A -result = A.aggr(B, ..., count='count(*)') # With aggregate -result = A.aggr(B, ..., exclude_nonmatching=True) # Only rows with matches -``` - -### 5.2 Parameters - -| Parameter | Default | Description | -|-----------|---------|-------------| -| `*attributes` | — | Attributes from A to include | -| `exclude_nonmatching` | `False` | If True, exclude rows from A that have no matches in B (INNER JOIN). Default keeps all rows (LEFT JOIN). 
| -| `**named_attributes` | — | Computed aggregates | - -### 5.3 Requirement - -**B must contain all primary key attributes of A.** This enables grouping B's rows by A's primary key. - -### 5.4 Aggregate Functions - -```python -# Count -Session.aggr(Trial, n_trials='count(*)') - -# Sum, average, min, max -Session.aggr(Trial, - total='sum(score)', - avg_score='avg(score)', - best='max(score)', - worst='min(score)' -) - -# Group concatenation -Session.aggr(Trial, trial_list='group_concat(trial_id)') - -# Conditional count -Session.aggr(Trial, n_correct='sum(correct = 1)') -``` - -### 5.5 SQL Equivalent - -```sql -SELECT A.pk1, A.pk2, A.secondary, agg_func(B.col) AS new_attr -FROM A -[LEFT] JOIN B USING (pk1, pk2) -WHERE -GROUP BY A.pk1, A.pk2 -HAVING -``` - -### 5.6 Restriction Behavior - -Restrictions on A attributes → WHERE clause (before GROUP BY) -Restrictions on B attributes → HAVING clause (after GROUP BY) - -```python -# WHERE: only 2024 sessions, then count trials -(Session & "YEAR(session_date) = 2024").aggr(Trial, n='count(*)') - -# HAVING: sessions with more than 10 trials -Session.aggr(Trial, n='count(*)') & "n > 10" -``` - -### 5.7 Default Behavior: Keep All Rows - -By default (`exclude_nonmatching=False`), aggregation keeps all rows from A, even those without matches in B: - -```python -# All sessions included; those without trials have n=0 -Session.aggr(Trial, n='count(trial_id)') - -# Only sessions that have at least one trial -Session.aggr(Trial, n='count(trial_id)', exclude_nonmatching=True) -``` - -Note: Use `count(pk_attr)` rather than `count(*)` to correctly count 0 for sessions without trials. `count(*)` counts all rows including the NULL-filled left join row. - -### 5.8 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | PK(A) — grouping expression's PK | -| Entity Type | Same as A | - -### 5.9 Error Conditions - -| Condition | Error | -|-----------|-------| -| B missing A's PK attributes | `DataJointError` | -| Semantic mismatch | `DataJointError` | - ---- - -## 6. Extension (`.extend()`) - -### 6.1 Syntax - -```python -result = A.extend(B) -result = A.extend(B, semantic_check=True) -``` - -### 6.2 Semantics - -Extend is a left join that adds attributes from B while preserving A's entity identity: - -```python -A.extend(B) -# Equivalent to: -A.join(B, left=True) -``` - -### 6.3 Requirement - -**A must determine B** (A → B). All of B's primary key attributes must exist in A. - -### 6.4 Use Case - -Add optional attributes without losing rows: - -```python -# Add experimenter info to sessions (some sessions may lack experimenter) -Session.extend(Experimenter) -``` - -### 6.5 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | PK(A) | -| Attributes | A's attributes + B's non-PK attributes | -| Entity Type | Same as A | - -### 6.6 Error Conditions - -| Condition | Error | -|-----------|-------| -| A does not determine B | `DataJointError` | - ---- - -## 7. Union (`+`) - -### 7.1 Syntax - -```python -result = A + B -``` - -### 7.2 Requirements - -1. **Same connection**: Both from same database -2. **Same primary key**: Identical PK attributes (names and types) -3. 
**No secondary attribute overlap**: A and B cannot share secondary attributes - -### 7.3 Semantics - -Combines entity sets from both operands: - -```python -# All subjects that are either mice or rats -Mouse + Rat -``` - -### 7.4 Attribute Handling - -| Scenario | Result | -|----------|--------| -| PK only in both | Union of PKs | -| A has secondary attrs | A's secondaries (NULL for B-only rows) | -| B has secondary attrs | B's secondaries (NULL for A-only rows) | -| Overlapping PKs | A's values take precedence | - -### 7.5 SQL Implementation - -```sql --- With secondary attributes -(SELECT A.* FROM A LEFT JOIN B USING (pk)) -UNION -(SELECT B.* FROM B WHERE (B.pk) NOT IN (SELECT A.pk FROM A)) -``` - -### 7.6 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | PK(A) = PK(B) | -| Associative | (A + B) + C = A + (B + C) | -| Commutative | A + B has same rows as B + A | - -### 7.7 Error Conditions - -| Condition | Error | -|-----------|-------| -| Different connections | `DataJointError` | -| Different primary keys | `DataJointError` | -| Overlapping secondary attributes | `DataJointError` | - ---- - -## 8. Universal Sets (`dj.U()`) - -### 8.1 Syntax - -```python -dj.U() # Singular entity (one row, no attributes) -dj.U('attr1', 'attr2') # Set of all combinations -``` - -### 8.2 Unique Value Enumeration - -Extract distinct values: - -```python -# All unique last names -dj.U('last_name') & Student - -# All unique (year, month) combinations -dj.U('year', 'month') & Session.proj(year='YEAR(date)', month='MONTH(date)') -``` - -Result has specified attributes as primary key, with DISTINCT semantics. - -### 8.3 Universal Aggregation - -Aggregate entire table (no grouping): - -```python -# Count all students -dj.U().aggr(Student, n='count(*)') -# Result: single row with n = total count - -# Global statistics -dj.U().aggr(Trial, - total='count(*)', - avg_score='avg(score)', - std_score='std(score)' -) -``` - -### 8.4 Arbitrary Grouping - -Group by attributes not in original PK: - -```python -# Count students by graduation year -dj.U('grad_year').aggr(Student, n='count(*)') - -# Monthly session counts -dj.U('year', 'month').aggr( - Session.proj(year='YEAR(date)', month='MONTH(date)'), - n='count(*)' -) -``` - -### 8.5 Primary Key Behavior - -| Usage | Result PK | -|-------|-----------| -| `dj.U() & table` | Empty (single row) | -| `dj.U('a', 'b') & table` | (a, b) | -| `dj.U().aggr(table, ...)` | Empty (single row) | -| `dj.U('a').aggr(table, ...)` | (a,) | - -### 8.6 Restrictions - -```python -# U attributes must exist in the table -dj.U('name') & Student # OK: 'name' in Student -dj.U('invalid') & Student # Error: 'invalid' not found -``` - -### 8.7 Error Conditions - -| Condition | Error | -|-----------|-------| -| `table * dj.U()` | `DataJointError` (use `&` instead) | -| `dj.U() - table` | `DataJointError` (infinite set) | -| U attributes not in table | `DataJointError` | -| `dj.U().aggr(..., exclude_nonmatching=False)` | `DataJointError` (cannot keep all rows from infinite set) | - ---- - -## 9. Semantic Matching - -### 9.1 Attribute Lineage - -Every attribute has a lineage tracing to its original definition: - -``` -schema.table.attribute -``` - -Foreign key inheritance preserves lineage: - -```python -class Session(dj.Manual): - definition = """ - -> Subject # Inherits subject_id with Subject's lineage - session_id : int - """ -``` - -### 9.2 Homologous Namesakes - -Two attributes are **homologous namesakes** if they have: -1. Same name -2. 
Same lineage (trace to same original definition) - -### 9.3 Non-Homologous Namesakes - -Attributes with same name but different lineage create semantic collisions: - -```python -# Both have 'name' but from different origins -Student * Course # Error if both have 'name' attribute -``` - -### 9.4 Resolution - -Rename to avoid collisions: - -```python -Student * Course.proj(..., course_name='name') -``` - -### 9.5 Semantic Check Parameter - -| Value | Behavior | -|-------|----------| -| `True` (default) | Match only homologous namesakes; error on collisions | -| `False` | Natural join on all namesakes regardless of lineage | - ---- - -## 10. Operator Precedence - -Python operator precedence applies: - -| Precedence | Operator | Operation | -|------------|----------|-----------| -| Highest | `*` | Join | -| | `+`, `-` | Union, Anti-restriction | -| Lowest | `&` | Restriction | - -Use parentheses for clarity: - -```python -(Session & condition) * Trial # Restrict then join -Session & (Trial * Stimulus) # Join then restrict -``` - ---- - -## 11. Subquery Generation - -Subqueries are generated automatically when needed: - -| Situation | Subquery Created | -|-----------|------------------| -| Restrict on computed attribute | Yes | -| Join on computed attribute | Yes | -| Aggregation operand | Yes | -| Union operand | Yes | -| Restriction after TOP | Yes | - ---- - -## 12. Top (`dj.Top`) - -### 12.1 Syntax - -```python -result = expression & dj.Top() # First row by primary key -result = expression & dj.Top(limit=5) # First 5 rows by primary key -result = expression & dj.Top(5, 'score DESC') # Top 5 by score descending -result = expression & dj.Top(10, order_by='date DESC') # Top 10 by date descending -result = expression & dj.Top(5, offset=10) # Skip 10, take 5 -result = expression & dj.Top(None, 'score DESC') # All rows, ordered by score -``` - -### 12.2 Parameters - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `limit` | `int` or `None` | `1` | Maximum rows to return. `None` = unlimited. | -| `order_by` | `str`, `list[str]`, or `None` | `"KEY"` | Ordering. `"KEY"` = primary key order. `None` = inherit existing order. | -| `offset` | `int` | `0` | Rows to skip before taking `limit`. 
| - -### 12.3 Ordering Specification - -| Format | Meaning | -|--------|---------| -| `"KEY"` | Order by primary key (ascending) | -| `"attr"` | Order by attribute (ascending) | -| `"attr DESC"` | Order by attribute (descending) | -| `"attr ASC"` | Order by attribute (ascending, explicit) | -| `["attr1 DESC", "attr2"]` | Multiple columns | -| `None` | Inherit ordering from existing Top | - -### 12.4 SQL Equivalent - -```sql -SELECT * FROM table -ORDER BY order_by -LIMIT limit OFFSET offset -``` - -### 12.5 Chaining Tops - -When multiple Tops are chained, behavior depends on the `order_by` parameter: - -| Scenario | Behavior | -|----------|----------| -| Second Top has `order_by=None` | **Merge**: inherits ordering, limits combined | -| Both Tops have identical `order_by` | **Merge**: ordering preserved, limits combined | -| Tops have different `order_by` | **Subquery**: first Top executed, then second applied | - -**Merge behavior:** -- `limit` = minimum of both limits -- `offset` = sum of both offsets -- `order_by` = preserved from first Top - -```python -# Merge: same result, single query -(Table & dj.Top(10, "score DESC")) & dj.Top(5, order_by=None) -# Effective: Top(5, "score DESC", offset=0) - -# Merge with offsets -(Table & dj.Top(10, "x", offset=5)) & dj.Top(3, order_by=None, offset=2) -# Effective: Top(3, "x", offset=7) - -# Subquery: different orderings -(Table & dj.Top(10, "score DESC")) & dj.Top(3, "id ASC") -# First selects top 10 by score, then reorders those 10 by id and takes 3 -``` - -### 12.6 Preview and Limit - -When fetching with a `limit` parameter, the limit is applied as an additional Top that inherits existing ordering: - -```python -# User applies custom ordering -query = Table & dj.Top(order_by="score DESC") - -# Preview respects the ordering -query.to_arrays("id", "score", limit=5) # Top 5 by score descending -``` - -Internally, `to_arrays(..., limit=N)` applies `dj.Top(N, order_by=None)`, which inherits the existing ordering. - -### 12.7 Use Cases - -**Top N rows:** -```python -# Top 10 highest scores -Result & dj.Top(10, "score DESC") -``` - -**Pagination:** -```python -# Page 3 (rows 20-29) sorted by date -Session & dj.Top(10, "session_date DESC", offset=20) -``` - -**Sampling (deterministic):** -```python -# First 100 rows by primary key -BigTable & dj.Top(100) -``` - -**Ordering without limit:** -```python -# All rows ordered by date -Session & dj.Top(None, "session_date DESC") -``` - -### 12.8 Algebraic Properties - -| Property | Value | -|----------|-------| -| Primary Key | Preserved: PK(result) = PK(input) | -| Attributes | Preserved: all attributes retained | -| Entity Type | Preserved | -| Row Order | Determined by `order_by` | - -### 12.9 Error Conditions - -| Condition | Error | -|-----------|-------| -| `limit` not int or None | `TypeError` | -| `order_by` not str, list[str], or None | `TypeError` | -| `offset` not int | `TypeError` | -| Top in OR list | `DataJointError` | -| Top in AndList | `DataJointError` | - ---- - -## 13. SQL Transpilation - -This section describes how DataJoint translates query expressions to SQL. - -### 13.1 MySQL Clause Evaluation Order - -MySQL differs from standard SQL in clause evaluation: - -``` -Standard SQL: FROM → WHERE → GROUP BY → HAVING → SELECT -MySQL: FROM → WHERE → SELECT → GROUP BY → HAVING -``` - -This allows `GROUP BY` and `HAVING` clauses to use alias column names created by `SELECT`. DataJoint targets MySQL's behavior where column aliases can be used in `HAVING`. 
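-
-As a sketch of the practical consequence, the aggregation example from section 5.6 restricts on a computed alias; the SQL shown is approximate rather than the exact generated statement:
-
-```python
-# The restriction on the alias 'n' compiles to a HAVING clause,
-# which MySQL accepts because SELECT aliases are visible to HAVING.
-query = Session.aggr(Trial, n='count(*)') & "n > 10"
-
-# Approximate SQL:
-#   SELECT session_id, count(*) AS n
-#   FROM session LEFT JOIN trial USING (session_id)
-#   GROUP BY session_id
-#   HAVING (n > 10)
-```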
- -### 13.2 QueryExpression Properties - -Each `QueryExpression` represents a `SELECT` statement with these properties: - -| Property | SQL Clause | Description | -|----------|------------|-------------| -| `heading` | `SELECT` | Attributes to retrieve | -| `restriction` | `WHERE` | List of conditions (AND) | -| `support` | `FROM` | Tables/subqueries to query | - -Operators create new expressions by combining these properties: -- `proj` → creates new `heading` -- `&` → appends to `restriction` -- `*` → adds to `support` - -### 13.3 Subquery Generation Rules - -Operators merge properties when possible, avoiding subqueries. A subquery is generated when: - -| Situation | Reason | -|-----------|--------| -| Restriction uses alias attributes | Alias must exist in SELECT before WHERE can reference it | -| Projection creates alias from alias | Must materialize first alias | -| Join on alias attribute | Alias must exist before join condition | -| Aggregation as operand | GROUP BY requires complete subquery | -| Union operand | UNION requires complete subqueries | - -When a subquery is created, the input becomes a `FROM` clause element in a new `QueryExpression`. - -### 13.4 Join Mechanics - -Joins combine the properties of both inputs: - -```python -result.support = A.support + B.support -result.restriction = A.restriction + B.restriction -result.heading = merge(A.heading, B.heading) -``` - -Restrictions from inputs propagate to the output. Inputs that don't become subqueries donate their supports, restrictions, and projections directly to the join. - -### 13.5 Aggregation SQL - -Aggregation translates to: - -```sql -SELECT A.pk, A.secondary, agg_func(B.col) AS computed -FROM A -LEFT JOIN B USING (pk) -WHERE -GROUP BY A.pk -HAVING -``` - -Key behavior: -- Restrictions on A → `WHERE` clause (before grouping) -- Restrictions on B or computed attributes → `HAVING` clause (after grouping) -- Aggregation never generates a subquery when restricted - -### 13.6 Union SQL - -Union performs an outer join: - -```sql -(SELECT A.pk, A.secondary, NULL as B.secondary FROM A) -UNION -(SELECT B.pk, NULL as A.secondary, B.secondary FROM B - WHERE B.pk NOT IN (SELECT pk FROM A)) -``` - -All union inputs become subqueries except unrestricted unions. - -### 13.7 Query Backprojection - -Before execution, `finalize()` recursively projects out unnecessary attributes from all inputs. This optimization: - -- Reduces data transfer (especially for blobs) -- Compensates for MySQL's query optimizer limitations -- Produces leaner queries for complex expressions - ---- - -## 14. Implementation Reference - -| File | Purpose | -|------|---------| -| `expression.py` | QueryExpression base class, operators | -| `condition.py` | Restriction condition handling, Top class | -| `heading.py` | Attribute metadata and lineage | -| `table.py` | Table class, fetch interface | -| `U.py` | Universal set implementation | - ---- - -## 15. 
Quick Reference - -| Operation | Syntax | Result PK | -|-----------|--------|-----------| -| Restrict | `A & cond` | PK(A) | -| Anti-restrict | `A - cond` | PK(A) | -| Project | `A.proj(...)` | PK(A) | -| Join | `A * B` | Depends on A→B | -| Aggregate | `A.aggr(B, ...)` | PK(A) | -| Extend | `A.extend(B)` | PK(A) | -| Union | `A + B` | PK(A) = PK(B) | -| Unique values | `dj.U('x') & A` | (x,) | -| Global aggregate | `dj.U().aggr(A, ...)` | () | - - ---- -## File: reference/specs/semantic-matching.md - -# Semantic Matching for Joins - Specification - -## Overview - -This document specifies **semantic matching** for joins in DataJoint 2.0, replacing the current name-based matching rules. Semantic matching ensures that attributes are only matched when they share both the same name and the same **lineage** (origin), preventing accidental joins on unrelated attributes that happen to share names. - -### Goals - -1. **Prevent incorrect joins** on attributes that share names but represent different entities -2. **Enable valid joins** that are currently blocked due to overly restrictive rules -3. **Maintain backward compatibility** for well-designed schemas -4. **Provide clear error messages** when semantic conflicts are detected - ---- - -## User Guide - -### Quick Start - -Semantic matching is enabled by default in DataJoint 2.0. For most well-designed schemas, no changes are required. - -#### When You Might See Errors - -```python -# Two tables with generic 'id' attribute -class Student(dj.Manual): - definition = """ - id : uint32 - --- - name : varchar(100) - """ - -class Course(dj.Manual): - definition = """ - id : uint32 - --- - title : varchar(100) - """ - -# This will raise an error because 'id' has different lineages -Student() * Course() # DataJointError! -``` - -#### How to Resolve - -**Option 1: Rename attributes using projection** -```python -Student() * Course().proj(course_id='id') # OK -``` - -**Option 2: Bypass semantic check (use with caution)** -```python -Student().join(Course(), semantic_check=False) # OK, but be careful! -``` - -**Option 3: Use descriptive names (best practice)** -```python -class Student(dj.Manual): - definition = """ - student_id : uint32 - --- - name : varchar(100) - """ -``` - -### Migrating from DataJoint 1.x - -#### Removed Operators - -| Old Syntax | New Syntax | -|------------|------------| -| `A @ B` | `A.join(B, semantic_check=False)` | -| `A ^ B` | `A.restrict(B, semantic_check=False)` | -| `dj.U('a') * B` | `dj.U('a') & B` | - -#### Rebuilding Lineage for Existing Schemas - -If you have existing schemas created before DataJoint 2.0, rebuild their lineage tables: - -```python -import datajoint as dj - -# Connect and get your schema -schema = dj.Schema('my_database') - -# Rebuild lineage (do this once per schema) -schema.rebuild_lineage() - -# Restart Python kernel to pick up changes -``` - -**Important**: If your schema references tables in other schemas, rebuild those upstream schemas first. - ---- - -## API Reference - -### Schema Methods - -#### `schema.rebuild_lineage()` - -Rebuild the `~lineage` table for all tables in this schema. - -```python -schema.rebuild_lineage() -``` - -**Description**: Recomputes lineage for all attributes by querying FK relationships from the database's `information_schema`. Use this to restore lineage for schemas that predate the lineage system or after corruption. 
- -**Requirements**: -- Schema must exist -- Upstream schemas (referenced via cross-schema FKs) must have their lineage rebuilt first - -**Side Effects**: -- Creates `~lineage` table if it doesn't exist -- Deletes and repopulates all lineage entries for tables in the schema - -**Post-Action**: Restart Python kernel and reimport to pick up new lineage information. - -#### `schema.lineage_table_exists` - -Property indicating whether the `~lineage` table exists in this schema. - -```python -if schema.lineage_table_exists: - print("Lineage tracking is enabled") -``` - -**Returns**: `bool` - `True` if `~lineage` table exists, `False` otherwise. - -#### `schema.lineage` - -Property returning all lineage entries for the schema. - -```python -schema.lineage -# {'myschema.session.session_id': 'myschema.session.session_id', -# 'myschema.trial.session_id': 'myschema.session.session_id', -# 'myschema.trial.trial_num': 'myschema.trial.trial_num'} -``` - -**Returns**: `dict` - Maps `'schema.table.attribute'` to its lineage origin - -### Join Methods - -#### `expr.join(other, semantic_check=True)` - -Join two expressions with optional semantic checking. - -```python -result = A.join(B) # semantic_check=True (default) -result = A.join(B, semantic_check=False) # bypass semantic check -``` - -**Parameters**: -- `other`: Another query expression to join with -- `semantic_check` (bool): If `True` (default), raise error on non-homologous namesakes. If `False`, perform natural join without lineage checking. - -**Raises**: `DataJointError` if `semantic_check=True` and namesake attributes have different lineages. - -#### `expr.restrict(other, semantic_check=True)` - -Restrict expression with optional semantic checking. - -```python -result = A.restrict(B) # semantic_check=True (default) -result = A.restrict(B, semantic_check=False) # bypass semantic check -``` - -**Parameters**: -- `other`: Restriction condition (expression, dict, string, etc.) -- `semantic_check` (bool): If `True` (default), raise error on non-homologous namesakes when restricting by another expression. If `False`, no lineage checking. - -**Raises**: `DataJointError` if `semantic_check=True` and namesake attributes have different lineages. - -### Operators - -#### `A * B` (Join) - -Equivalent to `A.join(B, semantic_check=True)`. - -#### `A & B` (Restriction) - -Equivalent to `A.restrict(B, semantic_check=True)`. - -#### `A - B` (Anti-restriction) - -Restriction with negation. Semantic checking applies. - -To bypass semantic checking: `A.restrict(dj.Not(B), semantic_check=False)` - -#### `A + B` (Union) - -Union of expressions. Requires all namesake attributes to have matching lineage. - -### Removed Operators - -#### `A @ B` (Removed) - -Raises `DataJointError` with migration guidance to use `.join(semantic_check=False)`. - -#### `A ^ B` (Removed) - -Raises `DataJointError` with migration guidance to use `.restrict(semantic_check=False)`. - -#### `dj.U(...) * A` (Removed) - -Raises `DataJointError` with migration guidance to use `dj.U(...) & A`. - -### Universal Set (`dj.U`) - -#### Valid Operations - -```python -dj.U('a', 'b') & A # Restriction: promotes a, b to PK -dj.U('a', 'b').aggr(A, ...) 
# Aggregation: groups by a, b -dj.U() & A # Distinct primary keys of A -``` - -#### Invalid Operations - -```python -dj.U('a', 'b') - A # DataJointError: produces infinite set -dj.U('a', 'b') * A # DataJointError: use & instead -``` - ---- - -## Concepts - -### Attribute Lineage - -Lineage identifies the **origin** of an attribute—the **dimension** where it was first defined. A dimension is an independent axis of variation introduced by a table that defines new primary key attributes. See [Schema Dimensions](../../explanation/entity-integrity.md#schema-dimensions) for details. - -Lineage is represented as a string: - -``` -schema_name.table_name.attribute_name -``` - -#### Lineage Assignment Rules - -| Attribute Type | Lineage Value | -|----------------|---------------| -| Native primary key | `this_schema.this_table.attr_name` | -| FK-inherited (primary or secondary) | Traced to original definition | -| Native secondary | `None` | -| Computed (in projection) | `None` | - -#### Example - -```python -class Session(dj.Manual): # table: session - definition = """ - session_id : uint32 - --- - session_date : date - """ - -class Trial(dj.Manual): # table: trial - definition = """ - -> Session - trial_num : uint16 - --- - stimulus : varchar(100) - """ -``` - -Lineages: -- `Session.session_id` → `myschema.session.session_id` (native PK) -- `Session.session_date` → `None` (native secondary) -- `Trial.session_id` → `myschema.session.session_id` (inherited via FK) -- `Trial.trial_num` → `myschema.trial.trial_num` (native PK) -- `Trial.stimulus` → `None` (native secondary) - -### Terminology - -| Term | Definition | -|------|------------| -| **Lineage** | The origin of an attribute: `schema.table.attribute` | -| **Homologous attributes** | Attributes with the same lineage | -| **Namesake attributes** | Attributes with the same name | -| **Homologous namesakes** | Same name AND same lineage — used for join matching | -| **Non-homologous namesakes** | Same name BUT different lineage — cause join errors | - -### Semantic Matching Rules - -| Scenario | Action | -|----------|--------| -| Same name, same lineage (both non-null) | **Match** | -| Same name, different lineage | **Error** | -| Same name, either lineage is null | **Error** | -| Different names | **No match** | - ---- - -## Implementation Details - -### `~lineage` Table - -Each schema has a hidden `~lineage` table storing lineage information: - -```sql -CREATE TABLE `schema_name`.`~lineage` ( - table_name VARCHAR(64) NOT NULL, - attribute_name VARCHAR(64) NOT NULL, - lineage VARCHAR(255) NOT NULL, - PRIMARY KEY (table_name, attribute_name) -) -``` - -### Lineage Population - -**At table declaration**: -1. Delete any existing lineage entries for the table -2. For FK attributes: copy lineage from parent (with warning if parent lineage missing) -3. For native PK attributes: set lineage to `schema.table.attribute` -4. 
Native secondary attributes: no entry (lineage = None) - -**At table drop**: -- Delete all lineage entries for the table - -### Missing Lineage Handling - -**If `~lineage` table doesn't exist**: -- Warning issued during semantic check -- Semantic checking disabled (join proceeds as natural join) - -**If parent lineage missing during declaration**: -- Warning issued -- Parent attribute used as origin -- Recommend rebuilding lineage after parent schema is fixed - -### Heading's `lineage_available` Property - -The `Heading` class tracks whether lineage information is available: - -```python -heading.lineage_available # True if ~lineage table exists for this schema -``` - -This property is: -- Set when heading is loaded from database -- Propagated through projections, joins, and other operations -- Used by `assert_join_compatibility` to decide whether to perform semantic checking - ---- - -## Error Messages - -### Non-Homologous Namesakes - -``` -DataJointError: Cannot join on attribute `id`: different lineages -(university.student.id vs university.course.id). -Use .proj() to rename one of the attributes. -``` - -### Removed `@` Operator - -``` -DataJointError: The @ operator has been removed in DataJoint 2.0. -Use .join(other, semantic_check=False) for permissive joins. -``` - -### Removed `^` Operator - -``` -DataJointError: The ^ operator has been removed in DataJoint 2.0. -Use .restrict(other, semantic_check=False) for permissive restrictions. -``` - -### Removed `dj.U * table` - -``` -DataJointError: dj.U(...) * table is no longer supported in DataJoint 2.0. -Use dj.U(...) & table instead. -``` - -### Missing Lineage Warning - -``` -WARNING: Semantic check disabled: ~lineage table not found. -To enable semantic matching, rebuild lineage with: schema.rebuild_lineage() -``` - -### Parent Lineage Missing Warning - -``` -WARNING: Lineage for `parent_db`.`parent_table`.`attr` not found -(parent schema's ~lineage table may be missing or incomplete). -Using it as origin. Once the parent schema's lineage is rebuilt, -run schema.rebuild_lineage() on this schema to correct the lineage. 
-``` - ---- - -## Examples - -### Example 1: Valid Join (Shared Lineage) - -```python -class Student(dj.Manual): - definition = """ - student_id : uint32 - --- - name : varchar(100) - """ - -class Enrollment(dj.Manual): - definition = """ - -> Student - -> Course - --- - grade : varchar(2) - """ - -# Works: student_id has same lineage in both -Student() * Enrollment() -``` - -### Example 2: Invalid Join (Different Lineage) - -```python -class TableA(dj.Manual): - definition = """ - id : uint32 - --- - value_a : int32 - """ - -class TableB(dj.Manual): - definition = """ - id : uint32 - --- - value_b : int32 - """ - -# Error: 'id' has different lineages -TableA() * TableB() - -# Solution 1: Rename -TableA() * TableB().proj(b_id='id') - -# Solution 2: Bypass (use with caution) -TableA().join(TableB(), semantic_check=False) -``` - -### Example 3: Multi-hop FK Inheritance - -```python -class Session(dj.Manual): - definition = """ - session_id : uint32 - --- - session_date : date - """ - -class Trial(dj.Manual): - definition = """ - -> Session - trial_num : uint16 - """ - -class Response(dj.Computed): - definition = """ - -> Trial - --- - response_time : float64 - """ - -# All work: session_id traces back to Session in all tables -Session() * Trial() -Session() * Response() -Trial() * Response() -``` - -### Example 4: Secondary FK Attribute - -```python -class Course(dj.Manual): - definition = """ - course_id : int unsigned - --- - title : varchar(100) - """ - -class FavoriteCourse(dj.Manual): - definition = """ - student_id : int unsigned - --- - -> Course - """ - -class RequiredCourse(dj.Manual): - definition = """ - major_id : int unsigned - --- - -> Course - """ - -# Works: course_id is secondary in both, but has same lineage -FavoriteCourse() * RequiredCourse() -``` - -### Example 5: Aliased Foreign Key - -```python -class Person(dj.Manual): - definition = """ - person_id : int unsigned - --- - full_name : varchar(100) - """ - -class Marriage(dj.Manual): - definition = """ - -> Person.proj(husband='person_id') - -> Person.proj(wife='person_id') - --- - marriage_date : date - """ - -# husband and wife both have lineage: schema.person.person_id -# They are homologous (same lineage) but have different names -``` - ---- - -## Best Practices - -1. **Use descriptive attribute names**: Prefer `student_id` over generic `id` - -2. **Leverage foreign keys**: Inherited attributes maintain lineage automatically - -3. **Rebuild lineage for legacy schemas**: Run `schema.rebuild_lineage()` once - -4. **Rebuild upstream schemas first**: For cross-schema FKs, rebuild parent schemas before child schemas - -5. **Restart after rebuilding**: Restart Python kernel to pick up new lineage information - -6. **Use `semantic_check=False` sparingly**: Only when you're certain the natural join is correct - - ---- -## File: reference/specs/table-declaration.md - -# DataJoint Table Declaration Specification - -## Overview - -This document specifies the table declaration mechanism in DataJoint Python. Table declarations define the schema structure using a domain-specific language (DSL) embedded in Python class definitions. - -## 1. 
Table Class Structure - -### 1.1 Basic Declaration Pattern - -```python -@schema -class TableName(dj.Manual): - definition = """ - # table comment - primary_attr : int32 - --- - secondary_attr : float64 - """ -``` - -### 1.2 Table Tiers - -| Tier | Base Class | Table Prefix | Purpose | -|------|------------|--------------|---------| -| Manual | `dj.Manual` | (none) | User-entered data | -| Lookup | `dj.Lookup` | `#` | Reference/enumeration data | -| Imported | `dj.Imported` | `_` | Data from external sources | -| Computed | `dj.Computed` | `__` | Derived from other tables | -| Part | `dj.Part` | `master__` | Detail records of master table | - -### 1.3 Class Naming Rules - -- **Format**: Strict CamelCase (e.g., `MyTable`, `ProcessedData`) -- **Pattern**: `^[A-Z][A-Za-z0-9]*$` -- **Conversion**: CamelCase to snake_case for SQL table name -- **Examples**: - - `SessionTrial` -> `session_trial` - - `ProcessedEMG` -> `processed_emg` - -### 1.4 Table Name Constraints - -- **Maximum length**: 64 characters (MySQL limit) -- **Final name**: prefix + snake_case(class_name) -- **Validation**: Checked at declaration time - ---- - -## 2. Definition String Grammar - -### 2.1 Overall Structure - -``` -[table_comment] -primary_key_section ---- -secondary_section -``` - -### 2.2 Table Comment (Optional) - -``` -# Free-form description of the table purpose -``` - -- Must be first non-empty line if present -- Starts with `#` -- Cannot start with `#:` -- Stored in MySQL table COMMENT - -### 2.3 Primary Key Separator - -``` ---- -``` - -or equivalently: - -``` -___ -``` - -- Three dashes or three underscores -- Separates primary key attributes (above) from secondary attributes (below) -- Required if table has secondary attributes - -### 2.4 Line Types - -Each non-empty, non-comment line is one of: - -1. **Attribute definition** -2. **Foreign key reference** -3. **Index declaration** - ---- - -## 3. Attribute Definition - -### 3.1 Syntax - -``` -attribute_name [= default_value] : type [# comment] -``` - -### 3.2 Components - -| Component | Required | Description | -|-----------|----------|-------------| -| `attribute_name` | Yes | Identifier for the column | -| `default_value` | No | Default value (before colon) | -| `type` | Yes | Data type specification | -| `comment` | No | Documentation (after `#`) | - -### 3.3 Attribute Name Rules - -- **Pattern**: `^[a-z][a-z0-9_]*$` -- **Start**: Lowercase letter -- **Contains**: Lowercase letters, digits, underscores -- **Convention**: snake_case - -### 3.4 Examples - -```python -definition = """ -# Experimental session with subject and timing info -session_id : int32 # auto-assigned ---- -subject_name : varchar(100) # subject identifier -trial_number = 1 : int32 # default to 1 -score = null : float32 # nullable -timestamp = CURRENT_TIMESTAMP : datetime # auto-timestamp -notes = '' : varchar(4000) # empty default -""" -``` - ---- - -## 4. 
Type System - -### 4.1 Core Types - -Scientist-friendly type names with guaranteed semantics: - -| Type | SQL Mapping | Size | Description | -|------|-------------|------|-------------| -| `int8` | `tinyint` | 1 byte | 8-bit signed integer | -| `uint8` | `tinyint unsigned` | 1 byte | 8-bit unsigned integer | -| `int16` | `smallint` | 2 bytes | 16-bit signed integer | -| `uint16` | `smallint unsigned` | 2 bytes | 16-bit unsigned integer | -| `int32` | `int` | 4 bytes | 32-bit signed integer | -| `uint32` | `int unsigned` | 4 bytes | 32-bit unsigned integer | -| `int64` | `bigint` | 8 bytes | 64-bit signed integer | -| `uint64` | `bigint unsigned` | 8 bytes | 64-bit unsigned integer | -| `float32` | `float` | 4 bytes | 32-bit IEEE 754 float | -| `float64` | `double` | 8 bytes | 64-bit IEEE 754 float | -| `bool` | `tinyint` | 1 byte | Boolean (0 or 1) | -| `uuid` | `binary(16)` | 16 bytes | UUID stored as binary | -| `bytes` | `longblob` | Variable | Binary data (up to 4GB) | - -### 4.2 String Types - -| Type | SQL Mapping | Description | -|------|-------------|-------------| -| `char(N)` | `char(N)` | Fixed-length string | -| `varchar(N)` | `varchar(N)` | Variable-length string (max N) | -| `enum('a','b',...)` | `enum(...)` | Enumerated values | - -### 4.3 Temporal Types - -| Type | SQL Mapping | Description | -|------|-------------|-------------| -| `date` | `date` | Date (YYYY-MM-DD) | -| `datetime` | `datetime` | Date and time | -| `datetime(N)` | `datetime(N)` | With fractional seconds (0-6) | - -### 4.4 Other Types - -| Type | SQL Mapping | Description | -|------|-------------|-------------| -| `json` | `json` | JSON document | -| `decimal(P,S)` | `decimal(P,S)` | Fixed-point decimal | - -### 4.5 Native SQL Types (Passthrough) - -These SQL types are accepted but generate a warning recommending core types: - -- Integer variants: `tinyint`, `smallint`, `mediumint`, `bigint`, `integer`, `serial` -- Float variants: `float`, `double`, `real` (with size specifiers) -- Text variants: `tinytext`, `mediumtext`, `longtext` -- Blob variants: `tinyblob`, `smallblob`, `mediumblob`, `longblob` -- Temporal: `time`, `timestamp`, `year` -- Numeric: `numeric(P,S)` - -### 4.6 Codec Types - -Format: `` or `` - -| Codec | Internal dtype | External dtype | Purpose | -|-------|---------------|----------------|---------| -| `` | `bytes` | `` | Serialized Python objects | -| `` | N/A (external only) | `json` | Hash-addressed deduped storage | -| `` | `bytes` | `` | File attachments with filename | -| `` | N/A (external only) | `json` | Reference to managed file | -| `` | N/A (external only) | `json` | Object storage (Zarr, HDF5) | - -External storage syntax: -- `` - default store -- `` - named store - -### 4.7 Type Reconstruction - -Core types and codecs are stored in the SQL COMMENT field for reconstruction: - -```sql -COMMENT ':float32:user comment here' -COMMENT '::user comment' -``` - ---- - -## 5. 
Default Values - -### 5.1 Syntax - -``` -attribute_name = default_value : type -``` - -### 5.2 Literal Types - -| Value | Meaning | SQL | -|-------|---------|-----| -| `null` | Nullable attribute | `DEFAULT NULL` | -| `CURRENT_TIMESTAMP` | Server timestamp | `DEFAULT CURRENT_TIMESTAMP` | -| `"string"` or `'string'` | String literal | `DEFAULT "string"` | -| `123` | Numeric literal | `DEFAULT 123` | -| `true`/`false` | Boolean | `DEFAULT 1`/`DEFAULT 0` | - -### 5.3 Constant Literals - -These values are used without quotes in SQL: -- `NULL` -- `CURRENT_TIMESTAMP` - -### 5.4 Nullable Attributes - -``` -score = null : float32 -``` - -- The special default `null` (case-insensitive) makes the attribute nullable -- Nullable attributes can be omitted from INSERT -- Primary key attributes CANNOT be nullable - -### 5.5 Blob/JSON Default Restrictions - -Blob and JSON attributes can only have `null` as default: - -```python -# Valid -data = null : - -# Invalid - raises DataJointError -data = '' : -``` - ---- - -## 6. Foreign Key References - -### 6.1 Syntax - -``` --> [options] ReferencedTable -``` - -### 6.2 Options - -| Option | Effect | -|--------|--------| -| `nullable` | All inherited attributes become nullable | -| `unique` | Creates UNIQUE INDEX on FK attributes | - -Options are comma-separated in brackets: -``` --> [nullable, unique] ParentTable -``` - -### 6.3 Attribute Inheritance - -Foreign keys automatically inherit all primary key attributes from the referenced table: - -```python -# Parent -class Subject(dj.Manual): - definition = """ - subject_id : int32 - --- - name : varchar(100) - """ - -# Child - inherits subject_id -class Session(dj.Manual): - definition = """ - -> Subject - session_id : int32 - --- - session_date : date - """ -``` - -### 6.4 Position Rules - -| Position | Effect | -|----------|--------| -| Before `---` | FK attributes become part of primary key | -| After `---` | FK attributes are secondary (dependent) | - -### 6.5 Nullable Foreign Keys - -``` --> [nullable] OptionalParent -``` - -- Only allowed after `---` (secondary) -- Primary key FKs cannot be nullable -- Creates optional relationship - -### 6.6 Unique Foreign Keys - -``` --> [unique] ParentTable -``` - -- Creates UNIQUE INDEX on inherited attributes -- Enforces one-to-one relationship from child perspective - -### 6.7 Projections in Foreign Keys - -``` --> Parent.proj(alias='original_name') -``` - -- Reference same table multiple times with different attribute names -- Useful for self-referential or multi-reference patterns - -### 6.8 Referential Actions - -All foreign keys use: -- `ON UPDATE CASCADE` - Parent key changes propagate -- `ON DELETE RESTRICT` - Cannot delete parent with children - -### 6.9 Lineage Tracking - -Foreign key relationships are recorded in the `~lineage` table: - -```python -{ - 'child_attr': ('parent_schema.parent_table', 'parent_attr') -} -``` - -Used for semantic attribute matching in queries. - ---- - -## 7. Index Declarations - -### 7.1 Syntax - -``` -index(attr1, attr2, ...) -unique index(attr1, attr2, ...) 
-``` - -### 7.2 Examples - -```python -definition = """ -# User contact information -user_id : int32 ---- -first_name : varchar(50) -last_name : varchar(50) -email : varchar(100) -index(last_name, first_name) -unique index(email) -""" -``` - -### 7.3 Computed Expressions - -Indexes can include SQL expressions: - -``` -index(last_name, (YEAR(birth_date))) -``` - -### 7.4 Limitations - -- Cannot be altered after table creation (via `table.alter()`) -- Must reference existing attributes - ---- - -## 8. Part Tables - -### 8.1 Declaration - -```python -@schema -class Master(dj.Manual): - definition = """ - master_id : int32 - """ - - class Detail(dj.Part): - definition = """ - -> master - detail_id : int32 - --- - value : float32 - """ -``` - -### 8.2 Naming - -- SQL name: `master_table__part_name` -- Example: `experiment__trial` - -### 8.3 Master Reference - -Within Part definition, use: -- `-> master` (lowercase keyword) -- `-> MasterClassName` (class name) - -### 8.4 Constraints - -- Parts must reference their master -- Cannot delete Part records directly (use master) -- Cannot drop Part table directly (use master) -- Part inherits master's primary key - ---- - -## 9. Auto-Populated Tables - -### 9.1 Classes - -- `dj.Imported` - Data from external sources -- `dj.Computed` - Derived from other DataJoint tables - -### 9.2 Primary Key Constraint - -All primary key attributes must come from foreign key references. - -**Valid:** -```python -class Analysis(dj.Computed): - definition = """ - -> Session - -> Parameter - --- - result : float64 - """ -``` - -**Invalid** (by default): -```python -class Analysis(dj.Computed): - definition = """ - -> Session - analysis_id : int32 # ERROR: non-FK primary key - --- - result : float64 - """ -``` - -**Override:** -```python -dj.config['jobs.allow_new_pk_fields_in_computed_tables'] = True -``` - -### 9.3 Job Metadata - -When `config['jobs.add_job_metadata'] = True`, auto-populated tables receive: - -| Column | Type | Description | -|--------|------|-------------| -| `_job_start_time` | `datetime(3)` | Job start timestamp | -| `_job_duration` | `float64` | Duration in seconds | -| `_job_version` | `varchar(64)` | Code version | - ---- - -## 10. Validation - -### 10.1 Parse-Time Checks - -| Check | Error | -|-------|-------| -| Unknown type | `DataJointError: Unsupported attribute type` | -| Invalid attribute name | `DataJointError: Declaration error` | -| Comment starts with `:` | `DataJointError: comment must not start with colon` | -| Non-null blob default | `DataJointError: default value for blob can only be NULL` | - -### 10.2 Declaration-Time Checks - -| Check | Error | -|-------|-------| -| Table name > 64 chars | `DataJointError: Table name exceeds max length` | -| No primary key | `DataJointError: Table must have a primary key` | -| Nullable primary key attr | `DataJointError: Primary key attributes cannot be nullable` | -| Invalid CamelCase | `DataJointError: Invalid table name` | -| FK resolution failure | `DataJointError: Foreign key reference could not be resolved` | - -### 10.3 Insert-Time Validation - -The `table.validate()` method checks: -- Required fields present -- NULL constraints satisfied -- Primary key completeness -- Codec validation (if defined) -- UUID format -- JSON serializability - ---- - -## 11. 
SQL Generation - -### 11.1 CREATE TABLE Template - -```sql -CREATE TABLE `schema`.`table_name` ( - `attr1` TYPE1 NOT NULL COMMENT "...", - `attr2` TYPE2 DEFAULT NULL COMMENT "...", - PRIMARY KEY (`pk1`, `pk2`), - FOREIGN KEY (`fk_attr`) REFERENCES `parent` (`pk`) - ON UPDATE CASCADE ON DELETE RESTRICT, - INDEX (`idx_attr`), - UNIQUE INDEX (`uniq_attr`) -) ENGINE=InnoDB COMMENT="table comment" -``` - -### 11.2 Type Comment Encoding - -Core types and codecs are preserved in comments: - -```sql -`value` float NOT NULL COMMENT ":float32:measurement value" -`data` longblob DEFAULT NULL COMMENT "::serialized data" -`archive` json DEFAULT NULL COMMENT "::external storage" -``` - ---- - -## 12. Implementation Files - -| File | Purpose | -|------|---------| -| `declare.py` | Definition parsing, SQL generation | -| `heading.py` | Attribute metadata, type reconstruction | -| `table.py` | Base Table class, declaration interface | -| `user_tables.py` | Tier classes (Manual, Computed, etc.) | -| `schemas.py` | Schema binding, table decoration | -| `codecs.py` | Codec registry and resolution | -| `lineage.py` | Attribute lineage tracking | - ---- - -## 13. Future Considerations - -Potential improvements identified for the declaration system: - -1. **Better error messages** with suggestions and context -2. **Import-time validation** via `__init_subclass__` -3. **Parser alternatives** (regex-based for simpler grammar) -4. **SQL dialect abstraction** for multi-database support -5. **Extended constraints** (CHECK, custom validation) -6. **Migration support** for schema evolution -7. **Definition caching** for performance -8. **IDE tooling** support via structured intermediate representation - - ---- -## File: reference/specs/type-system.md - -# Storage Types Redesign Spec - -## Overview - -This document defines a three-layer type architecture: - -1. **Native database types** - Backend-specific (`FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB`). Discouraged for direct use. -2. **Core DataJoint types** - Standardized across backends, scientist-friendly (`float32`, `uint8`, `bool`, `json`). -3. **Codec Types** - Programmatic types with `encode()`/`decode()` semantics. Composable. - -| Layer | Description | Examples | -|-------|-------------|----------| -| **3. Codec Types** | Programmatic types with `encode()`/`decode()` semantics | ``, ``, ``, ``, ``, user-defined | -| **2. Core DataJoint** | Standardized, scientist-friendly types (preferred) | `int32`, `float64`, `varchar(n)`, `bool`, `datetime`, `json`, `bytes` | -| **1. Native Database** | Backend-specific types (discouraged) | `INT`, `FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB` | - -Codec types resolve through core types to native types: `` → `bytes` → `LONGBLOB`. 
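To make the three layers concrete: a codec such as `<blob>` (layer 3) resolves to the core `bytes` type (layer 2), which maps to `LONGBLOB` on MySQL (layer 1). The following is a minimal sketch of a table definition that mixes core and codec types, assuming DataJoint 2.0 declaration syntax as described in this spec; the schema and attribute names are illustrative only:

```python
import datajoint as dj

schema = dj.Schema('types_demo')  # illustrative schema name

@schema
class Recording(dj.Manual):
    definition = """
    recording_id : int32           # core type (layer 2) -> INT (layer 1)
    ---
    notes = ''   : varchar(255)    # core type -> VARCHAR(255)
    waveform     : <blob>          # codec (layer 3) -> bytes -> LONGBLOB
    """
```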
- -**Syntax distinction:** -- Core types: `int32`, `float64`, `varchar(255)` - no brackets -- Codec types: ``, ``, `` - angle brackets -- The `@` character indicates store (object storage vs in-table) - -### OAS Addressing Schemes - -| Scheme | Path Pattern | Description | Use Case | -|--------|--------------|-------------|----------| -| **Schema-addressed** | `{schema}/{table}/{pk}/` | Path mirrors database structure | Large objects, Zarr, HDF5, numpy arrays | -| **Hash-addressed** | `_hash/{hash}` | Path from content hash (MD5) | Deduplicated blobs/attachments | - -### URL Representation - -DataJoint uses consistent URL representation for all storage backends: - -| Protocol | URL Format | Example | -|----------|------------|---------| -| Local filesystem | `file://` | `file:///data/objects/file.dat` | -| Amazon S3 | `s3://` | `s3://bucket/path/file.dat` | -| Google Cloud | `gs://` | `gs://bucket/path/file.dat` | -| Azure Blob | `az://` | `az://container/path/file.dat` | - -This unified approach treats all storage backends uniformly via fsspec, enabling: -- Consistent path handling across local and cloud storage -- Transparent switching between storage backends -- Streaming access to any storage type - -### Store References - -`` provides portable relative paths within configured stores with lazy ObjectRef access. -For arbitrary URLs that don't need ObjectRef semantics, use `varchar` instead. - -## Core DataJoint Types (Layer 2) - -Core types provide a standardized, scientist-friendly interface that works identically across -MySQL and PostgreSQL backends. Users should prefer these over native database types. - -**All core types are recorded in field comments using `:type:` syntax for reconstruction.** - -### Numeric Types - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `int8` | 8-bit signed | `TINYINT` | `SMALLINT` | -| `int16` | 16-bit signed | `SMALLINT` | `SMALLINT` | -| `int32` | 32-bit signed | `INT` | `INTEGER` | -| `int64` | 64-bit signed | `BIGINT` | `BIGINT` | -| `uint8` | 8-bit unsigned | `TINYINT UNSIGNED` | `SMALLINT` | -| `uint16` | 16-bit unsigned | `SMALLINT UNSIGNED` | `INTEGER` | -| `uint32` | 32-bit unsigned | `INT UNSIGNED` | `BIGINT` | -| `uint64` | 64-bit unsigned | `BIGINT UNSIGNED` | `NUMERIC(20)` | -| `float32` | 32-bit float | `FLOAT` | `REAL` | -| `float64` | 64-bit float | `DOUBLE` | `DOUBLE PRECISION` | -| `decimal(n,f)` | Fixed-point | `DECIMAL(n,f)` | `NUMERIC(n,f)` | - -### String Types - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `char(n)` | Fixed-length | `CHAR(n)` | `CHAR(n)` | -| `varchar(n)` | Variable-length | `VARCHAR(n)` | `VARCHAR(n)` | - -**Encoding:** All strings use UTF-8 (`utf8mb4` in MySQL, `UTF8` in PostgreSQL). -See [Encoding and Collation Policy](#encoding-and-collation-policy) for details. - -### Boolean - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `bool` | True/False | `TINYINT` | `BOOLEAN` | - -### Date/Time Types - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `date` | Date only | `DATE` | `DATE` | -| `datetime` | Date and time | `DATETIME` | `TIMESTAMP` | - -**Timezone policy:** All `datetime` values should be stored as **UTC**. Timezone conversion is a -presentation concern handled by the application layer, not the database. 
This ensures: -- Reproducible computations regardless of server or client timezone settings -- Simple arithmetic on temporal values (no DST ambiguity) -- Portable data across systems and regions - -Use `CURRENT_TIMESTAMP` for auto-populated creation times: -``` -created_at : datetime = CURRENT_TIMESTAMP -``` - -### Binary Types - -The core `bytes` type stores raw bytes without any serialization. Use the `` codec -for serialized Python objects. - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `bytes` | Raw bytes | `LONGBLOB` | `BYTEA` | - -### Other Types - -| Core Type | Description | MySQL | PostgreSQL | -|-----------|-------------|-------|------------| -| `json` | JSON document | `JSON` | `JSONB` | -| `uuid` | UUID | `BINARY(16)` | `UUID` | -| `enum(...)` | Enumeration | `ENUM(...)` | `CREATE TYPE ... AS ENUM` | - -### Native Passthrough Types - -Users may use native database types directly (e.g., `int`, `float`, `mediumint`, `tinyblob`), -but these are discouraged and will generate a warning. Native types lack explicit size -information, are not recorded in field comments, and may have portability issues across -database backends. - -**Prefer core DataJoint types over native types:** - -| Native (discouraged) | Core DataJoint (preferred) | -|---------------------|---------------------------| -| `int` | `int32` | -| `float` | `float32` or `float64` | -| `double` | `float64` | -| `tinyint` | `int8` | -| `tinyint unsigned` | `uint8` | -| `smallint` | `int16` | -| `bigint` | `int64` | - -### Type Modifiers Policy - -DataJoint table definitions have their own syntax for constraints and metadata. SQL type -modifiers are **not allowed** in type specifications because they conflict with DataJoint's -declarative syntax: - -| Modifier | Status | DataJoint Alternative | -|----------|--------|----------------------| -| `NOT NULL` / `NULL` | ❌ Not allowed | Use `= NULL` for nullable; omit default for required | -| `DEFAULT value` | ❌ Not allowed | Use `= value` syntax before the type | -| `PRIMARY KEY` | ❌ Not allowed | Position above `---` line | -| `UNIQUE` | ❌ Not allowed | Use DataJoint index syntax | -| `COMMENT 'text'` | ❌ Not allowed | Use `# comment` syntax | -| `CHARACTER SET` | ❌ Not allowed | Database-level configuration | -| `COLLATE` | ❌ Not allowed | Database-level configuration | -| `AUTO_INCREMENT` | ⚠️ Discouraged | Allowed with native types only, generates warning | -| `UNSIGNED` | ✅ Allowed | Part of type semantics (use `uint*` core types) | - -**Nullability and defaults:** DataJoint handles nullability through the default value syntax. -An attribute is nullable if and only if its default is `NULL`: - -``` -# Required (NOT NULL, no default) -name : varchar(100) - -# Nullable (default is NULL) -nickname = NULL : varchar(100) - -# Required with default value -status = "active" : varchar(20) -``` - -**Auto-increment policy:** DataJoint discourages `AUTO_INCREMENT` / `SERIAL` because: -- Breaks reproducibility (IDs depend on insertion order) -- Makes pipelines non-deterministic -- Complicates data migration and replication -- Primary keys should be meaningful, not arbitrary - -If required, use native types: `int auto_increment` or `serial` (with warning). - -### Encoding and Collation Policy - -Character encoding and collation are **database-level configuration**, not part of type -definitions. This ensures consistent behavior across all tables and simplifies portability. 
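As a rough illustration, these database-level settings could also be applied from Python through `dj.config` before connecting; the keys simply mirror the `datajoint.json` example shown below, so treat this as a sketch of assumed configuration usage rather than a confirmed API surface:

```python
import datajoint as dj

# Assumed keys, mirroring the datajoint.json example in this spec;
# charset/collation apply at the connection level, never per column.
dj.config['database.charset'] = 'utf8mb4'
dj.config['database.collation'] = 'utf8mb4_bin'

conn = dj.conn()  # per this spec, UTF-8 compatibility is validated at connection time
```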
- -**Configuration** (in `dj.config` or `datajoint.json`): -```json -{ - "database.charset": "utf8mb4", - "database.collation": "utf8mb4_bin" -} -``` - -**Defaults:** - -| Setting | MySQL | PostgreSQL | -|---------|-------|------------| -| Charset | `utf8mb4` | `UTF8` | -| Collation | `utf8mb4_bin` | `C` | - -**Policy:** -- **UTF-8 required**: DataJoint validates charset is UTF-8 compatible at connection time -- **Case-sensitive by default**: Binary collation (`utf8mb4_bin` / `C`) ensures predictable comparisons -- **No per-column overrides**: `CHARACTER SET` and `COLLATE` are rejected in type definitions -- **Like timezone**: Encoding is infrastructure configuration, not part of the data model - -## Codec Types (Layer 3) - -Codec types provide `encode()`/`decode()` semantics on top of core types. They are -composable and can be built-in or user-defined. - -### Storage Mode: `@` Convention - -The `@` character in codec syntax indicates **object store** (vs in-table): - -- **No `@`**: In-table storage (database column) - e.g., ``, `` -- **`@` present**: Object store - e.g., ``, `` -- **`@` alone**: Use default store - e.g., `` -- **`@name`**: Use named store - e.g., `` - -Some codecs support both modes (``, ``), others are store-only (``, ``, ``, ``). - -### Codec Base Class - -Codecs inherit from `dj.Codec` and auto-register when their class is defined. See the [Codec API Specification](codec-api.md) for complete details on creating custom codecs. - -```python -class GraphCodec(dj.Codec): - """Auto-registered as .""" - name = "graph" - - def get_dtype(self, is_store: bool) -> str: - return "" - - def encode(self, graph, *, key=None, store_name=None): - return {'nodes': list(graph.nodes()), 'edges': list(graph.edges())} - - def decode(self, stored, *, key=None): - import networkx as nx - G = nx.Graph() - G.add_nodes_from(stored['nodes']) - G.add_edges_from(stored['edges']) - return G -``` - -### Codec Resolution and Chaining - -Codecs resolve to core types through chaining. The `get_dtype(is_store)` method -returns the appropriate dtype based on storage mode: - -| Codec | `is_store` | Resolution Chain | SQL Type | -|-------|------------|------------------|----------| -| `` | `False` | `"bytes"` | `LONGBLOB`/`BYTEA` | -| `` | `True` | `""` → `"json"` | `JSON`/`JSONB` | -| `` | `True` | `""` → `"json"` (store=cold) | `JSON`/`JSONB` | -| `` | `False` | `"bytes"` | `LONGBLOB`/`BYTEA` | -| `` | `True` | `""` → `"json"` | `JSON`/`JSONB` | -| `` | `True` | `"json"` | `JSON`/`JSONB` | -| `` | `True` | `"json"` | `JSON`/`JSONB` | -| `` | `False` | ERROR (store only) | — | -| `` | `False` | ERROR (store only) | — | -| `` | `True` | `"json"` | `JSON`/`JSONB` | -| `` | `True` | `"json"` | `JSON`/`JSONB` | - -### `` / `` - Schema-Addressed Storage - -**Built-in codec. 
Store only.** - -Schema-addressed OAS storage for complex, multi-part objects (files, folders, Zarr arrays, HDF5): - -- **Schema-addressed**: Path mirrors database structure: `{schema}/{table}/{pk}/{attribute}/` -- **Complex objects**: Can store directory structures with multiple files (e.g., Zarr arrays) -- One-to-one relationship with table row -- Deleted when row is deleted -- Returns `ObjectRef` for lazy access -- Supports direct writes (Zarr, HDF5) via fsspec -- **dtype**: `json` (stores path, store name, metadata) - -```python -class Analysis(dj.Computed): - definition = """ - -> Recording - --- - results : # default store - archive : # specific store - """ -``` - -#### Implementation - -```python -class ObjectCodec(SchemaCodec): - """Schema-addressed OAS storage. Store only.""" - name = "object" - - # get_dtype inherited from SchemaCodec - - def encode(self, value, *, key=None, store_name=None) -> dict: - schema, table, field, pk = self._extract_context(key) - path, _ = self._build_path(schema, table, field, pk) - backend = self._get_backend(store_name) - backend.put(path, value) - return {"path": path, "store": store_name, ...} - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - backend = self._get_backend(stored["store"]) - return ObjectRef.from_json(stored, backend=backend) -``` - -### `` / `` - Hash-Addressed Storage - -**Built-in codec. Store only.** - -Hash-addressed storage with deduplication for individual, atomic objects: - -- **Hash-addressed**: Path derived from content hash: `_hash/{hash[:2]}/{hash[2:4]}/{hash}` -- **Individual/atomic objects only**: Stores single files or serialized blobs (not directory structures) -- Cannot handle complex multi-part objects like Zarr arrays—use `` for those -- **Per-project scope**: content is shared across all schemas in a project (not per-schema) -- Many-to-one: multiple rows (even across schemas) can reference same content -- Reference counted for garbage collection -- Deduplication: identical content stored once across the entire project -- **dtype**: `json` (stores hash, store name, size, metadata) - -``` -store_root/ -├── {schema}/{table}/{pk}/ # schema-addressed storage -│ └── {attribute}/ -│ -└── _hash/ # hash-addressed storage - └── {hash[:2]}/{hash[2:4]}/{hash} -``` - -#### Implementation - -```python -class HashCodec(dj.Codec): - """Hash-addressed storage. 
Store only.""" - name = "hash" - - def get_dtype(self, is_store: bool) -> str: - if not is_store: - raise DataJointError(" requires @ (store only)") - return "json" - - def encode(self, data: bytes, *, key=None, store_name=None) -> dict: - """Store content, return metadata as JSON.""" - hash_id = hashlib.md5(data).hexdigest() # 32-char hex - store = get_store(store_name or dj.config['stores']['default']) - path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" - - if not store.exists(path): - store.put(path, data) - - # Metadata stored in JSON column (no separate registry) - return {"hash": hash_id, "store": store_name, "size": len(data)} - - def decode(self, stored: dict, *, key=None) -> bytes: - """Retrieve content by hash.""" - store = get_store(stored["store"]) - path = f"_hash/{stored['hash'][:2]}/{stored['hash'][2:4]}/{stored['hash']}" - return store.get(path) -``` - -#### Database Column - -The `` type stores JSON metadata: - -```sql --- content column (MySQL) -features JSON NOT NULL --- Contains: {"hash": "abc123...", "store": "main", "size": 12345} - --- content column (PostgreSQL) -features JSONB NOT NULL -``` - -### `` - Portable External Reference - -**Built-in codec. External only (store required).** - -Relative path references within configured stores: - -- **Relative paths**: paths within a configured store (portable across environments) -- **Store-aware**: resolves paths against configured store backend -- Returns `ObjectRef` for lazy access via fsspec -- Stores optional checksum for verification -- **dtype**: `json` (stores path, store name, checksum, metadata) - -**Key benefit**: Portability. The path is relative to the store, so pipelines can be moved -between environments (dev → prod, cloud → local) by changing store configuration without -updating data. - -```python -class RawData(dj.Manual): - definition = """ - session_id : int32 - --- - recording : # relative path within 'main' store - """ - -# Insert - user provides relative path within the store -table.insert1({ - 'session_id': 1, - 'recording': 'experiment_001/data.nwb' # relative to main store root -}) - -# Fetch - returns ObjectRef (lazy) -row = (table & 'session_id=1').fetch1() -ref = row['recording'] # ObjectRef -ref.download('/local/path') # explicit download -ref.open() # fsspec streaming access -``` - -#### When to Use `` vs `varchar` - -| Use Case | Recommended Type | -|----------|------------------| -| Need ObjectRef/lazy access | `` | -| Need portability (relative paths) | `` | -| Want checksum verification | `` | -| Just storing a URL string | `varchar` | -| External URLs you don't control | `varchar` | - -For arbitrary URLs (S3, HTTP, etc.) where you don't need ObjectRef semantics, -just use `varchar`. A string is simpler and more transparent. - -#### Implementation - -```python -class FilepathCodec(dj.Codec): - """Store-relative file references. 
External only.""" - name = "filepath" - - def get_dtype(self, is_external: bool) -> str: - if not is_external: - raise DataJointError(" requires @store") - return "json" - - def encode(self, relative_path: str, *, key=None, store_name=None) -> dict: - """Register reference to file in store.""" - store = get_store(store_name) # store_name required for filepath - return {'path': relative_path, 'store': store_name} - - def decode(self, stored: dict, *, key=None) -> ObjectRef: - """Return ObjectRef for lazy access.""" - return ObjectRef(store=get_store(stored['store']), path=stored['path']) -``` - -#### Database Column - -```sql --- filepath column (MySQL) -recording JSON NOT NULL --- Contains: {"path": "experiment_001/data.nwb", "store": "main", "checksum": "...", "size": ...} - --- filepath column (PostgreSQL) -recording JSONB NOT NULL -``` - -#### Key Differences from Legacy `filepath@store` (now ``) - -| Feature | Legacy | New | -|---------|--------|-----| -| Access | Copy to local stage | ObjectRef (lazy) | -| Copying | Automatic | Explicit via `ref.download()` | -| Streaming | No | Yes via `ref.open()` | -| Paths | Relative | Relative (unchanged) | -| Store param | Required (`@store`) | Required (`@store`) | - -## Database Types - -### `json` - Cross-Database JSON Type - -JSON storage compatible across MySQL and PostgreSQL: - -```sql --- MySQL -column_name JSON NOT NULL - --- PostgreSQL (uses JSONB for better indexing) -column_name JSONB NOT NULL -``` - -The `json` database type: -- Used as dtype by built-in codecs (``, ``, ``) -- Stores arbitrary JSON-serializable data -- Automatically uses appropriate type for database backend -- Supports JSON path queries where available - -## Built-in Codecs - -### `` / `` - Serialized Python Objects - -**Supports both internal and external storage.** - -Serializes Python objects (NumPy arrays, dicts, lists, etc.) using DataJoint's -blob format. Compatible with MATLAB. - -- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) -- **``**: Stored externally via `` with deduplication -- **``**: Stored in specific named store - -```python -class BlobCodec(dj.Codec): - """Serialized Python objects. Supports internal and external.""" - name = "blob" - - def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "bytes" - - def encode(self, value, *, key=None, store_name=None) -> bytes: - from . import blob - return blob.pack(value, compress=True) - - def decode(self, stored, *, key=None) -> Any: - from . import blob - return blob.unpack(stored) -``` - -Usage: -```python -class ProcessedData(dj.Computed): - definition = """ - -> RawData - --- - small_result : # internal (in database) - large_result : # external (default store) - archive_result : # external (specific store) - """ -``` - -### `` / `` - File Attachments - -**Supports both internal and external storage.** - -Stores files with filename preserved. On fetch, extracts to configured download path. - -- **``**: Stored in database (`bytes` → `LONGBLOB`/`BYTEA`) -- **``**: Stored externally via `` with deduplication -- **``**: Stored in specific named store - -```python -class AttachCodec(dj.Codec): - """File attachment with filename. 
Supports internal and external.""" - name = "attach" - - def get_dtype(self, is_external: bool) -> str: - return "" if is_external else "bytes" - - def encode(self, filepath, *, key=None, store_name=None) -> bytes: - path = Path(filepath) - return path.name.encode() + b"\0" + path.read_bytes() - - def decode(self, stored, *, key=None) -> str: - filename, contents = stored.split(b"\0", 1) - filename = filename.decode() - download_path = Path(dj.config['download_path']) / filename - download_path.write_bytes(contents) - return str(download_path) -``` - -Usage: -```python -class Attachments(dj.Manual): - definition = """ - attachment_id : int32 - --- - config : # internal (small file in DB) - data_file : # external (default store) - archive : # external (specific store) - """ -``` - -## User-Defined Codecs - -Users can define custom codecs for domain-specific data. See the [Codec API Specification](codec-api.md) for complete examples including: - -- Simple serialization codecs -- External storage codecs -- JSON with schema validation -- Context-dependent encoding -- External-only codecs (Zarr, HDF5) - -## Storage Comparison - -| Type | get_dtype | Resolves To | Storage Location | Dedup | Returns | -|------|-----------|-------------|------------------|-------|---------| -| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Python object | -| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | -| `` | `` | `json` | `_hash/{hash}` | Yes | Python object | -| `` | `bytes` | `LONGBLOB`/`BYTEA` | Database | No | Local file path | -| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | -| `` | `` | `json` | `_hash/{hash}` | Yes | Local file path | -| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `JSON`/`JSONB` | `{schema}/{table}/{pk}/` | No | ObjectRef | -| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | -| `` | `json` | `JSON`/`JSONB` | `_hash/{hash}` | Yes | bytes | -| `` | `json` | `JSON`/`JSONB` | Configured store | No | ObjectRef | - -## Garbage Collection for Hash Storage - -Hash metadata (hash, store, size) is stored directly in each table's JSON column - no separate -registry table is needed. 
Garbage collection scans all tables to find referenced hashes: - -```python -def garbage_collect(store_name): - """Remove hash-addressed data not referenced by any table.""" - # Scan store for all hash files - store = get_store(store_name) - all_hashes = set(store.list_hashes()) # from _hash/ directory - - # Scan all tables for referenced hashes - referenced = set() - for schema in project.schemas: - for table in schema.tables: - for attr in table.heading.attributes: - if uses_hash_storage(attr): # , , - for row in table: - val = row.get(attr.name) - if val and val.get('store') == store_name: - referenced.add(val['hash']) - - # Delete orphaned files - for hash_id in (all_hashes - referenced): - store.delete(hash_path(hash_id)) -``` - -## Built-in Codec Comparison - -| Feature | `` | `` | `` | `` | `` | -|---------|----------|------------|-------------|--------------|---------------| -| Storage modes | Both | Both | External only | External only | External only | -| Internal dtype | `bytes` | `bytes` | N/A | N/A | N/A | -| External dtype | `` | `` | `json` | `json` | `json` | -| Addressing | Hash | Hash | Primary key | Hash | Relative path | -| Deduplication | Yes (external) | Yes (external) | No | Yes | No | -| Structure | Single blob | Single file | Files, folders | Single blob | Any | -| Returns | Python object | Local path | ObjectRef | bytes | ObjectRef | -| GC | Ref counted | Ref counted | With row | Ref counted | User managed | - -**When to use each:** -- **``**: Serialized Python objects (NumPy arrays, dicts). Use `` for large/duplicated data -- **``**: File attachments with filename preserved. Use `` for large files -- **``**: Large/complex file structures (Zarr, HDF5) where DataJoint controls organization -- **``**: Raw bytes with deduplication (typically used via `` or ``) -- **``**: Portable references to externally-managed files -- **`varchar`**: Arbitrary URLs/paths where ObjectRef semantics aren't needed - -## Key Design Decisions - -1. **Three-layer architecture**: - - Layer 1: Native database types (backend-specific, discouraged) - - Layer 2: Core DataJoint types (standardized, scientist-friendly) - - Layer 3: Codec types (encode/decode, composable) -2. **Core types are scientist-friendly**: `float32`, `uint8`, `bool`, `bytes` instead of `FLOAT`, `TINYINT UNSIGNED`, `LONGBLOB` -3. **Codecs use angle brackets**: ``, ``, `` - distinguishes from core types -4. **`@` indicates external storage**: No `@` = database, `@` present = object store -5. **`get_dtype(is_external)` method**: Codecs resolve dtype at declaration time based on storage mode -6. **Codecs are composable**: `` uses ``, which uses `json` -7. **Built-in external codecs use JSON dtype**: Stores metadata (path, hash, store name, etc.) -8. **Two OAS regions**: object (PK-addressed) and hash (hash-addressed) within managed stores -9. **Filepath for portability**: `` uses relative paths within stores for environment portability -10. **No `uri` type**: For arbitrary URLs, use `varchar`—simpler and more transparent -11. **Naming conventions**: - - `@` = external storage (object store) - - No `@` = internal storage (database) - - `@` alone = default store - - `@name` = named store -12. **Dual-mode codecs**: `` and `` support both internal and external storage -13. **External-only codecs**: ``, ``, `` require `@` -14. **Transparent access**: Codecs return Python objects or file paths -15. **Lazy access**: `` and `` return ObjectRef -16. 
**MD5 for content hashing**: See [Hash Algorithm Choice](#hash-algorithm-choice) below -17. **No separate registry**: Hash metadata stored in JSON columns, not a separate table -18. **Auto-registration via `__init_subclass__`**: Codecs register automatically when subclassed—no decorator needed. Use `register=False` for abstract bases. Requires Python 3.10+. - -### Hash Algorithm Choice - -Content-addressed storage uses **MD5** (128-bit, 32-char hex) rather than SHA256 (256-bit, 64-char hex). - -**Rationale:** - -1. **Practical collision resistance is sufficient**: The birthday bound for MD5 is ~2^64 operations - before 50% collision probability. No scientific project will store anywhere near 10^19 files. - For content deduplication (not cryptographic verification), MD5 provides adequate uniqueness. - -2. **Storage efficiency**: 32-char hashes vs 64-char hashes in every JSON metadata field. - With millions of records, this halves the storage overhead for hash identifiers. - -3. **Performance**: MD5 is ~2-3x faster than SHA256 for large files. While both are fast, - the difference is measurable when hashing large scientific datasets. - -4. **Legacy compatibility**: DataJoint's existing `uuid_from_buffer()` function uses MD5. - The new system changes only the storage format (hex string in JSON vs binary UUID), - not the underlying hash algorithm. This simplifies migration. - -5. **Consistency with existing codebase**: Internal functions use MD5 for query caching. - -**Why not SHA256?** - -SHA256 is the modern standard for content-addressable storage (Git, Docker, IPFS). However: -- These systems prioritize cryptographic security against adversarial collision attacks -- Scientific data pipelines face no adversarial threat model -- The practical benefits (storage, speed, compatibility) outweigh theoretical security gains - -**Note**: If cryptographic verification is ever needed (e.g., for compliance or reproducibility -audits), SHA256 checksums can be computed on-demand without changing the storage addressing scheme. - -## Migration from Legacy Types - -| Legacy | New Equivalent | -|--------|----------------| -| `longblob` (auto-serialized) | `` | -| `blob@store` | `` | -| `attach` | `` | -| `attach@store` | `` | -| `filepath@store` (copy-based) | `` (ObjectRef-based) | - -### Migration from Legacy `~external_*` Stores - -Legacy external storage used per-schema `~external_{store}` tables with UUID references. -Migration to the new JSON-based hash storage requires: - -```python -def migrate_external_store(schema, store_name): - """ - Migrate legacy ~external_{store} to new HashRegistry. - - 1. Read all entries from ~external_{store} - 2. For each entry: - - Fetch content from legacy location - - Compute MD5 hash - - Copy to _hash/{hash}/ if not exists - - Update table column to new hash format - 3. After all schemas migrated, drop ~external_{store} tables - """ - external_table = schema.external[store_name] - - for entry in external_table: - legacy_uuid = entry['hash'] - - # Fetch content from legacy location - content = external_table.get(legacy_uuid) - - # Compute new content hash - hash_id = hashlib.md5(content).hexdigest() - - # Store in new location if not exists - new_path = f"_hash/{hash_id[:2]}/{hash_id[2:4]}/{hash_id}" - store = get_store(store_name) - if not store.exists(new_path): - store.put(new_path, content) - - # Update referencing tables: convert UUID column to JSON with hash metadata - # The JSON column stores {"hash": hash_id, "store": store_name, "size": len(content)} - # ... 
update all tables that reference this UUID ... - - # After migration complete for all schemas: - # DROP TABLE `{schema}`.`~external_{store}` -``` - -**Migration considerations:** -- Legacy UUIDs were based on MD5 content hash stored as `binary(16)` (UUID format) -- New system uses `char(32)` MD5 hex strings stored in JSON -- The hash algorithm is unchanged (MD5), only the storage format differs -- Migration can be done incrementally per schema -- Backward compatibility layer can read both formats during transition - -## Open Questions - -1. How long should the backward compatibility layer support legacy `~external_*` format? -2. Should `` (without store name) use a default store or require explicit store name? - - ---- -## File: reference/specs/virtual-schemas.md - -# Virtual Schemas Specification - -## Overview - -Virtual schemas provide a way to access existing database schemas without the original Python source code. This is useful for: - -- Exploring schemas created by other users -- Accessing legacy schemas -- Quick data inspection and queries -- Schema migration and maintenance - ---- - -## 1. Schema-Module Convention - -DataJoint maintains a **1:1 mapping** between database schemas and Python modules: - -| Database | Python | -|----------|--------| -| Schema | Module | -| Table | Class | - -This convention reduces conceptual complexity: **modules are schemas, classes are tables**. - -When you define tables in Python: -```python -# lab.py module -import datajoint as dj -schema = dj.Schema('lab') - -@schema -class Subject(dj.Manual): # Subject class → `lab`.`subject` table - ... - -@schema -class Session(dj.Manual): # Session class → `lab`.`session` table - ... -``` - -Virtual schemas recreate this mapping when the Python source isn't available: -```python -# Creates module-like object with table classes -lab = dj.virtual_schema('lab') -lab.Subject # Subject class for `lab`.`subject` -lab.Session # Session class for `lab`.`session` -``` - ---- - -## 2. Schema Introspection API - -### 2.1 Direct Table Access - -Access individual tables by name using bracket notation: - -```python -schema = dj.Schema('my_schema') - -# By CamelCase class name -experiment = schema['Experiment'] - -# By snake_case SQL name -experiment = schema['experiment'] - -# Query the table -experiment.fetch() -``` - -### 2.2 `get_table()` Method - -Explicit method for table access: - -```python -table = schema.get_table('Experiment') -table = schema.get_table('experiment') # also works -``` - -**Parameters:** -- `name` (str): Table name in CamelCase or snake_case - -**Returns:** `FreeTable` instance - -**Raises:** `DataJointError` if table doesn't exist - -### 2.3 Iteration - -Iterate over all tables in dependency order: - -```python -for table in schema: - print(table.full_table_name, len(table)) -``` - -Tables are yielded as `FreeTable` instances in topological order (dependencies before dependents). - -### 2.4 Containment Check - -Check if a table exists: - -```python -if 'Experiment' in schema: - print("Table exists") - -if 'nonexistent' not in schema: - print("Table doesn't exist") -``` - ---- - -## 3. 
Virtual Schema Function - -### 3.1 `dj.virtual_schema()` - -The recommended way to access existing schemas as modules: - -```python -lab = dj.virtual_schema('my_lab_schema') - -# Access tables as attributes (classes) -lab.Subject.fetch() -lab.Session & 'subject_id="M001"' - -# Full query algebra supported -(lab.Session * lab.Subject).fetch() -``` - -This maintains the module-class convention: `lab` behaves like a Python module with table classes as attributes. - -**Parameters:** - -| Parameter | Type | Default | Description | -|-----------|------|---------|-------------| -| `schema_name` | str | required | Database schema name | -| `connection` | Connection | None | Database connection (uses default) | -| `create_schema` | bool | False | Create schema if missing | -| `create_tables` | bool | False | Allow new table declarations | -| `add_objects` | dict | None | Additional objects for namespace | - -**Returns:** `VirtualModule` instance - -### 3.2 VirtualModule Class - -The underlying class (prefer `virtual_schema()` function): - -```python -module = dj.VirtualModule('lab', 'my_lab_schema') -module.Subject.fetch() -``` - -The first argument is the module display name, second is the schema name. - -### 3.3 Accessing the Schema Object - -Virtual modules expose the underlying Schema: - -```python -lab = dj.virtual_schema('my_lab_schema') -lab.schema.database # 'my_lab_schema' -lab.schema.list_tables() # ['subject', 'session', ...] -``` - ---- - -## 4. Table Class Generation - -### 4.1 `make_classes()` - -Create Python classes for all tables in a schema: - -```python -schema = dj.Schema('existing_schema') -schema.make_classes() - -# Now table classes are available in local namespace -Subject.fetch() -Session & 'date > "2024-01-01"' -``` - -**Parameters:** -- `into` (dict, optional): Namespace to populate. Defaults to caller's locals. - -### 4.2 Generated Class Types - -Classes are created based on table naming conventions: - -| Table Name Pattern | Generated Class | -|-------------------|-----------------| -| `subject` | `dj.Manual` | -| `#lookup_table` | `dj.Lookup` | -| `_imported_table` | `dj.Imported` | -| `__computed_table` | `dj.Computed` | -| `master__part` | `dj.Part` | - -### 4.3 Part Table Handling - -Part tables are attached to their master classes: - -```python -lab = dj.virtual_schema('my_lab') - -# Part tables are nested attributes -lab.Session.Trial.fetch() # Session.Trial is a Part table -``` - ---- - -## 5. 
Use Cases - -### 5.1 Data Exploration - -```python -# Quick exploration of unknown schema -lab = dj.virtual_schema('collaborator_lab') - -# List all tables -print(lab.schema.list_tables()) - -# Check table structure -print(lab.Subject.describe()) - -# Preview data -lab.Subject.fetch(limit=5) -``` - -### 5.2 Cross-Schema Queries - -```python -my_schema = dj.Schema('my_analysis') -external = dj.virtual_schema('external_lab') - -# Reference external tables in queries -@my_schema -class Analysis(dj.Computed): - definition = """ - -> external.Session - --- - result : float - """ -``` - -### 5.3 Schema Migration - -```python -old = dj.virtual_schema('old_schema') -new = dj.Schema('new_schema') - -# Copy data in topological order (iteration yields dependencies first) -for table in old: - new_table = new.get_table(table.table_name) - # Server-side INSERT...SELECT (no client-side data transfer) - new_table.insert(table) -``` - -### 5.4 Garbage Collection - -```python -from datajoint.gc import scan_hash_references - -schema = dj.Schema('my_schema') - -# Scan all tables for hash references -refs = scan_hash_references(schema, verbose=True) -``` - ---- - -## 6. Comparison of Methods - -| Method | Use Case | Returns | -|--------|----------|---------| -| `schema['Name']` | Quick single table access | `FreeTable` | -| `schema.get_table('name')` | Explicit table access | `FreeTable` | -| `for t in schema` | Iterate all tables | `FreeTable` generator | -| `'Name' in schema` | Check existence | `bool` | -| `dj.virtual_schema(name)` | Module-like access | `VirtualModule` | -| `make_classes()` | Populate namespace | None (side effect) | - ---- - -## 7. Implementation Reference - -| File | Purpose | -|------|---------| -| `schemas.py` | Schema, VirtualModule, virtual_schema | -| `table.py` | FreeTable class | -| `gc.py` | Uses get_table() for scanning | - ---- - -## 8. Error Messages - -| Error | Cause | Solution | -|-------|-------|----------| -| "Table does not exist" | `get_table()` on missing table | Check table name spelling | -| "Schema must be activated" | Operations on unactivated schema | Call `schema.activate(name)` | -| "Schema does not exist" | Schema name not in database | Check schema name, create if needed | - - -============================================================ -# About -============================================================ - - ---- -## File: about/citation.md - -# Citation Guidelines - -When your work uses the DataJoint Python, MATLAB, or Elements framework, please cite the -respective manuscripts and include their associated Research Resource Identifiers -(RRIDs). Proper citation helps credit the contributors and supports the broader -scientific community by highlighting the tools used in research. - -## Citing DataJoint Elements - -If your work utilizes **DataJoint Elements**, please cite the following manuscript: - -- **Manuscript**: Yatsenko D, Nguyen T, Shen S, Gunalan K, Turner CA, Guzman R, Sasaki - M, Sitonic D, Reimer J, Walker EY, Tolias AS. DataJoint Elements: Data Workflows for - Neurophysiology. bioRxiv. 2021 Jan 1. doi: https://doi.org/10.1101/2021.03.30.437358 - -- **RRID**: [RRID:SCR_021894](https://scicrunch.org/resolver/SCR_021894) - -You should also cite the **DataJoint Core manuscript** detailed below. - -## Citing the DataJoint Relational Model - -For any work relying on the **DataJoint Relational Model**, include the following -citation: - -- **Manuscript**: Yatsenko D, Walker EY, Tolias AS. DataJoint: A simpler relational data - model. arXiv:1807.11104. 
2018 Jul 29. doi: https://doi.org/10.48550/arXiv.1807.11104 - -- **RRID**: [RRID:SCR_014543](https://scicrunch.org/resolver/SCR_014543) - -## Citing DataJoint Python and MATLAB - -For work using **DataJoint Python** or **DataJoint MATLAB**, cite the following -manuscript: - -- **Manuscript**: Yatsenko D, Reimer J, Ecker AS, Walker EY, Sinz F, Berens P, - Hoenselaar A, Cotton RJ, Siapas AS, Tolias AS. DataJoint: Managing big scientific data - using MATLAB or Python. bioRxiv. 2015 Jan 1:031658. doi: - https://doi.org/10.1101/031658 - -- **RRID**: [RRID:SCR_014543](https://scicrunch.org/resolver/SCR_014543) - -## Citing SciOps and Capability Maturity Model - -If your work references **SciOps** or the **Capability Maturity Model for Data-Intensive -Research**, please use the following citation: - -- Manuscript: Johnson EC, Nguyen TT, Dichter BK, Zappulla F, Kosma M, Gunalan K, - Halchenko YO, Neufeld SQ, Schirner M, Ritter P, Martone ME. SciOps: Achieving - Productivity and Reliability in Data-Intensive Research. arXiv preprint - arXiv:2401.00077v2. 2023 Dec 29. - -- **RRID**: TBD - -# Why Cite DataJoint? - -By citing DataJoint and its associated resources: - -You give credit to the authors and contributors who developed these tools. - -You help other researchers identify and use these tools effectively. - -You strengthen the visibility and impact of open-source tools in scientific research. - -For further questions or assistance with citations, please reach out to the DataJoint -support team (support@datajoint.com). - - ---- -## File: about/contributing.md - -# Contributing to DataJoint - -DataJoint is developed openly and welcomes contributions from the community. - -## Ways to Contribute - -### Report Issues - -Found a bug or have a feature request? Open an issue on GitHub: - -- [datajoint-python issues](https://github.com/datajoint/datajoint-python/issues) -- [datajoint-docs issues](https://github.com/datajoint/datajoint-docs/issues) - -### Propose Enhancements (RFC Process) - -For significant changes to DataJoint—new features, API changes, or specification updates—we use an RFC (Request for Comments) process via GitHub Discussions. - -**When to use an RFC:** - -- API changes or new features in datajoint-python -- Changes to the DataJoint specification -- Breaking changes or deprecations -- Major documentation restructuring - -**RFC Process:** - -1. **Propose** — Create a new Discussion using the RFC template in the appropriate repository: - - [datajoint-python Discussions](https://github.com/datajoint/datajoint-python/discussions/new?category=rfc) - - [datajoint-docs Discussions](https://github.com/datajoint/datajoint-docs/discussions/new?category=rfc) - -2. **Discuss** — Community and maintainers provide feedback (2-4 weeks). Use 👍/👎 reactions to signal support. Prototyping in parallel is encouraged. - -3. **Final Comment Period** — Once consensus emerges, maintainers announce a 1-2 week final comment period. No changes during this time. - -4. **Decision** — RFC is accepted, rejected, or postponed. Accepted RFCs become tracking issues for implementation. 
- -**RFC Labels:** - -| Label | Meaning | -|-------|---------| -| `rfc` | All enhancement proposals | -| `status: proposed` | Initial submission | -| `status: under-review` | Active discussion | -| `status: final-comment` | Final comment period | -| `status: accepted` | Approved for implementation | -| `status: rejected` | Not accepted | -| `status: postponed` | Deferred to future | - -**Tips for a good RFC:** - -- Search existing discussions first -- Include concrete use cases and code examples -- Consider backwards compatibility -- Start with motivation before diving into design - -### Improve Documentation - -Documentation improvements are valuable contributions: - -1. Fork the [datajoint-docs](https://github.com/datajoint/datajoint-docs) repository -2. Make your changes -3. Submit a pull request - -### Contribute Code - -For code contributions to datajoint-python: - -1. Fork the repository -2. Create a feature branch -3. Write tests for your changes -4. Ensure all tests pass -5. Submit a pull request - -See the [Developer Guide](https://github.com/datajoint/datajoint-python/blob/main/CONTRIBUTING.md) -for detailed instructions. - -## Development Setup - -### datajoint-python - -```bash -git clone https://github.com/datajoint/datajoint-python.git -cd datajoint-python -pip install -e ".[dev]" -pre-commit install -``` - -### datajoint-docs - -```bash -git clone https://github.com/datajoint/datajoint-docs.git -cd datajoint-docs -pip install -r pip_requirements.txt -mkdocs serve -``` - -## Code Style - -- Python code follows [PEP 8](https://pep8.org/) -- Docstrings use [NumPy style](https://numpydoc.readthedocs.io/en/latest/format.html) -- Pre-commit hooks enforce formatting - -## Testing - -See the [Developer Guide](https://github.com/datajoint/datajoint-python/blob/main/CONTRIBUTING.md) -for current testing instructions using `pixi` and `testcontainers`. - -## Questions? - -- Open a [GitHub Discussion](https://github.com/datajoint/datajoint-python/discussions) - - ---- -## File: about/history.md - -# History - -Dimitri Yatsenko began development of DataJoint in Andreas S. Tolias' lab in the Neuroscience Department at Baylor College of Medicine in the fall of 2009. Initially implemented as a thin MySQL API in MATLAB, it defined the major principles of the DataJoint model. The [original DataJoint project](https://code.google.com/archive/p/datajoint/wikis/DataJoint.wiki) is archived on Google Code. - -In 2015, additional contributors joined to develop the Python implementation, resulting in the [foundational publication](https://doi.org/10.1101/031658) describing the DataJoint framework. - -In 2016, Vathes LLC was founded to provide support to groups using DataJoint. - -In 2017, DARPA awarded a Phase I SBIR grant (Contract D17PC00162, PI: Dimitri Yatsenko, $150,000, 2017–2018) titled "Tools for Sharing and Analyzing Neuroscience Data" to further develop and publicize the DataJoint framework. - -In 2018, the key theoretical framework was formulated in ["DataJoint: A Simpler Relational Data Model"](https://doi.org/10.48550/arXiv.1807.11104), establishing the formal basis for DataJoint's approach to scientific data management. - -In 2022, NIH awarded a Phase II SBIR grant ([R44 NS129492](https://reporter.nih.gov/project-details/10600812), PI: Dimitri Yatsenko, $2,124,457, 2022–2024) titled "DataJoint SciOps: A Managed Service for Neuroscience Data Workflows" to DataJoint (then Vathes LLC) in collaboration with the Johns Hopkins University Applied Physics Laboratory (Co-PI: Erik C. 
Johnson) to build a scalable cloud platform for DataJoint pipelines. - -## DataJoint Elements - -[DataJoint Elements](https://docs.datajoint.com/elements/) is an NIH-funded project ([U24 NS116470](https://reporter.nih.gov/project-details/10547509), PI: Dimitri Yatsenko, $3,780,000, 2020–2025) titled "DataJoint Pipelines for Neurophysiology." The project developed standard, open-source data pipelines for neurophysiology research ([Press Release](https://www.pr.com/press-release/873164)). - -Building on DataJoint's workflow framework, Elements provides curated, modular components for common experimental modalities including calcium imaging, electrophysiology, pose estimation, and optogenetics. The project distilled best practices from leading neuroscience labs into reusable pipeline modules that integrate with third-party analysis tools (Suite2p, DeepLabCut, Kilosort, etc.) and data standards (NWB, DANDI). - -The project is described in the position paper ["DataJoint Elements: Data Workflows for Neurophysiology"](https://www.biorxiv.org/content/10.1101/2021.03.30.437358v2). - -## Recent Developments - -In January 2024, Vathes LLC was re-incorporated as DataJoint Inc. - -In 2025, Jim Olson was appointed as CEO of DataJoint ([Press Release](https://www.prweb.com/releases/datajoint-appoints-former-flywheel-exec-jim-olson-as-new-ceo-302342644.html)). - -In August 2025, DataJoint closed a $4.9M seed funding round to expand data management and AI capabilities in academic and life sciences ([Press Release](https://www.prnewswire.com/news-releases/datajoint-closes-4-9m-seed-funding-to-revolutionize-data-management-and-ai-in-academic-and-life-sciences-pharma-302568792.html)). - -Today, DataJoint is used in hundreds of research labs worldwide for managing scientific data pipelines. - - ---- -## File: about/index.md - -# About DataJoint - -DataJoint is an open-source framework for building scientific data pipelines. -It was created to address the challenges of managing complex, interconnected -data in research laboratories. - -## What is DataJoint? - -DataJoint implements the **Relational Workflow Model**—a paradigm that extends -relational databases with native support for computational workflows. Unlike -traditional databases that only store data, DataJoint pipelines define how data -flows through processing steps, when computations run, and how results depend -on inputs. - -Key characteristics: - -- **Declarative schema design** — Define tables and relationships in Python -- **Automatic dependency tracking** — Foreign keys encode workflow dependencies -- **Built-in computation** — Imported and Computed tables run automatically -- **Data integrity** — Referential integrity and transaction support -- **Reproducibility** — Immutable data with full provenance - -## History - -DataJoint was developed at Baylor College of Medicine starting in 2009 to -support neuroscience research. It has since been adopted by laboratories -worldwide for a variety of scientific applications. - -[:octicons-arrow-right-24: Read the full history](history.md) - -## Citation - -If you use DataJoint in your research, please cite it appropriately. - -[:octicons-arrow-right-24: Citation guidelines](citation.md) - -## Contributing - -DataJoint is developed openly on GitHub. Contributions are welcome. - -[:octicons-arrow-right-24: Contribution guidelines](contributing.md) - -## License - -DataJoint is released under the [Apache License 2.0](https://www.apache.org/licenses/LICENSE-2.0). - -Copyright 2024 DataJoint Inc. 
and contributors. - - ---- -## File: about/publications.md - -# Publications - -The following publications relied on DataJoint open-source software for data analysis. -If your work uses DataJoint or DataJoint Elements, please cite the respective -[manuscripts and RRIDs](./citation.md). - -## 2025 - -+ Bae, J. A., Baptiste, M., Bodor, A. L., Brittain, D., Buchanan, J., Bumbarger, D. J., Castro, M. A., Celii, B., Cobos, E., Collman, F., ... (2025). [Functional connectomics spanning multiple areas of mouse visual cortex](https://doi.org/10.1038/s41586-025-08790-w). *Nature*, 640(8058), 435-447. - -+ Celii, B., Papadopoulos, S., Ding, Z., Fahey, P. G., Wang, E., Papadopoulos, C., ... & Reimer, J. (2025). [NEURD offers automated proofreading and feature extraction for connectomics](https://doi.org/10.1038/s41586-025-08660-5). *Nature*, 640(8058), 487-496. - -+ Ding, Z., Fahey, P.G., Papadopoulos, S., Wang, E.Y., Celii, B., Papadopoulos, C., Chang, A., Kunin, A.B., Tran, D., Fu, J. ... & Tolias, A. S. (2025). [Functional connectomics reveals general wiring rule in mouse visual cortex](https://doi.org/10.1038/s41586-025-08840-3). *Nature*, 640(8058), 459-469. - -+ Dyszkant, N., Oesterle, J., Qiu, Y., Harrer, M., Schubert, T., Gonschorek, D. & Euler, T. (2025). [Photoreceptor degeneration has heterogeneous effects on functional retinal ganglion cell types](https://doi.org/10.1113/JP287643). *The Journal of Physiology*, 603(21), 6599-6621. - -+ Finkelstein, A., Daie, K., Rózsa, M., Darshan, R. & Svoboda, K. (2025). [Connectivity underlying motor cortex activity during goal-directed behaviour](https://doi.org/10.1038/s41586-025-09758-6). *Nature*. - -+ Gillon, C.J., Baker, C., Ly, R., Balzani, E., Brunton, B.W., Schottdorf, M., Ghosh, S. and Dehghani, N.(2025). [Open data in neurophysiology: Advancements, solutions & challenges](https://doi.org/10.1523/ENEURO.0486-24.2025). *eNeuro*, 12(11). - -+ Huang, J.Y., Hess, M., Bajpai, A., Li, X., Hobson, L.N., Xu, A.J., Barton, S.J. and Lu, H.C.(2025). [From initial formation to developmental refinement: GABAergic inputs shape neuronal subnetworks in the primary somatosensory cortex](https://doi.org/10.1016/j.isci.2025.112104). *iScience*, 28(3). - -+ Lee, K. H., Denovellis, E. L., Ly, R., Magland, J., Soules, J., Comrie, A. E., Gramling, D. P., Guidera, J. A., Nevers, R., Adenekan, P., Brozdowski, C., Bray, S. R., Monroe, E., Bak, J. H., Coulter, M. E., Sun, X., Broyles, E., Shin, D., Chiang, S., Holobetz, C., ... Frank, L. M. (2025). [Spyglass: a data analysis framework for reproducible and shareable neuroscience research](https://elifesciences.org/reviewed-preprints/108089). *eLife*. - -+ Lees, R.M., Bianco, I.H., Campbell, R.A.A., Orlova, N., Peterka, D.S., Pichler, B., Smith, S.L., Yatsenko, D., Yu, C.H. & Packer, A.M. (2025). [Standardized measurements for monitoring and comparing multiphoton microscope systems](https://doi.org/10.1038/s41596-024-01120-w). *Nature Protocols*, 20, 2171–2208. - -+ Schmors, L., Kotkat, A.H., Bauer, Y., Huang, Z., Crombie, D., Meyerolbersleben, L.S., Sokoloski, S., Berens, P. & Busse, L. (2025). [Effects of corticothalamic feedback depend on visual responsiveness and stimulus type](https://doi.org/10.1016/j.isci.2025.112481). *iScience*, 28, 112481. - -+ Sibener, L.J., Mosberger, A.C., Chen, T.X., Athalye, V.R., Murray, J.M. & Costa, R.M. (2025). [Dissociable roles of distinct thalamic circuits in learning reaches to spatial targets](https://doi.org/10.1038/s41467-025-58143-4). *Nature Communications*, 16, 2962. 
- -## 2024 - -+ Chen, S., Liu, Y., Wang, Z. A., Colonell, J., Liu, L. D., Hou, H., ... & Svoboda, K. (2024). [Brain-wide neural activity underlying memory-guided movement](https://www.cell.com/cell/pdf/S0092-8674(23)01445-9.pdf). *Cell*, 187(3), 676-691. - -+ Gonzalo Cogno, S., Obenhaus, H. A., Lautrup, A., Jacobsen, R. I., Clopath, C., Andersson, S. O., ... & Moser, E. I. (2024). [Minute-scale oscillatory sequences in medial entorhinal cortex](https://www.nature.com/articles/s41586-023-06864-1). *Nature*, 625(7994), 338-344. - -+ Korympidou, M.M., Strauss, S., Schubert, T., Franke, K., Berens, P., Euler, T. & Vlasits, A.L. (2024). [GABAergic amacrine cells balance biased chromatic information in the mouse retina](https://doi.org/10.1016/j.celrep.2024.114953). *Cell Reports*, 43(11), 114953. - -+ Mosberger, A.C., Sibener, L.J., Chen, T.X., Rodrigues, H.F., Hormigo, R., Ingram, J.N., Athalye, V.R., Tabachnik, T., Wolpert, D.M., Murray, J.M. and Costa, R.M. (2024). [Exploration biases forelimb reaching strategies](https://www.cell.com/cell-reports/fulltext/S2211-1247(24)00286-9). *Cell Reports*, 43(4). - -+ Reimer, M. L., Kauer, S. D., Benson, C. A., King, J. F., Patwa, S., Feng, S., Estacion, M. A., Bangalore, L., Waxman, S. G., & Tan, A. M. (2024). [A FAIR, open-source virtual reality platform for dendritic spine analysis](https://www.cell.com/patterns/pdf/S2666-3899(24)00183-1.pdf). *Patterns*, 5(9). - -## 2023 - -+ Willeke, K.F., Restivo, K., Franke, K., Nix, A.F., Cadena, S.A., Shinn, T., Nealley, C., Rodriguez, G., Patel, S., Ecker, A.S., Sinz, F.H. & Tolias, A.S. (2023). [Deep learning-driven characterization of single cell tuning in primate visual area V4 supports topological organization](https://doi.org/10.1101/2023.05.12.540591). *bioRxiv*. - -+ Laboratory, I. B., Bonacchi, N., Chapuis, G. A., Churchland, A. K., DeWitt, E. E., Faulkner, M., ... & Wells, M. J. (2023). [A modular architecture for organizing, processing and sharing neurophysiology data](https://doi.org/10.1038/s41592-022-01742-6). *Nature Methods*. 1-5. - -## 2022 - -+ Franke, K., Willeke, K. F., Ponder, K., Galdamez, M., Zhou, N., Muhammad, T., ... & Tolias, A. S. (2022). [State-dependent pupil dilation rapidly shifts visual feature selectivity](https://www.nature.com/articles/s41586-022-05270-3). *Nature*, 610(7930), 128-134. - -+ Wang, Y., Chiola, S., Yang, G., Russell, C., Armstrong, C.J., Wu, Y., ... & Shcheglovitov, A. (2022). [Modeling human telencephalic development and autism-associated SHANK3 deficiency using organoids generated from single neural rosettes](https://doi.org/10.1038/s41467-022-33364-z). *Nature Communications*, 13, 5688. - -+ Goetz, J., Jessen, Z. F., Jacobi, A., Mani, A., Cooler, S., Greer, D., ... & Schwartz, G. W. (2022). [Unified classification of mouse retinal ganglion cells using function, morphology, and gene expression](https://doi.org/10.1016/j.celrep.2022.111040). *Cell Reports*, 40(2), 111040. - -+ Obenhaus, H.A., Zong, W., Jacobsen, R.I., Rose, T., Donato, F., Chen, L., Cheng, H., Bonhoeffer, T., Moser, M.B. & Moser, E.I. (2022). [Functional network topography of the medial entorhinal cortex](https://doi.org/10.1073/pnas.2121655119). *Proceedings of the National Academy of Sciences*, 119(7). - -+ Pettit, N. H., Yap, E., Greenberg, M. E., Harvey, C. D. (2022). [Fos ensembles encode and shape stable spatial maps in the hippocampus](https://www.nature.com/articles/s41586-022-05113-1). *Nature*. - -+ Tseng, S. Y., Chettih, S. N., Arlt, C., Barroso-Luque, R., & Harvey, C. D. (2022). 
[Shared and specialized coding across posterior cortical areas for dynamic navigation decisions](https://doi.org/10.1016/j.neuron.2022.05.012). *Neuron*. - -+ Turner, N. L., Macrina, T., Bae, J. A., Yang, R., Wilson, A. M., Schneider-Mizell, C., ... & Seung, H. S. (2022). [Reconstruction of neocortex: Organelles, compartments, cells, circuits, and activity](https://doi.org/10.1016/j.cell.2022.01.023). *Cell*, 185(6), 1082-1100. - -+ Zong, W., Obenhaus, H.A., Skytoen, E.R., Eneqvist, H., de Jong, N.L., Vale, R., Jorge, M.R., Moser, M.B. and Moser, E.I. (2022). [Large-scale two-photon calcium imaging in freely moving mice](https://www.sciencedirect.com/science/article/pii/S0092867422001970). *Cell*, 185(7), 1240-1256. - -## 2021 - -+ Dennis, E.J., El Hady, A., Michaiel, A., Clemens, A., Tervo, D.R.G., Voigts, J. & Datta, S.R. (2021). [Systems Neuroscience of Natural Behaviors in Rodents](https://doi.org/10.1523/JNEUROSCI.1877-20.2020). *Journal of Neuroscience*, 41(5), 911-919. - -+ Born, G., Schneider-Soupiadis, F. A., Erisken, S., Vaiceliunaite, A., Lao, C. L., Mobarhan, M. H., Spacek, M. A., Einevoll, G. T., & Busse, L. (2021). [Corticothalamic feedback sculpts visual spatial integration in mouse thalamus](https://doi.org/10.1038/s41593-021-00943-0). *Nature Neuroscience*, 24(12), 1711-1720. - -+ Finkelstein, A., Fontolan, L., Economo, M. N., Li, N., Romani, S., & Svoboda, K. (2021). [Attractor dynamics gate cortical information flow during decision-making](https://doi.org/10.1038/s41593-021-00840-6). *Nature Neuroscience*, 24(6), 843-850. - -+ Laboratory, T. I. B., Aguillon-Rodriguez, V., Angelaki, D., Bayer, H., Bonacchi, N., Carandini, M., Cazettes, F., Chapuis, G., Churchland, A. K., Dan, Y., ... (2021). [Standardized and reproducible measurement of decision-making in mice](https://doi.org/10.7554/eLife.63711). *eLife*, 10. - -## 2020 - -+ Angelaki, D. E., Ng, J., Abrego, A. M., Cham, H. X., Asprodini, E. K., Dickman, J. D., & Laurens, J. (2020). [A gravity-based three-dimensional compass in the mouse brain](https://doi.org/10.1038/s41467-020-15566-5). *Nature Communications*, 11(1), 1-13. - -+ Heath, S. L., Christenson, M. P., Oriol, E., Saavedra-Weisenhaus, M., Kohn, J. R., & Behnia, R. (2020). [Circuit mechanisms underlying chromatic encoding in drosophila photoreceptors](https://doi.org/10.1016/j.cub.2019.11.075). *Current Biology*. - -+ Yatsenko, D., Moreaux, L. C., Choi, J., Tolias, A., Shepard, K. L., & Roukes, M. L. (2020). [Signal separability in integrated neurophotonics](https://doi.org/10.1101/2020.09.27.315556). *bioRxiv*. - -## 2019 - -+ Chettih, S. N., & Harvey, C. D. (2019). [Single-neuron perturbations reveal feature-specific competition in V1](https://doi.org/10.1038/s41586-019-0997-6). *Nature*, 567(7748), 334-340. - -+ Walker, E. Y., Sinz, F. H., Cobos, E., Muhammad, T., Froudarakis, E., Fahey, P. G., Ecker, A. S., Reimer, J., Pitkow, X., & Tolias, A. S. (2019). [Inception loops discover what excites neurons most using deep predictive models](https://doi.org/10.1038/s41593-019-0517-x). *Nature Neuroscience*, 22(12), 2060-2065. - -## 2018 - -+ Denfield, G. H., Ecker, A. S., Shinn, T. J., Bethge, M., & Tolias, A. S. (2018). [Attentional fluctuations induce shared variability in macaque primary visual cortex](https://doi.org/10.1038/s41467-018-05123-6). *Nature Communications*, 9(1), 2654. - -## 2017 - -+ Franke, K., Berens, P., Schubert, T., Bethge, M., Euler, T., & Baden, T. (2017). 
[Inhibition decorrelates visual feature representations in the inner retina](https://doi.org/10.1038/nature21394). *Nature*, 542(7642), 439. - -## 2016 - -+ Baden, T., Berens, P., Franke, K., Rosen, M. R., Bethge, M., & Euler, T. (2016). [The functional diversity of retinal ganglion cells in the mouse](https://doi.org/10.1038/nature16468). *Nature*, 529(7586), 345-350. - -+ Reimer, J., McGinley, M. J., Liu, Y., Rodenkirch, C., Wang, Q., McCormick, D. A., & Tolias, A. S. (2016). [Pupil fluctuations track rapid changes in adrenergic and cholinergic activity in cortex](https://doi.org/10.1038/ncomms13289). *Nature Communications*, 7, 13289. - -## 2015 - -+ Jiang, X., Shen, S., Cadwell, C. R., Berens, P., Sinz, F., Ecker, A. S., Patel, S., & Tolias, A. S. (2015). [Principles of connectivity among morphologically defined cell types in adult neocortex](https://doi.org/10.1126/science.aac9462). *Science*, 350(6264), aac9462. - -## 2014 - -+ Froudarakis, E., Berens, P., Ecker, A. S., Cotton, R. J., Sinz, F. H., Yatsenko, D., Saggau, P., Bethge, M., & Tolias, A. S. (2014). [Population code in mouse V1 facilitates readout of natural scenes through increased sparseness](https://doi.org/10.1038/nn.3707). *Nature Neuroscience*, 17(6), 851-857. - -+ Reimer, J., Froudarakis, E., Cadwell, C. R., Yatsenko, D., Denfield, G. H., & Tolias, A. S. (2014). [Pupil fluctuations track fast switching of cortical states during quiet wakefulness](https://doi.org/10.1016/j.neuron.2014.09.033). *Neuron*, 84(2), 355-362. - - ---- -## File: about/versioning.md - -# Documentation Versioning - -This page explains how version information is indicated throughout the DataJoint documentation. - -## Documentation Scope - -**This documentation covers DataJoint 2.0 and later.** All code examples and tutorials use DataJoint 2.0+ syntax and APIs. - -**DataJoint 2.0 is the baseline.** Features and APIs introduced in 2.0 are documented without version markers, as they are the standard for this documentation. - -If you're using legacy DataJoint (version 0.14.x or earlier), please visit the [legacy documentation](https://datajoint.github.io/datajoint-python) or follow the [Migration Guide](../how-to/migrate-to-v20.md) to upgrade. - -## Version Indicators - -### Global Indicators - -**Site-wide banner:** Every page displays a banner indicating you're viewing documentation for DataJoint 2.0+, with a link to the migration guide for legacy users. - -### Feature-Level Indicators - -Version admonitions are used for features introduced **after 2.0** (i.e., version 2.1 and later): - -#### New Features - -!!! version-added "New in 2.1" - - This indicates a feature that was introduced after the 2.0 baseline. - -**Example usage:** - -!!! version-added "New in 2.1" - - The `dj.Top` operator with ordering support was introduced in DataJoint 2.1. - -**Note:** Features present in DataJoint 2.0 (the baseline) are not marked with version indicators. - -#### Changed Behavior - -!!! version-changed "Changed in 2.1" - - This indicates behavior that changed in a post-2.0 release. - -**Example usage:** - -!!! version-changed "Changed in 2.1" - - The `populate()` method now supports priority-based scheduling by default. - - Use `priority=50` to control execution order when using `reserve_jobs=True`. - -#### Deprecated Features - -!!! version-deprecated "Deprecated in 2.1, removed in 3.0" - - This indicates features that are deprecated and will be removed in future versions. - -**Example usage:** - -!!! 
version-deprecated "Deprecated in 2.1, removed in 3.0" - - The `allow_direct_insert` parameter is deprecated. Use `dj.config['safemode']` instead. - -**Note:** Features deprecated at the 2.0 baseline (coming from pre-2.0) are documented in the [Migration Guide](../how-to/migrate-to-v20.md) rather than with admonitions, since this documentation assumes 2.0 as the baseline. - -### Inline Version Badges - -For features introduced **after 2.0**, inline version badges may appear in API reference: - -- `dj.Top()` v2.1+ - Top N restriction with ordering -- `some_method()` deprecated - Legacy method - -**Note:** Methods and features present in DataJoint 2.0 (the baseline) do not have version badges. - -## Checking Your Version - -To check which version of DataJoint you're using: - -```python -import datajoint as dj -print(dj.__version__) -``` - -- **Version 2.0 or higher:** You're on the current version -- **Version 0.14.x or lower:** You're on legacy DataJoint - -## Migration Path - -If you're upgrading from legacy DataJoint (pre-2.0): - -1. **Review** the [What's New in 2.0](../explanation/whats-new-2.md) page to understand major changes -2. **Follow** the [Migration Guide](../how-to/migrate-to-v20.md) for step-by-step upgrade instructions -3. **Reference** this documentation for updated syntax and APIs - -## Legacy Documentation - -For DataJoint 0.x documentation, visit: - -**[datajoint.github.io/datajoint-python](https://datajoint.github.io/datajoint-python)** - -## Version History - -| Version | Release Date | Major Changes | -|---------|--------------|---------------| -| 2.0 | 2026 | Redesigned fetch API, unified stores, per-table jobs, semantic matching | -| 0.14.x | 2020-2025 | Legacy version with external storage | -| 0.13.x | 2019 | Legacy version | - -For complete version history, see the [changelog](https://github.com/datajoint/datajoint-python/releases). -