40 changes: 21 additions & 19 deletions src/datajoint/builtin_codecs.py
@@ -6,12 +6,14 @@
want to create their own custom codecs.

Built-in Codecs:
- ``<blob>``: Serialize Python objects (internal) or external with dedup
- ``<hash>``: Hash-addressed storage with SHA256 deduplication
- ``<object>``: Schema-addressed storage for files/folders (Zarr, HDF5)
- ``<attach>``: File attachment (internal) or external with dedup
- ``<filepath@store>``: Reference to existing file in store
- ``<blob>``: Serialize Python objects (in-table storage)
- ``<blob@>``: Serialize Python objects (external with hash-addressed dedup)
- ``<attach>``: File attachment (in-table storage)
- ``<attach@>``: File attachment (external with hash-addressed dedup)
- ``<hash@>``: Hash-addressed storage with MD5 deduplication (external only)
- ``<object@>``: Schema-addressed storage for files/folders (external only)
- ``<npy@>``: Store numpy arrays as portable .npy files (external only)
- ``<filepath@store>``: Reference to existing file in store (external only)

Example - Creating a Custom Codec:
Here's how to define your own codec, modeled after the built-in codecs::
@@ -75,9 +77,9 @@ class BlobCodec(Codec):
The ``<blob>`` codec handles serialization of arbitrary Python objects
including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs.

Supports both internal and external storage:
Supports both in-table and in-store storage:
- ``<blob>``: Stored in database (bytes → LONGBLOB)
- ``<blob@>``: Stored externally via ``<hash@>`` with deduplication
- ``<blob@>``: Stored in object store via ``<hash@>`` with deduplication
- ``<blob@store>``: Stored in specific named store

Format Features:
@@ -92,9 +94,9 @@ class ProcessedData(dj.Manual):
definition = '''
data_id : int
---
small_result : <blob> # internal (in database)
large_result : <blob@> # external (default store)
archive : <blob@cold> # external (specific store)
small_result : <blob> # in-table (in database)
large_result : <blob@> # in-store (default store)
archive : <blob@cold> # in-store (specific store)
'''

# Insert any serializable object
@@ -104,7 +106,7 @@ class ProcessedData(dj.Manual):
name = "blob"

def get_dtype(self, is_store: bool) -> str:
"""Return bytes for internal, <hash> for external storage."""
"""Return bytes for in-table, <hash> for in-store storage."""
return "<hash>" if is_store else "bytes"

def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
@@ -165,9 +167,9 @@ class RawContent(dj.Manual):
name = "hash"

def get_dtype(self, is_store: bool) -> str:
"""Hash storage is external only."""
"""Hash storage is in-store only."""
if not is_store:
raise DataJointError("<hash> requires @ (external storage only)")
raise DataJointError("<hash> requires @ (in-store storage only)")
return "json"

def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict:
@@ -608,9 +610,9 @@ class AttachCodec(Codec):
"""
File attachment with filename preserved.

Supports both internal and external storage:
Supports both in-table and in-store storage:
- ``<attach>``: Stored in database (bytes → LONGBLOB)
- ``<attach@>``: Stored externally via ``<hash@>`` with deduplication
- ``<attach@>``: Stored in object store via ``<hash@>`` with deduplication
- ``<attach@store>``: Stored in specific named store

The filename is preserved and the file is extracted to the configured
@@ -623,9 +625,9 @@ class Documents(dj.Manual):
definition = '''
doc_id : int
---
config : <attach> # internal (small file in DB)
dataset : <attach@> # external (default store)
archive : <attach@cold> # external (specific store)
config : <attach> # in-table (small file in DB)
dataset : <attach@> # in-store (default store)
archive : <attach@cold> # in-store (specific store)
'''

# Insert a file
@@ -642,7 +644,7 @@ class Documents(dj.Manual):
name = "attach"

def get_dtype(self, is_store: bool) -> str:
"""Return bytes for internal, <hash> for external storage."""
"""Return bytes for in-table, <hash> for in-store storage."""
return "<hash>" if is_store else "bytes"

def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes:
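
The module docstring above introduces an example of defining a custom codec modeled on the built-in ones. A minimal sketch of what such a codec could look like, assuming the ``Codec`` interface visible in this diff (``name``, ``get_dtype``, ``encode``) plus a symmetric ``decode``; the ``Int32Codec`` name and the ``datajoint.codecs`` import path are illustrative assumptions, not part of the library::

    import numpy as np

    from datajoint.codecs import Codec  # assumed import path for the Codec base class

    class Int32Codec(Codec):
        """Hypothetical codec storing int32 arrays as raw bytes (illustration only)."""

        name = "int32"

        def get_dtype(self, is_store: bool) -> str:
            # In-table storage keeps raw bytes; the @ modifier delegates to <hash>.
            return "<hash>" if is_store else "bytes"

        def encode(self, value, *, key=None, store_name=None) -> bytes:
            return np.asarray(value, dtype=np.int32).tobytes()

        def decode(self, stored, *, key=None, store_name=None):
            # Signature assumed to mirror encode().
            return np.frombuffer(stored, dtype=np.int32)
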
6 changes: 3 additions & 3 deletions src/datajoint/codecs.py
@@ -154,7 +154,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None
key : dict, optional
Primary key values. May be needed for path construction.
store_name : str, optional
Target store name for external storage.
Target store name for object storage.

Returns
-------
@@ -514,7 +514,7 @@ def decode_attribute(attr, data, squeeze: bool = False):
This is the central decode function used by all fetch methods. It handles:
- Codec chains (e.g., <blob@store> → <hash> → bytes)
- Native type conversions (JSON, UUID)
- External storage downloads (via config["download_path"])
- Object storage downloads (via config["download_path"])

Args:
attr: Attribute from the table's heading.
@@ -533,7 +533,7 @@ def decode_attribute(attr, data, squeeze: bool = False):
return None

if attr.codec:
# Get store if present for external storage
# Get store if present for object storage
store = getattr(attr, "store", None)
if store is not None:
dtype_spec = f"<{attr.codec.name}@{store}>"
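
To show ``decode_attribute`` in context: a fetch that exercises a codec chain and the download path could look roughly like this (a sketch; ``ProcessedData`` is the illustrative table from the ``<blob>`` docstring above, and the ``config.override`` usage follows the ``to_dicts`` docstring later in this diff)::

    import datajoint as dj

    with dj.config.override(download_path="/tmp/dj-downloads"):
        # Each fetched attribute passes through decode_attribute, which resolves
        # chains such as <blob@store> -> <hash> -> bytes before deserialization;
        # file-backed values are downloaded to download_path.
        rows = (ProcessedData & {"data_id": 1}).to_dicts()
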
4 changes: 2 additions & 2 deletions src/datajoint/condition.py
@@ -293,7 +293,7 @@ def make_condition(
- ``str``: Used directly as SQL condition
- ``dict``: AND of equality conditions for matching attributes
- ``bool``: Returns the boolean value (possibly negated)
- ``QueryExpression``: Generates subquery (semijoin/antijoin)
- ``QueryExpression``: Generates subquery for restriction
- ``AndList``: AND of all conditions
- ``list/set/tuple``: OR of all conditions
- ``numpy.void``: Like dict, from record array
@@ -398,7 +398,7 @@ def combine_conditions(negate, conditions):
if inspect.isclass(condition) and issubclass(condition, QueryExpression):
condition = condition()

# restrict by another expression (aka semijoin and antijoin)
# restrict by another expression
if isinstance(condition, QueryExpression):
assert_join_compatibility(query_expression, condition, semantic_check=semantic_check)
# Match on all non-hidden namesakes (hidden attributes excluded)
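
The ``make_condition`` docstring above lists the accepted condition types; purely as an illustration (``Session`` and ``Trial`` are hypothetical tables)::

    Session & "session_date > '2024-01-01'"            # str: raw SQL condition
    Session & {"subject_id": 12}                        # dict: equality on matching attributes
    Session & [{"subject_id": 12}, {"subject_id": 15}]  # list: OR of conditions
    Session & dj.AndList(["subject_id = 12", "session_date > '2024-01-01'"])  # AndList: AND of conditions
    Session & Trial                                     # QueryExpression: sessions with matching trials
    Session - Trial                                     # negated restriction: sessions without matching trials
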
2 changes: 1 addition & 1 deletion src/datajoint/declare.py
@@ -650,7 +650,7 @@ def substitute_special_type(match: dict, category: str, foreign_key_sql: list[st
codec, store_name = lookup_codec(match["type"])
if store_name is not None:
match["store"] = store_name
# Determine if external storage is used (store_name is present, even if empty string for default)
# Determine if in-store storage is used (store_name is present, even if empty string for default)
is_store = store_name is not None
inner_dtype = codec.get_dtype(is_store=is_store)

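
To make the store resolution above concrete, a rough walk-through for a ``<blob@cold>`` declaration (the values shown are inferred from ``BlobCodec.get_dtype`` earlier in this diff and are assumptions about ``lookup_codec``)::

    codec, store_name = lookup_codec("<blob@cold>")    # assumed: BlobCodec instance, "cold"
    is_store = store_name is not None                  # True -- the @ modifier selects in-store storage
    inner_dtype = codec.get_dtype(is_store=is_store)   # "<hash>" per BlobCodec.get_dtype
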
4 changes: 2 additions & 2 deletions src/datajoint/expression.py
@@ -686,7 +686,7 @@ def to_dicts(self, order_by=None, limit=None, offset=None, squeeze=False):
:param squeeze: if True, remove extra dimensions from arrays
:return: list of dictionaries, one per row

For external storage types (attachments, filepaths), files are downloaded
For object storage types (attachments, filepaths), files are downloaded
to config["download_path"]. Use config.override() to change::

with dj.config.override(download_path="/data"):
@@ -1078,7 +1078,7 @@ def make_sql(self):
alias=next(self.__count),
sorting=self.sorting_clauses(),
)
# with secondary attributes, use union of left join with antijoin
# with secondary attributes, use union of left join with anti-restriction
fields = self.heading.names
sql1 = arg1.join(arg2, left=True).make_sql(fields)
sql2 = (arg2 - arg1).proj(..., **{k: "NULL" for k in arg1.heading.secondary_attributes}).make_sql(fields)
10 changes: 5 additions & 5 deletions src/datajoint/gc.py
@@ -1,11 +1,11 @@
"""
Garbage collection for external storage.
Garbage collection for object storage.

This module provides utilities to identify and remove orphaned items
from external storage. Storage items become orphaned when all database rows
from object storage. Storage items become orphaned when all database rows
referencing them are deleted.

DataJoint uses two external storage patterns:
DataJoint uses two object storage patterns:

Hash-addressed storage
Types: ``<hash@>``, ``<blob@>``, ``<attach@>``
@@ -31,7 +31,7 @@

See Also
--------
datajoint.builtin_codecs : Codec implementations for external storage types.
datajoint.builtin_codecs : Codec implementations for object storage types.
"""

from __future__ import annotations
@@ -638,7 +638,7 @@ def format_stats(stats: dict[str, Any]) -> str:
str
Formatted string.
"""
lines = ["External Storage Statistics:"]
lines = ["Object Storage Statistics:"]

# Show hash-addressed storage stats if present
if "hash_referenced" in stats:
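
As a conceptual sketch of the orphan definition above (``referenced_items`` and ``stored_items`` are hypothetical helpers, not functions from this module)::

    referenced = referenced_items(schema)   # items still referenced by database rows (hypothetical)
    stored = stored_items(store)            # items currently present in the object store (hypothetical)
    orphans = stored - referenced           # orphaned items that garbage collection may remove
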
2 changes: 1 addition & 1 deletion src/datajoint/heading.py
@@ -449,7 +449,7 @@ def _init_from_database(self) -> None:
# if no codec, then delay the error until the first invocation
attr["codec"] = _MissingType(codec_spec)
else:
# Determine if external storage based on store presence
# Determine if in-store storage based on store presence
is_store = attr.get("store") is not None
attr["type"] = attr["codec"].get_dtype(is_store=is_store)
if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()):
6 changes: 6 additions & 0 deletions src/datajoint/migrate.py
@@ -4,6 +4,12 @@
This module provides tools for migrating existing schemas to use the new
Codec system, particularly for upgrading blob columns to use
explicit `<blob>` type declarations.

Note on Terminology
-------------------
This module uses "external storage" because that was the term in DataJoint 0.14.6.
In DataJoint 2.0 documentation, this is called "object storage" (general term)
or "in-store storage" (specific to the @ modifier).
"""

from __future__ import annotations
4 changes: 2 additions & 2 deletions src/datajoint/settings.py
@@ -208,7 +208,7 @@ class DisplaySettings(BaseSettings):

class StoresSettings(BaseSettings):
"""
Unified external storage configuration.
Unified object storage configuration.

Stores configuration supports both hash-addressed and schema-addressed storage
using the same named stores with _hash and _schema sections.
@@ -296,7 +296,7 @@ class Config(BaseSettings):
# Unified stores configuration (replaces external and object_storage)
stores: dict[str, Any] = Field(
default_factory=dict,
description="Unified external storage configuration. "
description="Unified object storage configuration. "
"Use stores.default to designate default store. "
"Configure named stores as stores.<name>.protocol, stores.<name>.location, etc.",
)
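
Based on the ``stores`` field description above, a configuration could look roughly like this (a sketch; only ``default``, ``protocol``, and ``location`` are named in the description, and applying it via ``config.override`` is an assumption)::

    stores_config = {
        "default": "main",                      # stores.default designates the default store
        "main": {
            "protocol": "s3",                   # stores.<name>.protocol
            "location": "my-bucket/datajoint",  # stores.<name>.location
        },
        "cold": {
            "protocol": "file",
            "location": "/archive/datajoint",
        },
    }

    with dj.config.override(stores=stores_config):
        ...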