diff --git a/src/datajoint/builtin_codecs.py b/src/datajoint/builtin_codecs.py index 70a1cb047..c87ab4716 100644 --- a/src/datajoint/builtin_codecs.py +++ b/src/datajoint/builtin_codecs.py @@ -6,12 +6,14 @@ want to create their own custom codecs. Built-in Codecs: - - ````: Serialize Python objects (internal) or external with dedup - - ````: Hash-addressed storage with SHA256 deduplication - - ````: Schema-addressed storage for files/folders (Zarr, HDF5) - - ````: File attachment (internal) or external with dedup - - ````: Reference to existing file in store + - ````: Serialize Python objects (in-table storage) + - ````: Serialize Python objects (external with hash-addressed dedup) + - ````: File attachment (in-table storage) + - ````: File attachment (external with hash-addressed dedup) + - ````: Hash-addressed storage with MD5 deduplication (external only) + - ````: Schema-addressed storage for files/folders (external only) - ````: Store numpy arrays as portable .npy files (external only) + - ````: Reference to existing file in store (external only) Example - Creating a Custom Codec: Here's how to define your own codec, modeled after the built-in codecs:: @@ -75,9 +77,9 @@ class BlobCodec(Codec): The ```` codec handles serialization of arbitrary Python objects including NumPy arrays, dictionaries, lists, datetime objects, and UUIDs. 
- Supports both internal and external storage: + Supports both in-table and in-store storage: - ````: Stored in database (bytes → LONGBLOB) - - ````: Stored externally via ```` with deduplication + - ````: Stored in object store via ```` with deduplication - ````: Stored in specific named store Format Features: @@ -92,9 +94,9 @@ class ProcessedData(dj.Manual): definition = ''' data_id : int --- - small_result : # internal (in database) - large_result : # external (default store) - archive : # external (specific store) + small_result : # in-table (in database) + large_result : # in-store (default store) + archive : # in-store (specific store) ''' # Insert any serializable object @@ -104,7 +106,7 @@ class ProcessedData(dj.Manual): name = "blob" def get_dtype(self, is_store: bool) -> str: - """Return bytes for internal, for external storage.""" + """Return bytes for in-table, for in-store storage.""" return "" if is_store else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: @@ -165,9 +167,9 @@ class RawContent(dj.Manual): name = "hash" def get_dtype(self, is_store: bool) -> str: - """Hash storage is external only.""" + """Hash storage is in-store only.""" if not is_store: - raise DataJointError(" requires @ (external storage only)") + raise DataJointError(" requires @ (in-store storage only)") return "json" def encode(self, value: bytes, *, key: dict | None = None, store_name: str | None = None) -> dict: @@ -608,9 +610,9 @@ class AttachCodec(Codec): """ File attachment with filename preserved. 
- Supports both internal and external storage: + Supports both in-table and in-store storage: - ````: Stored in database (bytes → LONGBLOB) - - ````: Stored externally via ```` with deduplication + - ````: Stored in object store via ```` with deduplication - ````: Stored in specific named store The filename is preserved and the file is extracted to the configured @@ -623,9 +625,9 @@ class Documents(dj.Manual): definition = ''' doc_id : int --- - config : # internal (small file in DB) - dataset : # external (default store) - archive : # external (specific store) + config : # in-table (small file in DB) + dataset : # in-store (default store) + archive : # in-store (specific store) ''' # Insert a file @@ -642,7 +644,7 @@ class Documents(dj.Manual): name = "attach" def get_dtype(self, is_store: bool) -> str: - """Return bytes for internal, for external storage.""" + """Return bytes for in-table, for in-store storage.""" return "" if is_store else "bytes" def encode(self, value: Any, *, key: dict | None = None, store_name: str | None = None) -> bytes: diff --git a/src/datajoint/codecs.py b/src/datajoint/codecs.py index e6ab22931..afa60321f 100644 --- a/src/datajoint/codecs.py +++ b/src/datajoint/codecs.py @@ -154,7 +154,7 @@ def encode(self, value: Any, *, key: dict | None = None, store_name: str | None key : dict, optional Primary key values. May be needed for path construction. store_name : str, optional - Target store name for external storage. + Target store name for object storage. Returns ------- @@ -514,7 +514,7 @@ def decode_attribute(attr, data, squeeze: bool = False): This is the central decode function used by all fetch methods. It handles: - Codec chains (e.g., → bytes) - Native type conversions (JSON, UUID) - - External storage downloads (via config["download_path"]) + - Object storage downloads (via config["download_path"]) Args: attr: Attribute from the table's heading. 
@@ -533,7 +533,7 @@ def decode_attribute(attr, data, squeeze: bool = False): return None if attr.codec: - # Get store if present for external storage + # Get store if present for object storage store = getattr(attr, "store", None) if store is not None: dtype_spec = f"<{attr.codec.name}@{store}>" diff --git a/src/datajoint/condition.py b/src/datajoint/condition.py index ea7d7a504..9c6f933d1 100644 --- a/src/datajoint/condition.py +++ b/src/datajoint/condition.py @@ -293,7 +293,7 @@ def make_condition( - ``str``: Used directly as SQL condition - ``dict``: AND of equality conditions for matching attributes - ``bool``: Returns the boolean value (possibly negated) - - ``QueryExpression``: Generates subquery (semijoin/antijoin) + - ``QueryExpression``: Generates subquery for restriction - ``AndList``: AND of all conditions - ``list/set/tuple``: OR of all conditions - ``numpy.void``: Like dict, from record array @@ -398,7 +398,7 @@ def combine_conditions(negate, conditions): if inspect.isclass(condition) and issubclass(condition, QueryExpression): condition = condition() - # restrict by another expression (aka semijoin and antijoin) + # restrict by another expression if isinstance(condition, QueryExpression): assert_join_compatibility(query_expression, condition, semantic_check=semantic_check) # Match on all non-hidden namesakes (hidden attributes excluded) diff --git a/src/datajoint/declare.py b/src/datajoint/declare.py index eaea163b8..8775b2a0f 100644 --- a/src/datajoint/declare.py +++ b/src/datajoint/declare.py @@ -650,7 +650,7 @@ def substitute_special_type(match: dict, category: str, foreign_key_sql: list[st codec, store_name = lookup_codec(match["type"]) if store_name is not None: match["store"] = store_name - # Determine if external storage is used (store_name is present, even if empty string for default) + # Determine if in-store storage is used (store_name is present, even if empty string for default) is_store = store_name is not None inner_dtype = 
codec.get_dtype(is_store=is_store) diff --git a/src/datajoint/expression.py b/src/datajoint/expression.py index bbbc12807..5ca7fdaa5 100644 --- a/src/datajoint/expression.py +++ b/src/datajoint/expression.py @@ -686,7 +686,7 @@ def to_dicts(self, order_by=None, limit=None, offset=None, squeeze=False): :param squeeze: if True, remove extra dimensions from arrays :return: list of dictionaries, one per row - For external storage types (attachments, filepaths), files are downloaded + For object storage types (attachments, filepaths), files are downloaded to config["download_path"]. Use config.override() to change:: with dj.config.override(download_path="/data"): @@ -1078,7 +1078,7 @@ def make_sql(self): alias=next(self.__count), sorting=self.sorting_clauses(), ) - # with secondary attributes, use union of left join with antijoin + # with secondary attributes, use union of left join with anti-restriction fields = self.heading.names sql1 = arg1.join(arg2, left=True).make_sql(fields) sql2 = (arg2 - arg1).proj(..., **{k: "NULL" for k in arg1.heading.secondary_attributes}).make_sql(fields) diff --git a/src/datajoint/gc.py b/src/datajoint/gc.py index c3f2d6f0f..71a4e8d08 100644 --- a/src/datajoint/gc.py +++ b/src/datajoint/gc.py @@ -1,11 +1,11 @@ """ -Garbage collection for external storage. +Garbage collection for object storage. This module provides utilities to identify and remove orphaned items -from external storage. Storage items become orphaned when all database rows +from object storage. Storage items become orphaned when all database rows referencing them are deleted. -DataJoint uses two external storage patterns: +DataJoint uses two object storage patterns: Hash-addressed storage Types: ````, ````, ```` @@ -31,7 +31,7 @@ See Also -------- -datajoint.builtin_codecs : Codec implementations for external storage types. +datajoint.builtin_codecs : Codec implementations for object storage types. 
""" from __future__ import annotations @@ -638,7 +638,7 @@ def format_stats(stats: dict[str, Any]) -> str: str Formatted string. """ - lines = ["External Storage Statistics:"] + lines = ["Object Storage Statistics:"] # Show hash-addressed storage stats if present if "hash_referenced" in stats: diff --git a/src/datajoint/heading.py b/src/datajoint/heading.py index c2fb0d96d..99d7246a4 100644 --- a/src/datajoint/heading.py +++ b/src/datajoint/heading.py @@ -449,7 +449,7 @@ def _init_from_database(self) -> None: # if no codec, then delay the error until the first invocation attr["codec"] = _MissingType(codec_spec) else: - # Determine if external storage based on store presence + # Determine if in-store storage based on store presence is_store = attr.get("store") is not None attr["type"] = attr["codec"].get_dtype(is_store=is_store) if not any(r.match(attr["type"]) for r in TYPE_PATTERN.values()): diff --git a/src/datajoint/migrate.py b/src/datajoint/migrate.py index 12f27612e..579d16642 100644 --- a/src/datajoint/migrate.py +++ b/src/datajoint/migrate.py @@ -4,6 +4,12 @@ This module provides tools for migrating existing schemas to use the new Codec system, particularly for upgrading blob columns to use explicit `<blob>` type declarations. + +Note on Terminology +------------------- +This module uses "external storage" because that was the term in DataJoint 0.14.6. +In DataJoint 2.0 documentation, this is called "object storage" (general term) +or "in-store storage" (specific to the @ modifier). """ from __future__ import annotations diff --git a/src/datajoint/settings.py b/src/datajoint/settings.py index 9af95ec2d..e9b6f6570 100644 --- a/src/datajoint/settings.py +++ b/src/datajoint/settings.py @@ -208,7 +208,7 @@ class DisplaySettings(BaseSettings): class StoresSettings(BaseSettings): """ - Unified external storage configuration. + Unified object storage configuration. 
Stores configuration supports both hash-addressed and schema-addressed storage using the same named stores with _hash and _schema sections. @@ -296,7 +296,7 @@ class Config(BaseSettings): # Unified stores configuration (replaces external and object_storage) stores: dict[str, Any] = Field( default_factory=dict, - description="Unified external storage configuration. " + description="Unified object storage configuration. " "Use stores.default to designate default store. " "Configure named stores as stores.<name>.protocol, stores.<name>.location, etc.", )