Skip to content

utils

Shared utility functions for snakesee.

This module consolidates common utilities used across multiple modules to avoid duplication and ensure consistent behavior.

Classes

MetadataCache

Thread-safe cache for parsed metadata files.

Tracks file mtimes to skip re-reading unchanged files.

Source code in snakesee/utils.py
class MetadataCache:
    """Thread-safe cache for parsed metadata files.

    Tracks file mtimes to skip re-reading unchanged files.
    """

    __slots__ = ("_cache", "_lock")

    def __init__(self) -> None:
        """Initialize empty cache."""
        self._cache: dict[Path, tuple[float, int, dict[str, Any]]] = {}
        self._lock = threading.Lock()

    def get(self, path: Path, mtime: float, inode: int) -> dict[str, Any] | None:
        """Get cached data if file hasn't changed.

        Args:
            path: Path to the metadata file.
            mtime: Current file modification time.
            inode: Current file inode.

        Returns:
            Cached data if valid, None if cache miss or stale.
        """
        with self._lock:
            cached = self._cache.get(path)
            if cached is not None:
                cached_mtime, cached_inode, data = cached
                if cached_mtime == mtime and cached_inode == inode:
                    return data
        return None

    def put(self, path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None:
        """Store parsed data in cache.

        Args:
            path: Path to the metadata file.
            mtime: File modification time.
            inode: File inode.
            data: Parsed JSON data.
        """
        with self._lock:
            self._cache[path] = (mtime, inode, data)

    def clear(self) -> None:
        """Clear all cached data."""
        with self._lock:
            self._cache.clear()

    def __len__(self) -> int:
        """Number of cached entries."""
        with self._lock:
            return len(self._cache)

Functions

__init__
__init__() -> None

Initialize empty cache.

Source code in snakesee/utils.py
def __init__(self) -> None:
    """Create the lock and start with no cached entries."""
    self._lock = threading.Lock()
    self._cache: dict[Path, tuple[float, int, dict[str, Any]]] = {}
__len__
__len__() -> int

Number of cached entries.

Source code in snakesee/utils.py
def __len__(self) -> int:
    """Return how many entries are currently cached."""
    with self._lock:
        size = len(self._cache)
    return size
clear
clear() -> None

Clear all cached data.

Source code in snakesee/utils.py
def clear(self) -> None:
    """Drop every cached entry while holding the lock."""
    with self._lock:
        entries = self._cache
        entries.clear()
get
get(path: Path, mtime: float, inode: int) -> dict[str, Any] | None

Get cached data if file hasn't changed.

Parameters:

Name Type Description Default
path Path

Path to the metadata file.

required
mtime float

Current file modification time.

required
inode int

Current file inode.

required

Returns:

Type Description
dict[str, Any] | None

Cached data if valid, None if cache miss or stale.

Source code in snakesee/utils.py
def get(self, path: Path, mtime: float, inode: int) -> dict[str, Any] | None:
    """Get cached data if file hasn't changed.

    Args:
        path: Path to the metadata file.
        mtime: Current file modification time.
        inode: Current file inode.

    Returns:
        Cached data if valid, None if cache miss or stale.
    """
    with self._lock:
        cached = self._cache.get(path)
        if cached is not None:
            cached_mtime, cached_inode, data = cached
            if cached_mtime == mtime and cached_inode == inode:
                return data
    return None
put
put(path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None

Store parsed data in cache.

Parameters:

Name Type Description Default
path Path

Path to the metadata file.

required
mtime float

File modification time.

required
inode int

File inode.

required
data dict[str, Any]

Parsed JSON data.

required
Source code in snakesee/utils.py
def put(self, path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None:
    """Record parsed data for ``path``, replacing any previous entry.

    Args:
        path: Path to the metadata file.
        mtime: Modification time of the file when it was parsed.
        inode: Inode of the file when it was parsed.
        data: Parsed JSON data.
    """
    record = (mtime, inode, data)
    with self._lock:
        self._cache[path] = record

Functions

get_metadata_cache

get_metadata_cache() -> MetadataCache

Get the global metadata cache instance.

Source code in snakesee/utils.py
def get_metadata_cache() -> MetadataCache:
    """Return the shared metadata cache.

    Returns:
        The object bound to the module-level ``_metadata_cache`` name.
    """
    shared_cache = _metadata_cache
    return shared_cache

get_scan_cache

get_scan_cache() -> _ScanCache

Get the global scan cache instance.

Source code in snakesee/utils.py
def get_scan_cache() -> _ScanCache:
    """Return the shared scan cache.

    Returns:
        The object bound to the module-level ``_scan_cache`` name.
    """
    shared_cache = _scan_cache
    return shared_cache

iterate_metadata_files

iterate_metadata_files(metadata_dir: Path, progress_callback: ProgressCallback | None = None, *, sort_by_mtime: bool = True, newest_first: bool = True, use_cache: bool = True, use_parallel: bool = True, max_workers: int = DEFAULT_METADATA_WORKERS) -> Iterator[tuple[Path, dict[str, Any]]]

Iterate metadata files with optional progress reporting.

Iterates over all files in the metadata directory, parsing each as JSON. Invalid files (non-JSON or unreadable) are silently skipped with debug logging.

Performance optimizations:

- Uses os.scandir instead of rglob (6-7x faster directory iteration)
- Sorts by mtime to process newest files first (better for recent data)
- Caches parsed files to skip re-reading unchanged files
- Uses parallel I/O for very large directories (>=1000 files)

Parameters:

Name Type Description Default
metadata_dir Path

Path to .snakemake/metadata/ directory.

required
progress_callback ProgressCallback | None

Optional callback(current, total) for progress reporting.

None
sort_by_mtime bool

Sort files by modification time.

True
newest_first bool

If sorting, put newest files first.

True
use_cache bool

Use global cache to skip unchanged files.

True
use_parallel bool

Use parallel I/O for large directories.

True
max_workers int

Maximum number of parallel workers.

DEFAULT_METADATA_WORKERS

Yields:

Type Description
tuple[Path, dict[str, Any]]

Tuples of (file_path, parsed_json_data) for each valid metadata file.

Source code in snakesee/utils.py
def iterate_metadata_files(
    metadata_dir: Path,
    progress_callback: ProgressCallback | None = None,
    *,
    sort_by_mtime: bool = True,
    newest_first: bool = True,
    use_cache: bool = True,
    use_parallel: bool = True,
    max_workers: int = DEFAULT_METADATA_WORKERS,
) -> Iterator[tuple[Path, dict[str, Any]]]:
    """Yield ``(path, parsed_json)`` pairs for files under a metadata directory.

    The directory is scanned with ``_scandir_files``, the entries are
    optionally ordered by mtime, and the actual reading is delegated to
    either the parallel or the sequential reader helper. Nothing is yielded
    when the directory does not exist or the scan finds no files.

    Performance-related behavior:
    - Directory listing goes through the scandir-based ``_scandir_files``.
    - Entries are sorted by mtime, newest first by default.
    - With ``use_cache`` the global metadata cache is handed to the reader
      and the scan cache is used for the directory listing.
    - The parallel reader is used once the file count reaches
      ``PARALLEL_READ_THRESHOLD``.

    Args:
        metadata_dir: Path to .snakemake/metadata/ directory.
        progress_callback: Optional callback(current, total) for progress reporting.
        sort_by_mtime: Sort files by modification time.
        newest_first: If sorting, put newest files first.
        use_cache: Use the global caches to skip unchanged files.
        use_parallel: Allow the parallel reader for large directories.
        max_workers: Maximum number of parallel workers.

    Yields:
        Tuples of (file_path, parsed_json_data) as produced by the selected
        reader helper.
    """
    if not metadata_dir.exists():
        return

    # use_cache=False also bypasses the scan cache for the directory listing.
    entries = _scandir_files(metadata_dir, use_scan_cache=use_cache)
    if not entries:
        return

    if sort_by_mtime:
        entries = sorted(entries, key=lambda entry: entry.mtime, reverse=newest_first)

    file_count = len(entries)
    metadata_cache = get_metadata_cache() if use_cache else None

    # Pick the reader first, then stream its results through unchanged.
    if use_parallel and file_count >= PARALLEL_READ_THRESHOLD:
        results = _iterate_metadata_parallel(
            entries, metadata_cache, progress_callback, file_count, max_workers
        )
    else:
        results = _iterate_metadata_sequential(
            entries, metadata_cache, progress_callback, file_count
        )
    yield from results

json_loads

json_loads(data: str | bytes) -> Any

Parse JSON using orjson for better performance.

Parameters:

Name Type Description Default
data str | bytes

JSON string or bytes to parse.

required

Returns:

Type Description
Any

Parsed JSON data.

Raises:

Type Description
JSONDecodeError

If the data is not valid JSON.

Source code in snakesee/utils.py
def json_loads(data: str | bytes) -> Any:
    """Deserialize JSON text with orjson.

    Args:
        data: JSON document as ``str`` or ``bytes``; a ``str`` is encoded to
            UTF-8 before being handed to orjson.

    Returns:
        Parsed JSON data.

    Raises:
        orjson.JSONDecodeError: If the data is not valid JSON.
    """
    payload = data.encode("utf-8") if isinstance(data, str) else data
    return orjson.loads(payload)

safe_file_size

safe_file_size(path: Path) -> int

Safely get file size in bytes, returning 0 on error.

Parameters:

Name Type Description Default
path Path

Path to the file.

required

Returns:

Type Description
int

File size in bytes, or 0 if file doesn't exist or can't be accessed.

Source code in snakesee/utils.py
def safe_file_size(path: Path) -> int:
    """Safely get file size in bytes, returning 0 on error.

    Args:
        path: Path to the file.

    Returns:
        File size in bytes, or 0 if the file doesn't exist, can't be
        accessed, or the path itself is not a valid OS path.
    """
    try:
        return path.stat().st_size
    except (OSError, ValueError):
        # OSError already covers FileNotFoundError and PermissionError.
        # ValueError is what os.stat raises for a path containing an embedded
        # NUL byte; such a path cannot name a file, so treat it as missing.
        return 0

safe_mtime

safe_mtime(path: Path) -> float

Get file modification time, returning 0.0 if file doesn't exist.

This handles the common race condition where a file may be deleted between checking for existence and reading its mtime.

Parameters:

Name Type Description Default
path Path

Path to the file.

required

Returns:

Type Description
float

The file's modification time as a Unix timestamp, or 0.0 if the file doesn't exist.

Source code in snakesee/utils.py
def safe_mtime(path: Path) -> float:
    """Get file modification time, returning 0.0 if it cannot be read.

    This handles the common race condition where a file may be deleted
    between checking for existence and reading its mtime.

    Args:
        path: Path to the file.

    Returns:
        The file's modification time as a Unix timestamp, or 0.0 if the
        file doesn't exist, can't be accessed, or the path itself is not a
        valid OS path.
    """
    try:
        return path.stat().st_mtime
    except (OSError, ValueError):
        # OSError already covers FileNotFoundError and PermissionError.
        # ValueError is what os.stat raises for a path containing an embedded
        # NUL byte; such a path cannot name a file, so treat it as missing.
        return 0.0

safe_read_json

safe_read_json(path: Path, default: dict[str, Any] | None = None) -> dict[str, Any] | None

Safely read and parse JSON from a file.

Handles file access errors and JSON parse errors gracefully.

Parameters:

Name Type Description Default
path Path

Path to the JSON file.

required
default dict[str, Any] | None

Value to return if file cannot be read or parsed.

None

Returns:

Type Description
dict[str, Any] | None

Parsed JSON as dict, or default if reading/parsing fails.

Source code in snakesee/utils.py
def safe_read_json(path: Path, default: dict[str, Any] | None = None) -> dict[str, Any] | None:
    """Safely read and parse a JSON object from a file.

    Handles file access errors and JSON parse errors gracefully. A file
    whose top-level JSON value is not an object (for example a list or a
    bare string) is treated like a parse failure, so callers always receive
    either a dict or ``default``.

    Args:
        path: Path to the JSON file.
        default: Value to return if file cannot be read or parsed.

    Returns:
        Parsed JSON as dict, or default if reading/parsing fails or the
        document is not a JSON object.
    """
    try:
        parsed = orjson.loads(path.read_bytes())
    except (OSError, orjson.JSONDecodeError):
        # OSError already covers FileNotFoundError and PermissionError.
        return default
    # orjson.loads may return any JSON type; only a dict honors the declared
    # return type.
    if not isinstance(parsed, dict):
        return default
    return parsed

safe_read_text

safe_read_text(path: Path, default: str = '', errors: str = 'ignore') -> str

Safely read text from a file, returning default on error.

Handles common race conditions and encoding issues gracefully.

Parameters:

Name Type Description Default
path Path

Path to the file.

required
default str

Value to return if file cannot be read.

''
errors str

How to handle encoding errors (passed to read_text).

'ignore'

Returns:

Type Description
str

File contents as string, or default if reading fails.

Source code in snakesee/utils.py
def safe_read_text(path: Path, default: str = "", errors: str = "ignore") -> str:
    """Read a file as text, falling back to ``default`` on OS-level errors.

    A missing file, a permission problem, or any other ``OSError`` raised
    while reading yields ``default`` instead of propagating.

    Args:
        path: Path to the file.
        default: Value to return if file cannot be read.
        errors: How to handle encoding errors (passed to read_text).

    Returns:
        File contents as string, or default if reading fails.
    """
    try:
        text = path.read_text(errors=errors)
    except OSError:
        # FileNotFoundError and PermissionError are both OSError subclasses.
        return default
    return text