utils

Shared utility functions for snakesee.

This module consolidates common utilities used across multiple modules to avoid duplication and ensure consistent behavior.

Classes¶

MetadataCache ¶

Thread-safe cache for parsed metadata files.

Tracks file mtimes to skip re-reading unchanged files.

Source code in snakesee/utils.py

class MetadataCache:
    """Thread-safe cache for parsed metadata files.

    Tracks file mtimes to skip re-reading unchanged files.
    """

    __slots__ = ("_cache", "_lock")

    def __init__(self) -> None:
        """Initialize empty cache."""
        self._cache: dict[Path, tuple[float, int, dict[str, Any]]] = {}
        self._lock = threading.Lock()

    def get(self, path: Path, mtime: float, inode: int) -> dict[str, Any] | None:
        """Get cached data if file hasn't changed.

        Args:
            path: Path to the metadata file.
            mtime: Current file modification time.
            inode: Current file inode.

        Returns:
            Cached data if valid, None if cache miss or stale.
        """
        with self._lock:
            cached = self._cache.get(path)
            if cached is not None:
                cached_mtime, cached_inode, data = cached
                if cached_mtime == mtime and cached_inode == inode:
                    return data
        return None

    def put(self, path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None:
        """Store parsed data in cache.

        Args:
            path: Path to the metadata file.
            mtime: File modification time.
            inode: File inode.
            data: Parsed JSON data.
        """
        with self._lock:
            self._cache[path] = (mtime, inode, data)

    def clear(self) -> None:
        """Clear all cached data."""
        with self._lock:
            self._cache.clear()

    def __len__(self) -> int:
        """Number of cached entries."""
        with self._lock:
            return len(self._cache)

Methods:¶

__init__ ¶

__init__() -> None

Initialize empty cache.

Source code in snakesee/utils.py

def __init__(self) -> None:
    """Create a cache with no entries and a fresh lock."""
    self._lock = threading.Lock()
    # path -> (mtime, inode, parsed data)
    self._cache: dict[Path, tuple[float, int, dict[str, Any]]] = {}

__len__ ¶

__len__() -> int

Number of cached entries.

Source code in snakesee/utils.py

def __len__(self) -> int:
    """Return how many paths currently have an entry."""
    with self._lock:
        count = len(self._cache)
    return count

clear ¶

clear() -> None

Clear all cached data.

Source code in snakesee/utils.py

def clear(self) -> None:
    """Drop every entry from the cache."""
    with self._lock:
        entries = self._cache
        entries.clear()

get ¶

get(path: Path, mtime: float, inode: int) -> dict[str, Any] | None

Get cached data if file hasn't changed.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the metadata file.	required
`mtime`	`float`	Current file modification time.	required
`inode`	`int`	Current file inode.	required

Returns:

Type	Description
`dict[str, Any] \| None`	Cached data if valid, None if cache miss or stale.

Source code in snakesee/utils.py

def get(self, path: Path, mtime: float, inode: int) -> dict[str, Any] | None:
    """Get cached data if file hasn't changed.

    Args:
        path: Path to the metadata file.
        mtime: Current file modification time.
        inode: Current file inode.

    Returns:
        Cached data if valid, None if cache miss or stale.
    """
    with self._lock:
        cached = self._cache.get(path)
        if cached is not None:
            cached_mtime, cached_inode, data = cached
            if cached_mtime == mtime and cached_inode == inode:
                return data
    return None

put ¶

put(path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None

Store parsed data in cache.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the metadata file.	required
`mtime`	`float`	File modification time.	required
`inode`	`int`	File inode.	required
`data`	`dict[str, Any]`	Parsed JSON data.	required

Source code in snakesee/utils.py

def put(self, path: Path, mtime: float, inode: int, data: dict[str, Any]) -> None:
    """Record parsed data for ``path``, replacing any earlier entry.

    Args:
        path: Path to the metadata file.
        mtime: Modification time of the file when it was parsed.
        inode: Inode of the file when it was parsed.
        data: Parsed JSON data.
    """
    entry = (mtime, inode, data)
    with self._lock:
        self._cache[path] = entry

Functions:¶

get_metadata_cache ¶

get_metadata_cache() -> MetadataCache

Get the global metadata cache instance.

Source code in snakesee/utils.py

def get_metadata_cache() -> MetadataCache:
    """Return the module-wide metadata cache.

    Returns:
        The ``MetadataCache`` held in the module-level ``_metadata_cache``.
    """
    return _metadata_cache

get_scan_cache ¶

get_scan_cache() -> _ScanCache

Get the global scan cache instance.

Source code in snakesee/utils.py

def get_scan_cache() -> _ScanCache:
    """Return the module-wide scan cache.

    Returns:
        The ``_ScanCache`` held in the module-level ``_scan_cache``.
    """
    return _scan_cache

iterate_metadata_files ¶

iterate_metadata_files(metadata_dir: Path, progress_callback: ProgressCallback | None = None, *, sort_by_mtime: bool = True, newest_first: bool = True, use_cache: bool = True, use_parallel: bool = True, max_workers: int = DEFAULT_METADATA_WORKERS) -> Iterator[tuple[Path, dict[str, Any]]]

Iterate metadata files with optional progress reporting.

Iterates over all files in the metadata directory, parsing each as JSON. Invalid files (non-JSON or unreadable) are silently skipped with debug logging.

Performance optimizations:

- Uses os.scandir instead of rglob (6-7x faster directory iteration)
- Sorts by mtime to process newest files first (better for recent data)
- Caches parsed files to skip re-reading unchanged files
- Uses parallel I/O for very large directories (>=1000 files)

Parameters:

Name	Type	Description	Default
`metadata_dir`	`Path`	Path to .snakemake/metadata/ directory.	required
`progress_callback`	`ProgressCallback \| None`	Optional callback(current, total) for progress reporting.	`None`
`sort_by_mtime`	`bool`	Sort files by modification time.	`True`
`newest_first`	`bool`	If sorting, put newest files first.	`True`
`use_cache`	`bool`	Use global cache to skip unchanged files.	`True`
`use_parallel`	`bool`	Use parallel I/O for large directories.	`True`
`max_workers`	`int`	Maximum number of parallel workers.	`DEFAULT_METADATA_WORKERS`

Yields:

Type	Description
`tuple[Path, dict[str, Any]]`	Tuples of (file_path, parsed_json_data) for each valid metadata file.

Source code in snakesee/utils.py

def iterate_metadata_files(
    metadata_dir: Path,
    progress_callback: ProgressCallback | None = None,
    *,
    sort_by_mtime: bool = True,
    newest_first: bool = True,
    use_cache: bool = True,
    use_parallel: bool = True,
    max_workers: int = DEFAULT_METADATA_WORKERS,
) -> Iterator[tuple[Path, dict[str, Any]]]:
    """Yield every metadata file under a directory together with its parsed JSON.

    The directory is scanned recursively, the resulting file list is
    optionally ordered by modification time, and reading is then delegated
    to either the sequential or the parallel reader. Files that are not
    valid JSON or cannot be read are skipped (with debug logging) rather
    than raised.

    Performance notes:
    - Directory listing goes through the scandir-based scanner, not rglob.
    - Newest-first ordering surfaces recent data early.
    - Previously parsed, unchanged files are served from the global cache.
    - Directories with at least PARALLEL_READ_THRESHOLD files are read in
      parallel.

    Args:
        metadata_dir: Path to .snakemake/metadata/ directory.
        progress_callback: Optional callback(current, total) for progress reporting.
        sort_by_mtime: Sort files by modification time.
        newest_first: If sorting, put newest files first.
        use_cache: Use the global caches (directory scan and parsed files).
        use_parallel: Allow parallel I/O for large directories.
        max_workers: Maximum number of parallel workers.

    Yields:
        Tuples of (file_path, parsed_json_data) for each valid metadata file.
    """
    if not metadata_dir.exists():
        return

    # use_cache=False bypasses the scan cache as well as the parsed-file cache.
    entries = _scandir_files(metadata_dir, use_scan_cache=use_cache)
    if not entries:
        return

    if sort_by_mtime:
        entries = sorted(entries, key=lambda entry: entry.mtime, reverse=newest_first)

    count = len(entries)
    parsed_cache = get_metadata_cache() if use_cache else None

    # Only large directories are worth the overhead of parallel reads.
    if use_parallel and count >= PARALLEL_READ_THRESHOLD:
        results = _iterate_metadata_parallel(
            entries, parsed_cache, progress_callback, count, max_workers
        )
    else:
        results = _iterate_metadata_sequential(entries, parsed_cache, progress_callback, count)
    yield from results

json_loads ¶

json_loads(data: str | bytes) -> Any

Parse JSON using orjson for better performance.

Parameters:

Name	Type	Description	Default
`data`	`str \| bytes`	JSON string or bytes to parse.	required

Returns:

Type	Description
`Any`	Parsed JSON data.

Raises:

Type	Description
`JSONDecodeError`	If the data is not valid JSON.

Source code in snakesee/utils.py

def json_loads(data: str | bytes) -> Any:
    """Deserialize JSON with orjson instead of the standard-library parser.

    Args:
        data: JSON document as text or as bytes.

    Returns:
        The parsed Python value.

    Raises:
        orjson.JSONDecodeError: If the data is not valid JSON.
    """
    # Text is encoded to UTF-8 first so orjson always receives bytes.
    payload = data.encode("utf-8") if isinstance(data, str) else data
    return orjson.loads(payload)

safe_file_size ¶

safe_file_size(path: Path) -> int

Safely get file size in bytes, returning 0 on error.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the file.	required

Returns:

Type	Description
`int`	File size in bytes, or 0 if file doesn't exist or can't be accessed.

Source code in snakesee/utils.py

def safe_file_size(path: Path) -> int:
    """Safely get file size in bytes, returning 0 on error.

    Args:
        path: Path to the file.

    Returns:
        File size in bytes, or 0 if the file doesn't exist, can't be
        accessed, or the path itself is invalid (e.g. contains a NUL byte).
    """
    try:
        return path.stat().st_size
    except (OSError, ValueError):
        # OSError already covers FileNotFoundError and PermissionError;
        # ValueError is raised by stat() for a path with an embedded NUL byte.
        return 0

safe_mtime ¶

safe_mtime(path: Path) -> float

Get file modification time, returning 0.0 if file doesn't exist.

This handles the common race condition where a file may be deleted between checking for existence and reading its mtime.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the file.	required

Returns:

Type	Description
`float`	The file's modification time as a Unix timestamp, or 0.0 if the file doesn't exist.

Source code in snakesee/utils.py

def safe_mtime(path: Path) -> float:
    """Get file modification time, returning 0.0 if it cannot be determined.

    This handles the common race condition where a file may be deleted
    between checking for existence and reading its mtime.

    Args:
        path: Path to the file.

    Returns:
        The file's modification time as a Unix timestamp, or 0.0 if the
        file doesn't exist, can't be accessed, or the path itself is
        invalid (e.g. contains a NUL byte).
    """
    try:
        return path.stat().st_mtime
    except (OSError, ValueError):
        # OSError already covers FileNotFoundError and PermissionError;
        # ValueError is raised by stat() for a path with an embedded NUL byte.
        return 0.0

safe_read_json ¶

safe_read_json(path: Path, default: dict[str, Any] | None = None) -> dict[str, Any] | None

Safely read and parse JSON from a file.

Handles file access errors and JSON parse errors gracefully.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the JSON file.	required
`default`	`dict[str, Any] \| None`	Value to return if file cannot be read or parsed.	`None`

Returns:

Type	Description
`dict[str, Any] \| None`	Parsed JSON as dict, or default if reading/parsing fails.

Source code in snakesee/utils.py

def safe_read_json(path: Path, default: dict[str, Any] | None = None) -> dict[str, Any] | None:
    """Safely read and parse JSON from a file.

    Handles file access errors and JSON parse errors gracefully.

    Args:
        path: Path to the JSON file.
        default: Value to return if file cannot be read or parsed.

    Returns:
        Parsed JSON as dict, or default if reading/parsing fails.
    """
    try:
        content = path.read_bytes()
        result: dict[str, Any] = orjson.loads(content)
        return result
    except (FileNotFoundError, OSError, PermissionError, orjson.JSONDecodeError):
        return default

safe_read_text ¶

safe_read_text(path: Path, default: str = '', errors: str = 'ignore') -> str

Safely read text from a file, returning default on error.

Handles common race conditions and encoding issues gracefully.

Parameters:

Name	Type	Description	Default
`path`	`Path`	Path to the file.	required
`default`	`str`	Value to return if file cannot be read.	`''`
`errors`	`str`	How to handle encoding errors (passed to read_text).	`'ignore'`

Returns:

Type	Description
`str`	File contents as string, or default if reading fails.

Source code in snakesee/utils.py

def safe_read_text(path: Path, default: str = "", errors: str = "ignore") -> str:
    """Safely read text from a file, returning default on error.

    Handles common race conditions and encoding issues gracefully.

    Args:
        path: Path to the file.
        default: Value to return if file cannot be read.
        errors: How to handle encoding errors (passed to read_text).

    Returns:
        File contents as string, or default if the file doesn't exist,
        can't be read, or the path itself is invalid (e.g. contains a NUL
        byte).
    """
    try:
        return path.read_text(errors=errors)
    except (OSError, ValueError):
        # OSError already covers FileNotFoundError and PermissionError;
        # ValueError is raised when the path contains an embedded NUL byte.
        return default

utils

Classes¶

MetadataCache ¶

Methods:¶

__init__ ¶

__len__ ¶

clear ¶

get ¶

put ¶

Functions:¶

get_metadata_cache ¶

get_scan_cache ¶

iterate_metadata_files ¶

json_loads ¶

safe_file_size ¶

safe_mtime ¶

safe_read_json ¶

safe_read_text ¶

__init__ ¶

__len__ ¶