Skip to content

utils

Utility functions for parsing.

Small helper functions used across parser modules.

Functions

calculate_input_size

calculate_input_size(file_paths: list[Path]) -> int | None

Calculate total size of input files.

Parameters:

Name Type Description Default
file_paths list[Path]

List of input file paths.

required

Returns:

Type Description
int | None

Total size in bytes, or None if the list is empty or any file doesn't exist or can't be accessed.

Source code in snakesee/parser/utils.py
def calculate_input_size(file_paths: list[Path]) -> int | None:
    """
    Calculate total size of input files.

    Args:
        file_paths: List of input file paths.

    Returns:
        Total size in bytes, or None if any file doesn't exist.
    """
    total_size = 0
    for path in file_paths:
        try:
            total_size += path.stat().st_size
        except OSError:
            return None  # File doesn't exist or can't be accessed
    return total_size if file_paths else None

estimate_input_size_from_output

estimate_input_size_from_output(output_path: Path, workflow_dir: Path) -> int | None

Try to estimate input size by looking for related input files.

This is a heuristic that works for common bioinformatics patterns where output files are derived from inputs with predictable naming conventions.

Examples:

  • sample.sorted.bam -> sample.bam
  • sample.fastq.gz -> looks for sample.fq.gz, sample.fastq.gz
  • sample.vcf.gz -> sample.bam

Parameters:

Name Type Description Default
output_path Path

Path to the output file.

required
workflow_dir Path

Workflow root directory.

required

Returns:

Type Description
int | None

Estimated input size in bytes, or None if not determinable.

Source code in snakesee/parser/utils.py
def estimate_input_size_from_output(
    output_path: Path,
    workflow_dir: Path,
) -> int | None:
    """
    Try to estimate input size by looking for related input files.

    This is a heuristic that works for common bioinformatics patterns where
    output files are derived from inputs with predictable naming conventions.

    Examples:
        - sample.sorted.bam -> sample.bam
        - sample.fastq.gz -> looks for sample.fq.gz, sample.fastq.gz
        - sample.vcf.gz -> sample.bam

    Args:
        output_path: Path to the output file.
        workflow_dir: Workflow root directory.

    Returns:
        Estimated input size in bytes, or None if not determinable.
    """
    # Common input file patterns relative to output
    suffixes_to_strip = [
        ".sorted.bam",
        ".sorted",
        ".trimmed",
        ".filtered",
        ".dedup",
        ".aligned",
    ]

    name = output_path.name

    # Try stripping common suffixes to find input
    for suffix in suffixes_to_strip:
        if name.endswith(suffix):
            input_name = name[: -len(suffix)]
            # Try common extensions
            for ext in [".bam", ".fastq.gz", ".fq.gz", ".fa.gz", ".fasta.gz"]:
                candidate = workflow_dir / (input_name + ext)
                if candidate.exists():
                    try:
                        return candidate.stat().st_size
                    except OSError:
                        continue

    # No input found
    return None