
Core API Reference

Complete API reference for ParquetFrame core functionality.

Core Classes

ParquetFrame

The main class for working with parquet data.

parquetframe.ParquetFrame

A wrapper for pandas and Dask DataFrames to simplify working with parquet files.

The class automatically switches between pandas and Dask based on file size or a manual flag. It delegates all standard DataFrame methods to the active internal dataframe.

Examples:

>>> import parquetframe as pqf
>>> # Read file with automatic backend selection
>>> pf = pqf.pf.read("data.parquet")
>>> # Manual backend control
>>> pf = pqf.pf.read("data", islazy=True)  # Force Dask
>>> # Standard DataFrame operations work transparently
>>> result = pf.groupby("column").sum()
Source code in src/parquetframe/core.py
class ParquetFrame:
    """
    A wrapper for pandas and Dask DataFrames to simplify working with parquet files.

    The class automatically switches between pandas and Dask based on file size
    or a manual flag. It delegates all standard DataFrame methods to the active
    internal dataframe.

    Examples:
        >>> import parquetframe as pqf
        >>> # Read file with automatic backend selection
        >>> pf = pqf.pf.read("data.parquet")
        >>> # Manual backend control
        >>> pf = pqf.pf.read("data", islazy=True)  # Force Dask
        >>> # Standard DataFrame operations work transparently
        >>> result = pf.groupby("column").sum()
    """

    def __init__(
        self,
        df: Optional[Union[pd.DataFrame, dd.DataFrame]] = None,
        islazy: bool = False,
        track_history: bool = False,
    ) -> None:
        """
        Initialize the ParquetFrame.

        Args:
            df: An initial dataframe (pandas or Dask).
            islazy: If True, forces a Dask DataFrame.
            track_history: If True, enables history tracking for CLI sessions.
        """
        self._df = df
        self._islazy = islazy
        self.DEFAULT_THRESHOLD_MB = 10
        self._track_history = track_history
        self._history = [] if track_history else None

    @property
    def islazy(self) -> bool:
        """Get the current backend type (True for Dask, False for pandas)."""
        return self._islazy

    @islazy.setter
    def islazy(self, value: bool) -> None:
        """Set the backend type and convert the dataframe if necessary."""
        if not isinstance(value, bool):
            raise TypeError("islazy must be a boolean")
        if value != self._islazy:
            if value:
                self.to_dask()
            else:
                self.to_pandas()

    def __repr__(self) -> str:
        """String representation of the object."""
        df_type = "Dask" if self.islazy else "pandas"
        if self._df is None:
            return f"ParquetFrame(type={df_type}, df=None)"
        return f"ParquetFrame(type={df_type}, df={self._df.__repr__()})"

    def __getitem__(self, key):
        """
        Support indexing operations (df[column] or df[columns]).
        """
        if self._df is None:
            raise ValueError("No dataframe loaded")

        result = self._df[key]

        # Track operation in history if enabled
        if self._track_history:
            if isinstance(key, list):
                key_repr = repr(key)
            else:
                key_repr = repr(key)
            self._history.append(f"pf = pf[{key_repr}]")

        # If result is a dataframe, wrap it
        if isinstance(result, (pd.DataFrame, dd.DataFrame)):
            new_pf = ParquetFrame(
                result,
                isinstance(result, dd.DataFrame),
                track_history=self._track_history,
            )
            # Inherit history from parent if tracking
            if self._track_history:
                new_pf._history = self._history.copy()
            return new_pf
        return result

    def __len__(self) -> int:
        """
        Return the length of the dataframe.
        """
        if self._df is None:
            return 0
        return len(self._df)

    def __getattr__(self, name: str) -> Any:
        """
        Delegate attribute access to the underlying dataframe.

        This method is called for attributes not found in the ParquetFrame instance.
        It forwards the call to the internal dataframe (_df).
        """
        if self._df is not None:
            attr = getattr(self._df, name)
            if callable(attr):

                def wrapper(*args, **kwargs):
                    # Track operation in history if enabled
                    if self._track_history:
                        args_repr = [repr(arg) for arg in args]
                        kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()]
                        call_repr = f".{name}({', '.join(args_repr + kwargs_repr)})"
                        self._history.append(f"pf = pf{call_repr}")

                    result = attr(*args, **kwargs)
                    # If the result is a dataframe, wrap it in a new ParquetFrame
                    if isinstance(result, (pd.DataFrame, dd.DataFrame)):
                        new_pf = ParquetFrame(
                            result,
                            isinstance(result, dd.DataFrame),
                            track_history=self._track_history,
                        )
                        # Inherit history from parent if tracking
                        if self._track_history:
                            new_pf._history = self._history.copy()
                        return new_pf
                    return result

                return wrapper
            return attr
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    @classmethod
    def _estimate_memory_usage(cls, file_path: Path) -> float:
        """Estimate memory usage for loading file based on file size and compression."""
        try:
            import pyarrow.parquet as pq

            # Get file metadata without loading full file
            _ = pq.ParquetFile(file_path).metadata

            # Estimate memory usage based on:
            # 1. Uncompressed size (compressed size * expansion factor)
            # 2. Data types (strings use more memory than numbers)
            # 3. Null values (can reduce memory usage)

            compressed_size = file_path.stat().st_size / 1024 / 1024  # MB

            # Estimate compression ratio (typical parquet compression is 3-10x)
            # Use conservative estimate of 4x expansion
            expansion_factor = 4.0

            # Additional overhead for pandas DataFrame structure
            pandas_overhead = 1.5

            estimated_memory = compressed_size * expansion_factor * pandas_overhead

            return estimated_memory

        except Exception:
            # Fallback to simple file size estimation
            file_size_mb = file_path.stat().st_size / 1024 / 1024
            return file_size_mb * 5  # Conservative estimate

    @classmethod
    def _get_system_memory(cls) -> float:
        """Get available system memory in MB."""
        try:
            import psutil

            # Get available memory (not just free, but actually available)
            available_mb = psutil.virtual_memory().available / 1024 / 1024
            return available_mb
        except (ImportError, Exception):
            # Conservative fallback if psutil not available or fails
            return 2048  # Assume 2GB available

    @classmethod
    def _should_use_dask(
        cls, file_path: Path, threshold_mb: float, islazy: Optional[bool] = None
    ) -> bool:
        """Intelligently determine whether to use Dask based on multiple factors."""
        if islazy is not None:
            return islazy

        file_size_mb = file_path.stat().st_size / 1024 / 1024

        # Basic threshold check
        if file_size_mb >= threshold_mb:
            return True

        # Advanced checks if file is close to threshold
        if file_size_mb >= threshold_mb * 0.7:  # Within 70% of threshold
            estimated_memory = cls._estimate_memory_usage(file_path)
            available_memory = cls._get_system_memory()

            # Use Dask if estimated memory usage > 50% of available memory
            if estimated_memory > available_memory * 0.5:
                return True

            # Use Dask if file has many partitions (suggests it's meant for parallel processing)
            try:
                import pyarrow.parquet as pq

                metadata = pq.ParquetFile(file_path).metadata
                if metadata.num_row_groups > 10:  # Many row groups suggest chunked data
                    return True
            except Exception:  # nosec B110
                # Intentional broad exception handling for metadata parsing robustness
                # Failure to determine row groups should not crash file read operation
                pass

        return False

    @classmethod
    def read(
        cls,
        file: Union[str, Path],
        threshold_mb: Optional[float] = None,
        islazy: Optional[bool] = None,
        **kwargs,
    ) -> "ParquetFrame":
        """
        Read a parquet file into a ParquetFrame.

        Automatically selects pandas or Dask based on file size, unless overridden.
        Handles file extension detection automatically.

        Args:
            file: Path to the parquet file (extension optional).
            threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
            islazy: Force backend selection (True=Dask, False=pandas, None=auto).
            **kwargs: Additional keyword arguments for read_parquet methods.

        Returns:
            ParquetFrame instance with loaded data.

        Raises:
            FileNotFoundError: If no parquet file is found.

        Examples:
            >>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
            >>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
            >>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
        """
        # Support URLs and fsspec-based paths without local existence checks
        file_str = str(file)
        is_url = file_str.startswith(("http://", "https://", "s3://", "gs://"))

        if is_url:
            file_path = file_str  # pass through to reader
            file_size_mb = 0.0
        else:
            file_path = cls._resolve_file_path(file)
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Validate islazy parameter
        if islazy is not None and not isinstance(islazy, bool):
            raise TypeError("islazy parameter must be a boolean or None")

        # Determine backend using explicit override first
        threshold = threshold_mb if threshold_mb is not None else 10
        if islazy is not None:
            use_dask = bool(islazy)
        else:
            # Intelligent switching when not explicitly specified
            use_dask = cls._should_use_dask(
                Path(file_path) if not is_url else Path("."), threshold, None
            )

        # Read the file
        if use_dask:
            df = dd.read_parquet(file_path, **kwargs)
            print(
                f"Reading '{file_path}' as Dask DataFrame (size: {file_size_mb:.2f} MB)"
            )
        else:
            df = pd.read_parquet(file_path, **kwargs)
            print(
                f"Reading '{file_path}' as pandas DataFrame (size: {file_size_mb:.2f} MB)"
            )

        instance = cls(df, use_dask)
        # Track read operation in history if needed
        if hasattr(cls, "_current_session_tracking") and cls._current_session_tracking:
            instance._track_history = True
            instance._history = [
                f"pf = ParquetFrame.read('{file}', threshold_mb={threshold_mb}, islazy={islazy})"
            ]
        return instance

    def save(
        self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
    ) -> "ParquetFrame":
        """
        Save the dataframe to a parquet file.

        Automatically adds .parquet extension if not present.
        Works with both pandas and Dask dataframes.

        Args:
            file: Base name for the output file.
            save_script: If provided, saves session history to this Python script.
            **kwargs: Additional keyword arguments for to_parquet methods.

        Returns:
            Self for method chaining.

        Raises:
            TypeError: If no dataframe is loaded.

        Examples:
            >>> pf.save("output")  # Saves as output.parquet
            >>> pf.save("data.parquet", compression='snappy')
            >>> pf.save("output", save_script="session.py")  # Also saves session history
        """
        if self._df is None:
            raise TypeError("No dataframe loaded to save.")

        file_path = self._ensure_parquet_extension(file)

        # Track save operation in history
        if self._track_history:
            save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
            if save_script:
                save_args.append(f"save_script='{save_script}'")
            self._history.append(f"pf.save({', '.join(save_args)})")

        # Ensure parent directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)

        if isinstance(self._df, dd.DataFrame):
            self._df.to_parquet(file_path, **kwargs)
            print(f"Dask DataFrame saved to '{file_path}'.")
        elif isinstance(self._df, pd.DataFrame):
            self._df.to_parquet(file_path, **kwargs)
            print(f"pandas DataFrame saved to '{file_path}'.")

        # Save script if requested
        if save_script and self._track_history:
            self._save_history_script(save_script)

        return self

    def to_pandas(self) -> "ParquetFrame":
        """
        Convert the internal Dask dataframe to a pandas dataframe.

        Returns:
            Self for method chaining.
        """
        if self.islazy and isinstance(self._df, dd.DataFrame):
            self._df = self._df.compute()
            # Normalize string dtype back to object for consistency with tests
            try:
                string_cols = list(self._df.select_dtypes(include="string").columns)
                if string_cols:
                    self._df[string_cols] = self._df[string_cols].astype("object")
            except Exception:  # nosec B110
                # Intentional broad exception handling for dtype conversion robustness
                # Failure to normalize string dtypes should not crash pandas conversion
                pass
            self._islazy = False
            print("Converted to pandas DataFrame.")
        else:
            print("Already a pandas DataFrame.")
        return self

    def to_dask(self, npartitions: Optional[int] = None) -> "ParquetFrame":
        """
        Convert the internal pandas dataframe to a Dask dataframe.

        Args:
            npartitions: Number of partitions for the Dask dataframe.
                       Defaults to the number of CPU cores.

        Returns:
            Self for method chaining.
        """
        if npartitions is not None and npartitions <= 0:
            raise ValueError("npartitions must be a positive integer")
        if not self.islazy and isinstance(self._df, pd.DataFrame):
            npart = npartitions if npartitions is not None else os.cpu_count() or 1
            self._df = dd.from_pandas(self._df, npartitions=npart)
            self._islazy = True
            print("Converted to Dask DataFrame.")
        else:
            print("Already a Dask DataFrame.")
        return self

    def sql(self, query: str, **other_frames: "ParquetFrame") -> "ParquetFrame":
        """
        Execute a SQL query on this ParquetFrame using DuckDB.

        The current ParquetFrame is available as 'df' in the query.
        Additional ParquetFrames can be passed as keyword arguments.

        Args:
            query: SQL query string to execute.
            **other_frames: Additional ParquetFrames to use in JOINs.

        Returns:
            New ParquetFrame with query results (always pandas backend).

        Raises:
            ImportError: If DuckDB is not installed.
            ValueError: If query execution fails.

        Examples:
            >>> # Simple query
            >>> result = pf.sql("SELECT * FROM df WHERE age > 25")
            >>>
            >>> # JOIN with another ParquetFrame
            >>> orders = pf.sql(
            ...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
            ...     customers=customers_pf
            ... )
        """
        if self._df is None:
            raise ValueError("No dataframe loaded for SQL query")

        from .sql import query_dataframes

        # Convert other ParquetFrames to their underlying DataFrames
        other_dfs = {name: pf._df for name, pf in other_frames.items()}

        # Execute SQL query
        result_df = query_dataframes(self._df, query, other_dfs)

        # Return as pandas-backed ParquetFrame (SQL results are always pandas)
        return self.__class__(result_df, islazy=False)

    @property
    def bio(self):
        """
        Access bioframe functions with intelligent parallel dispatching.

        Returns BioAccessor that automatically chooses between pandas (eager)
        and Dask (parallel) implementations based on the current backend.

        Returns:
            BioAccessor instance for genomic operations.

        Raises:
            ImportError: If bioframe is not installed.

        Examples:
            >>> # Cluster genomic intervals
            >>> clustered = pf.bio.cluster(min_dist=1000)
            >>>
            >>> # Find overlaps with another ParquetFrame
            >>> overlaps = pf.bio.overlap(other_pf, broadcast=True)
        """
        from .bio import BioAccessor

        return BioAccessor(self)

    @staticmethod
    def _resolve_file_path(file: Union[str, Path]) -> Path:
        """
        Resolve file path and handle extension detection.

        Args:
            file: Input file path.

        Returns:
            Resolved Path object.

        Raises:
            FileNotFoundError: If no parquet file variant is found.
        """
        file_path = Path(file)

        # If extension is already present, use as-is
        if file_path.suffix in (".parquet", ".pqt"):
            if file_path.exists():
                return file_path
            else:
                raise FileNotFoundError(f"File not found: {file_path}")

        # Try different extensions
        for ext in [".parquet", ".pqt"]:
            candidate = file_path.with_suffix(ext)
            if candidate.exists():
                return candidate

        raise FileNotFoundError(
            f"No parquet file found for '{file}' (tried .parquet, .pqt)"
        )

    def _save_history_script(self, script_path: Union[str, Path]) -> None:
        """
        Save the session history to a Python script.

        Args:
            script_path: Path to save the script to.
        """
        if not self._track_history or not self._history:
            print("No history to save (history tracking not enabled or empty).")
            return

        script_path = Path(script_path)
        if script_path.suffix != ".py":
            script_path = script_path.with_suffix(".py")

        header = "# Auto-generated script from ParquetFrame CLI session\n"
        header += (
            "from parquetframe import ParquetFrame\nimport parquetframe as pqf\n\n"
        )

        with open(script_path, "w") as f:
            f.write(header)
            for line in self._history:
                f.write(line + "\n")

        print(f"Session history saved to '{script_path}'")

    def get_history(self) -> Optional[list]:
        """
        Get the current session history.

        Returns:
            List of command strings if history tracking is enabled, None otherwise.
        """
        return self._history.copy() if self._track_history else None

    def clear_history(self) -> None:
        """
        Clear the session history.
        """
        if self._track_history:
            self._history.clear()
            print("Session history cleared.")
        else:
            print("History tracking not enabled.")

    @staticmethod
    def _ensure_parquet_extension(file: Union[str, Path]) -> Path:
        """
        Ensure the file path has a parquet extension.

        Args:
            file: Input file path.

        Returns:
            Path with appropriate parquet extension.
        """
        file_path = Path(file)
        if file_path.suffix not in (".parquet", ".pqt"):
            return file_path.with_suffix(".parquet")
        return file_path

islazy property writable

Get the current backend type (True for Dask, False for pandas).
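
A minimal sketch of switching backends through this property, assuming a small local file data.parquet exists; assigning a new value converts the dataframe in place via to_dask() or to_pandas():

>>> pf = pqf.read("data.parquet")
>>> pf.islazy          # False while the pandas backend is active
False
>>> pf.islazy = True   # converts in place via to_dask()
>>> pf.islazy = False  # converts back via to_pandas()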

bio property

Access bioframe functions with intelligent parallel dispatching.

Returns BioAccessor that automatically chooses between pandas (eager) and Dask (parallel) implementations based on the current backend.

Returns:

    BioAccessor instance for genomic operations.

Raises:

    ImportError: If bioframe is not installed.

Examples:

>>> # Cluster genomic intervals
>>> clustered = pf.bio.cluster(min_dist=1000)
>>>
>>> # Find overlaps with another ParquetFrame
>>> overlaps = pf.bio.overlap(other_pf, broadcast=True)

__init__(df=None, islazy=False, track_history=False)

Initialize the ParquetFrame.

Parameters:

    df (Optional[Union[pd.DataFrame, dd.DataFrame]], default None): An initial dataframe (pandas or Dask).
    islazy (bool, default False): If True, forces a Dask DataFrame.
    track_history (bool, default False): If True, enables history tracking for CLI sessions.

Source code in src/parquetframe/core.py
def __init__(
    self,
    df: Optional[Union[pd.DataFrame, dd.DataFrame]] = None,
    islazy: bool = False,
    track_history: bool = False,
) -> None:
    """
    Initialize the ParquetFrame.

    Args:
        df: An initial dataframe (pandas or Dask).
        islazy: If True, forces a Dask DataFrame.
        track_history: If True, enables history tracking for CLI sessions.
    """
    self._df = df
    self._islazy = islazy
    self.DEFAULT_THRESHOLD_MB = 10
    self._track_history = track_history
    self._history = [] if track_history else None

__repr__()

String representation of the object.

Source code in src/parquetframe/core.py
def __repr__(self) -> str:
    """String representation of the object."""
    df_type = "Dask" if self.islazy else "pandas"
    if self._df is None:
        return f"ParquetFrame(type={df_type}, df=None)"
    return f"ParquetFrame(type={df_type}, df={self._df.__repr__()})"

__getitem__(key)

Support indexing operations (df[column] or df[columns]).

Source code in src/parquetframe/core.py
def __getitem__(self, key):
    """
    Support indexing operations (df[column] or df[columns]).
    """
    if self._df is None:
        raise ValueError("No dataframe loaded")

    result = self._df[key]

    # Track operation in history if enabled
    if self._track_history:
        if isinstance(key, list):
            key_repr = repr(key)
        else:
            key_repr = repr(key)
        self._history.append(f"pf = pf[{key_repr}]")

    # If result is a dataframe, wrap it
    if isinstance(result, (pd.DataFrame, dd.DataFrame)):
        new_pf = ParquetFrame(
            result,
            isinstance(result, dd.DataFrame),
            track_history=self._track_history,
        )
        # Inherit history from parent if tracking
        if self._track_history:
            new_pf._history = self._history.copy()
        return new_pf
    return result

__len__()

Return the length of the dataframe.

Source code in src/parquetframe/core.py
def __len__(self) -> int:
    """
    Return the length of the dataframe.
    """
    if self._df is None:
        return 0
    return len(self._df)

__getattr__(name)

Delegate attribute access to the underlying dataframe.

This method is called for attributes not found in the ParquetFrame instance. It forwards the call to the internal dataframe (_df).

Source code in src/parquetframe/core.py
def __getattr__(self, name: str) -> Any:
    """
    Delegate attribute access to the underlying dataframe.

    This method is called for attributes not found in the ParquetFrame instance.
    It forwards the call to the internal dataframe (_df).
    """
    if self._df is not None:
        attr = getattr(self._df, name)
        if callable(attr):

            def wrapper(*args, **kwargs):
                # Track operation in history if enabled
                if self._track_history:
                    args_repr = [repr(arg) for arg in args]
                    kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()]
                    call_repr = f".{name}({', '.join(args_repr + kwargs_repr)})"
                    self._history.append(f"pf = pf{call_repr}")

                result = attr(*args, **kwargs)
                # If the result is a dataframe, wrap it in a new ParquetFrame
                if isinstance(result, (pd.DataFrame, dd.DataFrame)):
                    new_pf = ParquetFrame(
                        result,
                        isinstance(result, dd.DataFrame),
                        track_history=self._track_history,
                    )
                    # Inherit history from parent if tracking
                    if self._track_history:
                        new_pf._history = self._history.copy()
                    return new_pf
                return result

            return wrapper
        return attr
    raise AttributeError(
        f"'{type(self).__name__}' object has no attribute '{name}'"
    )
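
A minimal sketch of this delegation, assuming pf wraps a loaded DataFrame: plain attributes pass straight through, while method calls that return a DataFrame come back wrapped in a new ParquetFrame:

>>> cols = pf.columns        # delegated attribute access
>>> first = pf.head(5)       # DataFrame result is wrapped
>>> type(first)
<class 'parquetframe.core.ParquetFrame'>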

read(file, threshold_mb=None, islazy=None, **kwargs) classmethod

Read a parquet file into a ParquetFrame.

Automatically selects pandas or Dask based on file size, unless overridden. Handles file extension detection automatically.

Parameters:

    file (Union[str, Path], required): Path to the parquet file (extension optional).
    threshold_mb (Optional[float], default None): Size threshold in MB for backend selection. Defaults to 10 MB.
    islazy (Optional[bool], default None): Force backend selection (True=Dask, False=pandas, None=auto).
    **kwargs: Additional keyword arguments for read_parquet methods.

Returns:

    ParquetFrame: ParquetFrame instance with loaded data.

Raises:

    FileNotFoundError: If no parquet file is found.

Examples:

>>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
>>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
>>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
Source code in src/parquetframe/core.py
@classmethod
def read(
    cls,
    file: Union[str, Path],
    threshold_mb: Optional[float] = None,
    islazy: Optional[bool] = None,
    **kwargs,
) -> "ParquetFrame":
    """
    Read a parquet file into a ParquetFrame.

    Automatically selects pandas or Dask based on file size, unless overridden.
    Handles file extension detection automatically.

    Args:
        file: Path to the parquet file (extension optional).
        threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
        islazy: Force backend selection (True=Dask, False=pandas, None=auto).
        **kwargs: Additional keyword arguments for read_parquet methods.

    Returns:
        ParquetFrame instance with loaded data.

    Raises:
        FileNotFoundError: If no parquet file is found.

    Examples:
        >>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
        >>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
        >>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
    """
    # Support URLs and fsspec-based paths without local existence checks
    file_str = str(file)
    is_url = file_str.startswith(("http://", "https://", "s3://", "gs://"))

    if is_url:
        file_path = file_str  # pass through to reader
        file_size_mb = 0.0
    else:
        file_path = cls._resolve_file_path(file)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    # Validate islazy parameter
    if islazy is not None and not isinstance(islazy, bool):
        raise TypeError("islazy parameter must be a boolean or None")

    # Determine backend using explicit override first
    threshold = threshold_mb if threshold_mb is not None else 10
    if islazy is not None:
        use_dask = bool(islazy)
    else:
        # Intelligent switching when not explicitly specified
        use_dask = cls._should_use_dask(
            Path(file_path) if not is_url else Path("."), threshold, None
        )

    # Read the file
    if use_dask:
        df = dd.read_parquet(file_path, **kwargs)
        print(
            f"Reading '{file_path}' as Dask DataFrame (size: {file_size_mb:.2f} MB)"
        )
    else:
        df = pd.read_parquet(file_path, **kwargs)
        print(
            f"Reading '{file_path}' as pandas DataFrame (size: {file_size_mb:.2f} MB)"
        )

    instance = cls(df, use_dask)
    # Track read operation in history if needed
    if hasattr(cls, "_current_session_tracking") and cls._current_session_tracking:
        instance._track_history = True
        instance._history = [
            f"pf = ParquetFrame.read('{file}', threshold_mb={threshold_mb}, islazy={islazy})"
        ]
    return instance

save(file, save_script=None, **kwargs)

Save the dataframe to a parquet file.

Automatically adds .parquet extension if not present. Works with both pandas and Dask dataframes.

Parameters:

    file (Union[str, Path], required): Base name for the output file.
    save_script (Optional[str], default None): If provided, saves session history to this Python script.
    **kwargs: Additional keyword arguments for to_parquet methods.

Returns:

    ParquetFrame: Self for method chaining.

Raises:

    TypeError: If no dataframe is loaded.

Examples:

>>> pf.save("output")  # Saves as output.parquet
>>> pf.save("data.parquet", compression='snappy')
>>> pf.save("output", save_script="session.py")  # Also saves session history
Source code in src/parquetframe/core.py
def save(
    self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
) -> "ParquetFrame":
    """
    Save the dataframe to a parquet file.

    Automatically adds .parquet extension if not present.
    Works with both pandas and Dask dataframes.

    Args:
        file: Base name for the output file.
        save_script: If provided, saves session history to this Python script.
        **kwargs: Additional keyword arguments for to_parquet methods.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If no dataframe is loaded.

    Examples:
        >>> pf.save("output")  # Saves as output.parquet
        >>> pf.save("data.parquet", compression='snappy')
        >>> pf.save("output", save_script="session.py")  # Also saves session history
    """
    if self._df is None:
        raise TypeError("No dataframe loaded to save.")

    file_path = self._ensure_parquet_extension(file)

    # Track save operation in history
    if self._track_history:
        save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
        if save_script:
            save_args.append(f"save_script='{save_script}'")
        self._history.append(f"pf.save({', '.join(save_args)})")

    # Ensure parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(self._df, dd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"Dask DataFrame saved to '{file_path}'.")
    elif isinstance(self._df, pd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"pandas DataFrame saved to '{file_path}'.")

    # Save script if requested
    if save_script and self._track_history:
        self._save_history_script(save_script)

    return self

to_pandas()

Convert the internal Dask dataframe to a pandas dataframe.

Returns:

    ParquetFrame: Self for method chaining.

Source code in src/parquetframe/core.py
def to_pandas(self) -> "ParquetFrame":
    """
    Convert the internal Dask dataframe to a pandas dataframe.

    Returns:
        Self for method chaining.
    """
    if self.islazy and isinstance(self._df, dd.DataFrame):
        self._df = self._df.compute()
        # Normalize string dtype back to object for consistency with tests
        try:
            string_cols = list(self._df.select_dtypes(include="string").columns)
            if string_cols:
                self._df[string_cols] = self._df[string_cols].astype("object")
        except Exception:  # nosec B110
            # Intentional broad exception handling for dtype conversion robustness
            # Failure to normalize string dtypes should not crash pandas conversion
            pass
        self._islazy = False
        print("Converted to pandas DataFrame.")
    else:
        print("Already a pandas DataFrame.")
    return self

to_dask(npartitions=None)

Convert the internal pandas dataframe to a Dask dataframe.

Parameters:

    npartitions (Optional[int], default None): Number of partitions for the Dask dataframe. Defaults to the number of CPU cores.

Returns:

    ParquetFrame: Self for method chaining.

Source code in src/parquetframe/core.py
def to_dask(self, npartitions: Optional[int] = None) -> "ParquetFrame":
    """
    Convert the internal pandas dataframe to a Dask dataframe.

    Args:
        npartitions: Number of partitions for the Dask dataframe.
                   Defaults to the number of CPU cores.

    Returns:
        Self for method chaining.
    """
    if npartitions is not None and npartitions <= 0:
        raise ValueError("npartitions must be a positive integer")
    if not self.islazy and isinstance(self._df, pd.DataFrame):
        npart = npartitions if npartitions is not None else os.cpu_count() or 1
        self._df = dd.from_pandas(self._df, npartitions=npart)
        self._islazy = True
        print("Converted to Dask DataFrame.")
    else:
        print("Already a Dask DataFrame.")
    return self
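
A minimal round-trip sketch of explicit backend conversion, assuming pf currently wraps a pandas DataFrame:

>>> pf = pf.to_dask(npartitions=4)   # pandas -> Dask with 4 partitions
Converted to Dask DataFrame.
>>> pf = pf.to_pandas()              # Dask -> pandas (runs compute())
Converted to pandas DataFrame.

Both methods return self, so conversions can be chained with other calls such as save().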

sql(query, **other_frames)

Execute a SQL query on this ParquetFrame using DuckDB.

The current ParquetFrame is available as 'df' in the query. Additional ParquetFrames can be passed as keyword arguments.

Parameters:

    query (str, required): SQL query string to execute.
    **other_frames (ParquetFrame): Additional ParquetFrames to use in JOINs.

Returns:

    ParquetFrame: New ParquetFrame with query results (always pandas backend).

Raises:

    ImportError: If DuckDB is not installed.
    ValueError: If query execution fails.

Examples:

>>> # Simple query
>>> result = pf.sql("SELECT * FROM df WHERE age > 25")
>>>
>>> # JOIN with another ParquetFrame
>>> orders = pf.sql(
...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
...     customers=customers_pf
... )
Source code in src/parquetframe/core.py
def sql(self, query: str, **other_frames: "ParquetFrame") -> "ParquetFrame":
    """
    Execute a SQL query on this ParquetFrame using DuckDB.

    The current ParquetFrame is available as 'df' in the query.
    Additional ParquetFrames can be passed as keyword arguments.

    Args:
        query: SQL query string to execute.
        **other_frames: Additional ParquetFrames to use in JOINs.

    Returns:
        New ParquetFrame with query results (always pandas backend).

    Raises:
        ImportError: If DuckDB is not installed.
        ValueError: If query execution fails.

    Examples:
        >>> # Simple query
        >>> result = pf.sql("SELECT * FROM df WHERE age > 25")
        >>>
        >>> # JOIN with another ParquetFrame
        >>> orders = pf.sql(
        ...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
        ...     customers=customers_pf
        ... )
    """
    if self._df is None:
        raise ValueError("No dataframe loaded for SQL query")

    from .sql import query_dataframes

    # Convert other ParquetFrames to their underlying DataFrames
    other_dfs = {name: pf._df for name, pf in other_frames.items()}

    # Execute SQL query
    result_df = query_dataframes(self._df, query, other_dfs)

    # Return as pandas-backed ParquetFrame (SQL results are always pandas)
    return self.__class__(result_df, islazy=False)

get_history()

Get the current session history.

Returns:

    Optional[list]: List of command strings if history tracking is enabled, None otherwise.

Source code in src/parquetframe/core.py
def get_history(self) -> Optional[list]:
    """
    Get the current session history.

    Returns:
        List of command strings if history tracking is enabled, None otherwise.
    """
    return self._history.copy() if self._track_history else None

clear_history()

Clear the session history.

Source code in src/parquetframe/core.py
def clear_history(self) -> None:
    """
    Clear the session history.
    """
    if self._track_history:
        self._history.clear()
        print("Session history cleared.")
    else:
        print("History tracking not enabled.")

Core Functions

Reading Data

Functions for loading data from various sources.

parquetframe.read(file, threshold_mb=None, islazy=None, **kwargs)

Read a parquet file into a ParquetFrame.

This is a convenience function that wraps ParquetFrame.read().

Parameters:

    file (Union[str, Path], required): Path to the parquet file (extension optional).
    threshold_mb (Optional[float], default None): Size threshold in MB for backend selection. Defaults to 10 MB.
    islazy (Optional[bool], default None): Force backend selection (True=Dask, False=pandas, None=auto).
    **kwargs: Additional keyword arguments for read_parquet methods.

Returns:

    ParquetFrame: ParquetFrame instance with loaded data.

Examples:

>>> import parquetframe as pqf
>>> df = pqf.read("data")  # Auto-detect extension and backend
>>> df = pqf.read("data.parquet", threshold_mb=50)
>>> df = pqf.read("data", islazy=True)  # Force Dask
Source code in src/parquetframe/__init__.py
def read(
    file: Union[str, Path],
    threshold_mb: Optional[float] = None,
    islazy: Optional[bool] = None,
    **kwargs,
) -> ParquetFrame:
    """
    Read a parquet file into a ParquetFrame.

    This is a convenience function that wraps ParquetFrame.read().

    Args:
        file: Path to the parquet file (extension optional).
        threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
        islazy: Force backend selection (True=Dask, False=pandas, None=auto).
        **kwargs: Additional keyword arguments for read_parquet methods.

    Returns:
        ParquetFrame instance with loaded data.

    Examples:
        >>> import parquetframe as pqf
        >>> df = pqf.read("data")  # Auto-detect extension and backend
        >>> df = pqf.read("data.parquet", threshold_mb=50)
        >>> df = pqf.read("data", islazy=True)  # Force Dask
    """
    return ParquetFrame.read(file, threshold_mb=threshold_mb, islazy=islazy, **kwargs)

Writing Data

Functions for saving data to various formats.

parquetframe.ParquetFrame.save(file, save_script=None, **kwargs)

Save the dataframe to a parquet file.

Automatically adds .parquet extension if not present. Works with both pandas and Dask dataframes.

Parameters:

    file (Union[str, Path], required): Base name for the output file.
    save_script (Optional[str], default None): If provided, saves session history to this Python script.
    **kwargs: Additional keyword arguments for to_parquet methods.

Returns:

    ParquetFrame: Self for method chaining.

Raises:

    TypeError: If no dataframe is loaded.

Examples:

>>> pf.save("output")  # Saves as output.parquet
>>> pf.save("data.parquet", compression='snappy')
>>> pf.save("output", save_script="session.py")  # Also saves session history
Source code in src/parquetframe/core.py
def save(
    self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
) -> "ParquetFrame":
    """
    Save the dataframe to a parquet file.

    Automatically adds .parquet extension if not present.
    Works with both pandas and Dask dataframes.

    Args:
        file: Base name for the output file.
        save_script: If provided, saves session history to this Python script.
        **kwargs: Additional keyword arguments for to_parquet methods.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If no dataframe is loaded.

    Examples:
        >>> pf.save("output")  # Saves as output.parquet
        >>> pf.save("data.parquet", compression='snappy')
        >>> pf.save("output", save_script="session.py")  # Also saves session history
    """
    if self._df is None:
        raise TypeError("No dataframe loaded to save.")

    file_path = self._ensure_parquet_extension(file)

    # Track save operation in history
    if self._track_history:
        save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
        if save_script:
            save_args.append(f"save_script='{save_script}'")
        self._history.append(f"pf.save({', '.join(save_args)})")

    # Ensure parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(self._df, dd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"Dask DataFrame saved to '{file_path}'.")
    elif isinstance(self._df, pd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"pandas DataFrame saved to '{file_path}'.")

    # Save script if requested
    if save_script and self._track_history:
        self._save_history_script(save_script)

    return self

Data Processing

Filtering and Selection

Methods for filtering and selecting data.
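
Because ParquetFrame delegates indexing and method calls to the underlying pandas or Dask DataFrame, the usual selection idioms apply unchanged. A minimal sketch, assuming hypothetical columns age and city exist in the loaded data:

>>> adults = pf[pf["age"] > 18]            # boolean-mask filtering
>>> ny = pf.query("city == 'New York'")    # delegated query()
>>> two_cols = pf[["age", "city"]]         # column selection returns a ParquetFrame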

Aggregation

Methods for data aggregation and grouping.
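
Aggregations also delegate to the active backend; on a Dask-backed frame the result stays lazy until it is computed. A minimal sketch with the same hypothetical columns:

>>> mean_age = pf.groupby("city")["age"].mean()
>>> counts = pf["city"].value_counts()
>>> counts = counts.compute() if pf.islazy else counts   # materialize lazy results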

Transformation

Methods for data transformation and feature engineering.
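
Transformations follow the same delegation pattern, and DataFrame results come back wrapped in a new ParquetFrame. A minimal sketch with hypothetical column names:

>>> pf2 = pf.assign(age_months=pf["age"] * 12)       # derived column
>>> pf3 = pf.rename(columns={"age": "age_years"})    # returns a new ParquetFrame
>>> pf4 = pf.dropna(subset=["city"])                 # delegated cleaning step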

Summary

The core API covers reading, processing, and saving parquet data, selecting the pandas or Dask backend automatically so the same code scales from small files to larger-than-memory datasets.

Examples

import parquetframe as pqf

# Create an empty ParquetFrame instance (no dataframe loaded yet)
df = pqf.ParquetFrame()

# Load data (backend chosen automatically from file size)
df = pqf.read("data.parquet")

# Process data
filtered = df.query("column > 100")
grouped = df.groupby("category").sum()

# Save results
filtered.save("output.parquet")

Further Reading