
Core API Reference

Complete API reference for ParquetFrame core functionality.

Core Classes

ParquetFrame

The main class for working with parquet data.

parquetframe.ParquetFrame

A wrapper for pandas and Dask DataFrames to simplify working with parquet files.

The class automatically switches between pandas and Dask based on file size or a manual flag. It delegates all standard DataFrame methods to the active internal dataframe.

Examples:

>>> import parquetframe as pqf
>>> # Read file with automatic backend selection
>>> pf = pqf.pf.read("data.parquet")
>>> # Manual backend control
>>> pf = pqf.pf.read("data", islazy=True)  # Force Dask
>>> # Standard DataFrame operations work transparently
>>> result = pf.groupby("column").sum()
Source code in src/parquetframe/core.py
class ParquetFrame:
    """
    A wrapper for pandas and Dask DataFrames to simplify working with parquet files.

    The class automatically switches between pandas and Dask based on file size
    or a manual flag. It delegates all standard DataFrame methods to the active
    internal dataframe.

    Examples:
        >>> import parquetframe as pqf
        >>> # Read file with automatic backend selection
        >>> pf = pqf.pf.read("data.parquet")
        >>> # Manual backend control
        >>> pf = pqf.pf.read("data", islazy=True)  # Force Dask
        >>> # Standard DataFrame operations work transparently
        >>> result = pf.groupby("column").sum()
    """

    def __init__(
        self,
        df: Optional[Union[pd.DataFrame, dd.DataFrame]] = None,
        islazy: bool = False,
        track_history: bool = False,
    ) -> None:
        """
        Initialize the ParquetFrame.

        Args:
            df: An initial dataframe (pandas or Dask).
            islazy: If True, forces a Dask DataFrame.
            track_history: If True, enables history tracking for CLI sessions.
        """
        self._df = df
        self._islazy = islazy
        self.DEFAULT_THRESHOLD_MB = 10
        self._track_history = track_history
        self._history = [] if track_history else None

    @property
    def islazy(self) -> bool:
        """Get the current backend type (True for Dask, False for pandas)."""
        return self._islazy

    @islazy.setter
    def islazy(self, value: bool) -> None:
        """Set the backend type and convert the dataframe if necessary."""
        if not isinstance(value, bool):
            raise TypeError("islazy must be a boolean")
        if value != self._islazy:
            if value:
                self.to_dask()
            else:
                self.to_pandas()

    def __repr__(self) -> str:
        """String representation of the object."""
        df_type = "Dask" if self.islazy else "pandas"
        if self._df is None:
            return f"ParquetFrame(type={df_type}, df=None)"
        return f"ParquetFrame(type={df_type}, df={self._df.__repr__()})"

    def __getitem__(self, key):
        """
        Support indexing operations (df[column] or df[columns]).
        """
        if self._df is None:
            raise ValueError("No dataframe loaded")

        result = self._df[key]

        # Track operation in history if enabled
        if self._track_history:
            if isinstance(key, list):
                key_repr = repr(key)
            else:
                key_repr = repr(key)
            self._history.append(f"pf = pf[{key_repr}]")

        # If result is a dataframe, wrap it
        if isinstance(result, (pd.DataFrame, dd.DataFrame)):
            new_pf = ParquetFrame(
                result,
                isinstance(result, dd.DataFrame),
                track_history=self._track_history,
            )
            # Inherit history from parent if tracking
            if self._track_history:
                new_pf._history = self._history.copy()
            return new_pf
        return result

    def __len__(self) -> int:
        """
        Return the length of the dataframe.
        """
        if self._df is None:
            return 0
        return len(self._df)

    def __getattr__(self, name: str) -> Any:
        """
        Delegate attribute access to the underlying dataframe.

        This method is called for attributes not found in the ParquetFrame instance.
        It forwards the call to the internal dataframe (_df).
        """
        if self._df is not None:
            attr = getattr(self._df, name)
            if callable(attr):

                def wrapper(*args, **kwargs):
                    # Track operation in history if enabled
                    if self._track_history:
                        args_repr = [repr(arg) for arg in args]
                        kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()]
                        call_repr = f".{name}({', '.join(args_repr + kwargs_repr)})"
                        self._history.append(f"pf = pf{call_repr}")

                    result = attr(*args, **kwargs)
                    # If the result is a dataframe, wrap it in a new ParquetFrame
                    if isinstance(result, (pd.DataFrame, dd.DataFrame)):
                        new_pf = ParquetFrame(
                            result,
                            isinstance(result, dd.DataFrame),
                            track_history=self._track_history,
                        )
                        # Inherit history from parent if tracking
                        if self._track_history:
                            new_pf._history = self._history.copy()
                        return new_pf
                    return result

                return wrapper
            return attr
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    @classmethod
    def _estimate_memory_usage(cls, file_path: Path) -> float:
        """Estimate memory usage for loading file based on file size and compression."""
        try:
            import pyarrow.parquet as pq

            # Get file metadata without loading full file
            _ = pq.ParquetFile(file_path).metadata

            # Estimate memory usage based on:
            # 1. Uncompressed size (compressed size * expansion factor)
            # 2. Data types (strings use more memory than numbers)
            # 3. Null values (can reduce memory usage)

            compressed_size = file_path.stat().st_size / 1024 / 1024  # MB

            # Estimate compression ratio (typical parquet compression is 3-10x)
            # Use conservative estimate of 4x expansion
            expansion_factor = 4.0

            # Additional overhead for pandas DataFrame structure
            pandas_overhead = 1.5

            estimated_memory = compressed_size * expansion_factor * pandas_overhead

            return estimated_memory

        except Exception:
            # Fallback to simple file size estimation
            file_size_mb = file_path.stat().st_size / 1024 / 1024
            return file_size_mb * 5  # Conservative estimate

    @classmethod
    def _get_system_memory(cls) -> float:
        """Get available system memory in MB."""
        try:
            import psutil

            # Get available memory (not just free, but actually available)
            available_mb = psutil.virtual_memory().available / 1024 / 1024
            return available_mb
        except (ImportError, Exception):
            # Conservative fallback if psutil not available or fails
            return 2048  # Assume 2GB available

    @classmethod
    def _should_use_dask(
        cls, file_path: Path, threshold_mb: float, islazy: Optional[bool] = None
    ) -> bool:
        """Intelligently determine whether to use Dask based on multiple factors."""
        if islazy is not None:
            return islazy

        file_size_mb = file_path.stat().st_size / 1024 / 1024

        # Basic threshold check
        if file_size_mb >= threshold_mb:
            return True

        # Advanced checks if file is close to threshold
        if file_size_mb >= threshold_mb * 0.7:  # Within 70% of threshold
            estimated_memory = cls._estimate_memory_usage(file_path)
            available_memory = cls._get_system_memory()

            # Use Dask if estimated memory usage > 50% of available memory
            if estimated_memory > available_memory * 0.5:
                return True

            # Use Dask if file has many partitions (suggests it's meant for parallel processing)
            try:
                import pyarrow.parquet as pq

                metadata = pq.ParquetFile(file_path).metadata
                if metadata.num_row_groups > 10:  # Many row groups suggest chunked data
                    return True
            except Exception:  # nosec B110
                # Intentional broad exception handling for metadata parsing robustness
                # Failure to determine row groups should not crash file read operation
                pass

        return False

    @classmethod
    def read(
        cls,
        file: Union[str, Path],
        threshold_mb: Optional[float] = None,
        islazy: Optional[bool] = None,
        **kwargs,
    ) -> "ParquetFrame":
        """
        Read a parquet file into a ParquetFrame.

        Automatically selects pandas or Dask based on file size, unless overridden.
        Handles file extension detection automatically.

        Args:
            file: Path to the parquet file (extension optional).
            threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
            islazy: Force backend selection (True=Dask, False=pandas, None=auto).
            **kwargs: Additional keyword arguments for read_parquet methods.

        Returns:
            ParquetFrame instance with loaded data.

        Raises:
            FileNotFoundError: If no parquet file is found.

        Examples:
            >>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
            >>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
            >>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
        """
        # Support URLs and fsspec-based paths without local existence checks
        file_str = str(file)
        is_url = file_str.startswith(("http://", "https://", "s3://", "gs://"))

        if is_url:
            file_path = file_str  # pass through to reader
            file_size_mb = 0.0
        else:
            file_path = cls._resolve_file_path(file)
            file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Validate islazy parameter
        if islazy is not None and not isinstance(islazy, bool):
            raise TypeError("islazy parameter must be a boolean or None")

        # Determine backend using explicit override first
        threshold = threshold_mb if threshold_mb is not None else 10
        if islazy is not None:
            use_dask = bool(islazy)
        else:
            # Intelligent switching when not explicitly specified
            use_dask = cls._should_use_dask(
                Path(file_path) if not is_url else Path("."), threshold, None
            )

        # Read the file
        if use_dask:
            df = dd.read_parquet(file_path, **kwargs)
            print(
                f"Reading '{file_path}' as Dask DataFrame (size: {file_size_mb:.2f} MB)"
            )
        else:
            df = pd.read_parquet(file_path, **kwargs)
            print(
                f"Reading '{file_path}' as pandas DataFrame (size: {file_size_mb:.2f} MB)"
            )

        instance = cls(df, use_dask)
        # Track read operation in history if needed
        if hasattr(cls, "_current_session_tracking") and cls._current_session_tracking:
            instance._track_history = True
            instance._history = [
                f"pf = ParquetFrame.read('{file}', threshold_mb={threshold_mb}, islazy={islazy})"
            ]
        return instance

    def save(
        self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
    ) -> "ParquetFrame":
        """
        Save the dataframe to a parquet file.

        Automatically adds .parquet extension if not present.
        Works with both pandas and Dask dataframes.

        Args:
            file: Base name for the output file.
            save_script: If provided, saves session history to this Python script.
            **kwargs: Additional keyword arguments for to_parquet methods.

        Returns:
            Self for method chaining.

        Raises:
            TypeError: If no dataframe is loaded.

        Examples:
            >>> pf.save("output")  # Saves as output.parquet
            >>> pf.save("data.parquet", compression='snappy')
            >>> pf.save("output", save_script="session.py")  # Also saves session history
        """
        if self._df is None:
            raise TypeError("No dataframe loaded to save.")

        file_path = self._ensure_parquet_extension(file)

        # Track save operation in history
        if self._track_history:
            save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
            if save_script:
                save_args.append(f"save_script='{save_script}'")
            self._history.append(f"pf.save({', '.join(save_args)})")

        # Ensure parent directory exists
        file_path.parent.mkdir(parents=True, exist_ok=True)

        if isinstance(self._df, dd.DataFrame):
            self._df.to_parquet(file_path, **kwargs)
            print(f"Dask DataFrame saved to '{file_path}'.")
        elif isinstance(self._df, pd.DataFrame):
            self._df.to_parquet(file_path, **kwargs)
            print(f"pandas DataFrame saved to '{file_path}'.")

        # Save script if requested
        if save_script and self._track_history:
            self._save_history_script(save_script)

        return self

    def to_pandas(self) -> "ParquetFrame":
        """
        Convert the internal Dask dataframe to a pandas dataframe.

        Returns:
            Self for method chaining.
        """
        if self.islazy and isinstance(self._df, dd.DataFrame):
            self._df = self._df.compute()
            # Normalize string dtype back to object for consistency with tests
            try:
                string_cols = list(self._df.select_dtypes(include="string").columns)
                if string_cols:
                    self._df[string_cols] = self._df[string_cols].astype("object")
            except Exception:  # nosec B110
                # Intentional broad exception handling for dtype conversion robustness
                # Failure to normalize string dtypes should not crash pandas conversion
                pass
            self._islazy = False
            print("Converted to pandas DataFrame.")
        else:
            print("Already a pandas DataFrame.")
        return self

    def to_dask(self, npartitions: Optional[int] = None) -> "ParquetFrame":
        """
        Convert the internal pandas dataframe to a Dask dataframe.

        Args:
            npartitions: Number of partitions for the Dask dataframe.
                       Defaults to the number of CPU cores.

        Returns:
            Self for method chaining.
        """
        if npartitions is not None and npartitions <= 0:
            raise ValueError("npartitions must be a positive integer")
        if not self.islazy and isinstance(self._df, pd.DataFrame):
            npart = npartitions if npartitions is not None else os.cpu_count() or 1
            self._df = dd.from_pandas(self._df, npartitions=npart)
            self._islazy = True
            print("Converted to Dask DataFrame.")
        else:
            print("Already a Dask DataFrame.")
        return self

    def sql(self, query: str, **other_frames: "ParquetFrame") -> "ParquetFrame":
        """
        Execute a SQL query on this ParquetFrame using DuckDB.

        The current ParquetFrame is available as 'df' in the query.
        Additional ParquetFrames can be passed as keyword arguments.

        Args:
            query: SQL query string to execute.
            **other_frames: Additional ParquetFrames to use in JOINs.

        Returns:
            New ParquetFrame with query results (always pandas backend).

        Raises:
            ImportError: If DuckDB is not installed.
            ValueError: If query execution fails.

        Examples:
            >>> # Simple query
            >>> result = pf.sql("SELECT * FROM df WHERE age > 25")
            >>>
            >>> # JOIN with another ParquetFrame
            >>> orders = pf.sql(
            ...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
            ...     customers=customers_pf
            ... )
        """
        if self._df is None:
            raise ValueError("No dataframe loaded for SQL query")

        from .sql import query_dataframes

        # Convert other ParquetFrames to their underlying DataFrames
        other_dfs = {name: pf._df for name, pf in other_frames.items()}

        # Execute SQL query
        result_df = query_dataframes(self._df, query, other_dfs)

        # Return as pandas-backed ParquetFrame (SQL results are always pandas)
        return self.__class__(result_df, islazy=False)

    @property
    def bio(self):
        """
        Access bioframe functions with intelligent parallel dispatching.

        Returns BioAccessor that automatically chooses between pandas (eager)
        and Dask (parallel) implementations based on the current backend.

        Returns:
            BioAccessor instance for genomic operations.

        Raises:
            ImportError: If bioframe is not installed.

        Examples:
            >>> # Cluster genomic intervals
            >>> clustered = pf.bio.cluster(min_dist=1000)
            >>>
            >>> # Find overlaps with another ParquetFrame
            >>> overlaps = pf.bio.overlap(other_pf, broadcast=True)
        """
        from .bio import BioAccessor

        return BioAccessor(self)

    @staticmethod
    def _resolve_file_path(file: Union[str, Path]) -> Path:
        """
        Resolve file path and handle extension detection.

        Args:
            file: Input file path.

        Returns:
            Resolved Path object.

        Raises:
            FileNotFoundError: If no parquet file variant is found.
        """
        file_path = Path(file)

        # If extension is already present, use as-is
        if file_path.suffix in (".parquet", ".pqt"):
            if file_path.exists():
                return file_path
            else:
                raise FileNotFoundError(f"File not found: {file_path}")

        # Try different extensions
        for ext in [".parquet", ".pqt"]:
            candidate = file_path.with_suffix(ext)
            if candidate.exists():
                return candidate

        raise FileNotFoundError(
            f"No parquet file found for '{file}' (tried .parquet, .pqt)"
        )

    def _save_history_script(self, script_path: Union[str, Path]) -> None:
        """
        Save the session history to a Python script.

        Args:
            script_path: Path to save the script to.
        """
        if not self._track_history or not self._history:
            print("No history to save (history tracking not enabled or empty).")
            return

        script_path = Path(script_path)
        if script_path.suffix != ".py":
            script_path = script_path.with_suffix(".py")

        header = "# Auto-generated script from ParquetFrame CLI session\n"
        header += (
            "from parquetframe import ParquetFrame\nimport parquetframe as pqf\n\n"
        )

        with open(script_path, "w") as f:
            f.write(header)
            for line in self._history:
                f.write(line + "\n")

        print(f"Session history saved to '{script_path}'")

    def get_history(self) -> Optional[list]:
        """
        Get the current session history.

        Returns:
            List of command strings if history tracking is enabled, None otherwise.
        """
        return self._history.copy() if self._track_history else None

    def clear_history(self) -> None:
        """
        Clear the session history.
        """
        if self._track_history:
            self._history.clear()
            print("Session history cleared.")
        else:
            print("History tracking not enabled.")

    @staticmethod
    def _ensure_parquet_extension(file: Union[str, Path]) -> Path:
        """
        Ensure the file path has a parquet extension.

        Args:
            file: Input file path.

        Returns:
            Path with appropriate parquet extension.
        """
        file_path = Path(file)
        if file_path.suffix not in (".parquet", ".pqt"):
            return file_path.with_suffix(".parquet")
        return file_path

islazy property writable

Get the current backend type (True for Dask, False for pandas).
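
A minimal sketch of switching backends through this property, assuming a small local file data.parquet exists; assigning a new value converts the dataframe in place via to_dask() or to_pandas():

>>> pf = pqf.read("data.parquet")
>>> pf.islazy          # False while the pandas backend is active
False
>>> pf.islazy = True   # converts in place via to_dask()
>>> pf.islazy = False  # converts back via to_pandas()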

bio property

Access bioframe functions with intelligent parallel dispatching.

Returns BioAccessor that automatically chooses between pandas (eager) and Dask (parallel) implementations based on the current backend.

Returns:

    BioAccessor instance for genomic operations.

Raises:

    ImportError: If bioframe is not installed.

Examples:

>>> # Cluster genomic intervals
>>> clustered = pf.bio.cluster(min_dist=1000)
>>>
>>> # Find overlaps with another ParquetFrame
>>> overlaps = pf.bio.overlap(other_pf, broadcast=True)

__init__(df=None, islazy=False, track_history=False)

Initialize the ParquetFrame.

Parameters:

    df (Optional[Union[pd.DataFrame, dd.DataFrame]], default None): An initial dataframe (pandas or Dask).
    islazy (bool, default False): If True, forces a Dask DataFrame.
    track_history (bool, default False): If True, enables history tracking for CLI sessions.

Source code in src/parquetframe/core.py
def __init__(
    self,
    df: Optional[Union[pd.DataFrame, dd.DataFrame]] = None,
    islazy: bool = False,
    track_history: bool = False,
) -> None:
    """
    Initialize the ParquetFrame.

    Args:
        df: An initial dataframe (pandas or Dask).
        islazy: If True, forces a Dask DataFrame.
        track_history: If True, enables history tracking for CLI sessions.
    """
    self._df = df
    self._islazy = islazy
    self.DEFAULT_THRESHOLD_MB = 10
    self._track_history = track_history
    self._history = [] if track_history else None

__repr__()

String representation of the object.

Source code in src/parquetframe/core.py
def __repr__(self) -> str:
    """String representation of the object."""
    df_type = "Dask" if self.islazy else "pandas"
    if self._df is None:
        return f"ParquetFrame(type={df_type}, df=None)"
    return f"ParquetFrame(type={df_type}, df={self._df.__repr__()})"

__getitem__(key)

Support indexing operations (df[column] or df[columns]).

Source code in src/parquetframe/core.py
def __getitem__(self, key):
    """
    Support indexing operations (df[column] or df[columns]).
    """
    if self._df is None:
        raise ValueError("No dataframe loaded")

    result = self._df[key]

    # Track operation in history if enabled
    if self._track_history:
        if isinstance(key, list):
            key_repr = repr(key)
        else:
            key_repr = repr(key)
        self._history.append(f"pf = pf[{key_repr}]")

    # If result is a dataframe, wrap it
    if isinstance(result, (pd.DataFrame, dd.DataFrame)):
        new_pf = ParquetFrame(
            result,
            isinstance(result, dd.DataFrame),
            track_history=self._track_history,
        )
        # Inherit history from parent if tracking
        if self._track_history:
            new_pf._history = self._history.copy()
        return new_pf
    return result

__len__()

Return the length of the dataframe.

Source code in src/parquetframe/core.py
def __len__(self) -> int:
    """
    Return the length of the dataframe.
    """
    if self._df is None:
        return 0
    return len(self._df)

__getattr__(name)

Delegate attribute access to the underlying dataframe.

This method is called for attributes not found in the ParquetFrame instance. It forwards the call to the internal dataframe (_df).

Source code in src/parquetframe/core.py
def __getattr__(self, name: str) -> Any:
    """
    Delegate attribute access to the underlying dataframe.

    This method is called for attributes not found in the ParquetFrame instance.
    It forwards the call to the internal dataframe (_df).
    """
    if self._df is not None:
        attr = getattr(self._df, name)
        if callable(attr):

            def wrapper(*args, **kwargs):
                # Track operation in history if enabled
                if self._track_history:
                    args_repr = [repr(arg) for arg in args]
                    kwargs_repr = [f"{k}={v!r}" for k, v in kwargs.items()]
                    call_repr = f".{name}({', '.join(args_repr + kwargs_repr)})"
                    self._history.append(f"pf = pf{call_repr}")

                result = attr(*args, **kwargs)
                # If the result is a dataframe, wrap it in a new ParquetFrame
                if isinstance(result, (pd.DataFrame, dd.DataFrame)):
                    new_pf = ParquetFrame(
                        result,
                        isinstance(result, dd.DataFrame),
                        track_history=self._track_history,
                    )
                    # Inherit history from parent if tracking
                    if self._track_history:
                        new_pf._history = self._history.copy()
                    return new_pf
                return result

            return wrapper
        return attr
    raise AttributeError(
        f"'{type(self).__name__}' object has no attribute '{name}'"
    )
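
A minimal sketch of this delegation, assuming pf wraps a loaded DataFrame: plain attributes pass straight through, while method calls that return a DataFrame come back wrapped in a new ParquetFrame:

>>> cols = pf.columns        # delegated attribute access
>>> first = pf.head(5)       # DataFrame result is wrapped
>>> type(first)
<class 'parquetframe.core.ParquetFrame'>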

read(file, threshold_mb=None, islazy=None, **kwargs) classmethod

Read a parquet file into a ParquetFrame.

Automatically selects pandas or Dask based on file size, unless overridden. Handles file extension detection automatically.

Parameters:

    file (Union[str, Path], required): Path to the parquet file (extension optional).
    threshold_mb (Optional[float], default None): Size threshold in MB for backend selection. Defaults to 10 MB.
    islazy (Optional[bool], default None): Force backend selection (True=Dask, False=pandas, None=auto).
    **kwargs: Additional keyword arguments for read_parquet methods.

Returns:

    ParquetFrame: ParquetFrame instance with loaded data.

Raises:

    FileNotFoundError: If no parquet file is found.

Examples:

>>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
>>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
>>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
Source code in src/parquetframe/core.py
@classmethod
def read(
    cls,
    file: Union[str, Path],
    threshold_mb: Optional[float] = None,
    islazy: Optional[bool] = None,
    **kwargs,
) -> "ParquetFrame":
    """
    Read a parquet file into a ParquetFrame.

    Automatically selects pandas or Dask based on file size, unless overridden.
    Handles file extension detection automatically.

    Args:
        file: Path to the parquet file (extension optional).
        threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
        islazy: Force backend selection (True=Dask, False=pandas, None=auto).
        **kwargs: Additional keyword arguments for read_parquet methods.

    Returns:
        ParquetFrame instance with loaded data.

    Raises:
        FileNotFoundError: If no parquet file is found.

    Examples:
        >>> pf = ParquetFrame.read("data")  # Auto-detects .parquet/.pqt
        >>> pf = ParquetFrame.read("data.parquet", threshold_mb=50)
        >>> pf = ParquetFrame.read("data", islazy=True)  # Force Dask
    """
    # Support URLs and fsspec-based paths without local existence checks
    file_str = str(file)
    is_url = file_str.startswith(("http://", "https://", "s3://", "gs://"))

    if is_url:
        file_path = file_str  # pass through to reader
        file_size_mb = 0.0
    else:
        file_path = cls._resolve_file_path(file)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

    # Validate islazy parameter
    if islazy is not None and not isinstance(islazy, bool):
        raise TypeError("islazy parameter must be a boolean or None")

    # Determine backend using explicit override first
    threshold = threshold_mb if threshold_mb is not None else 10
    if islazy is not None:
        use_dask = bool(islazy)
    else:
        # Intelligent switching when not explicitly specified
        use_dask = cls._should_use_dask(
            Path(file_path) if not is_url else Path("."), threshold, None
        )

    # Read the file
    if use_dask:
        df = dd.read_parquet(file_path, **kwargs)
        print(
            f"Reading '{file_path}' as Dask DataFrame (size: {file_size_mb:.2f} MB)"
        )
    else:
        df = pd.read_parquet(file_path, **kwargs)
        print(
            f"Reading '{file_path}' as pandas DataFrame (size: {file_size_mb:.2f} MB)"
        )

    instance = cls(df, use_dask)
    # Track read operation in history if needed
    if hasattr(cls, "_current_session_tracking") and cls._current_session_tracking:
        instance._track_history = True
        instance._history = [
            f"pf = ParquetFrame.read('{file}', threshold_mb={threshold_mb}, islazy={islazy})"
        ]
    return instance

save(file, save_script=None, **kwargs)

Save the dataframe to a parquet file.

Automatically adds .parquet extension if not present. Works with both pandas and Dask dataframes.

Parameters:

    file (Union[str, Path], required): Base name for the output file.
    save_script (Optional[str], default None): If provided, saves session history to this Python script.
    **kwargs: Additional keyword arguments for to_parquet methods.

Returns:

    ParquetFrame: Self for method chaining.

Raises:

    TypeError: If no dataframe is loaded.

Examples:

>>> pf.save("output")  # Saves as output.parquet
>>> pf.save("data.parquet", compression='snappy')
>>> pf.save("output", save_script="session.py")  # Also saves session history
Source code in src/parquetframe/core.py
def save(
    self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
) -> "ParquetFrame":
    """
    Save the dataframe to a parquet file.

    Automatically adds .parquet extension if not present.
    Works with both pandas and Dask dataframes.

    Args:
        file: Base name for the output file.
        save_script: If provided, saves session history to this Python script.
        **kwargs: Additional keyword arguments for to_parquet methods.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If no dataframe is loaded.

    Examples:
        >>> pf.save("output")  # Saves as output.parquet
        >>> pf.save("data.parquet", compression='snappy')
        >>> pf.save("output", save_script="session.py")  # Also saves session history
    """
    if self._df is None:
        raise TypeError("No dataframe loaded to save.")

    file_path = self._ensure_parquet_extension(file)

    # Track save operation in history
    if self._track_history:
        save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
        if save_script:
            save_args.append(f"save_script='{save_script}'")
        self._history.append(f"pf.save({', '.join(save_args)})")

    # Ensure parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(self._df, dd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"Dask DataFrame saved to '{file_path}'.")
    elif isinstance(self._df, pd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"pandas DataFrame saved to '{file_path}'.")

    # Save script if requested
    if save_script and self._track_history:
        self._save_history_script(save_script)

    return self

to_pandas()

Convert the internal Dask dataframe to a pandas dataframe.

Returns:

    ParquetFrame: Self for method chaining.

Source code in src/parquetframe/core.py
def to_pandas(self) -> "ParquetFrame":
    """
    Convert the internal Dask dataframe to a pandas dataframe.

    Returns:
        Self for method chaining.
    """
    if self.islazy and isinstance(self._df, dd.DataFrame):
        self._df = self._df.compute()
        # Normalize string dtype back to object for consistency with tests
        try:
            string_cols = list(self._df.select_dtypes(include="string").columns)
            if string_cols:
                self._df[string_cols] = self._df[string_cols].astype("object")
        except Exception:  # nosec B110
            # Intentional broad exception handling for dtype conversion robustness
            # Failure to normalize string dtypes should not crash pandas conversion
            pass
        self._islazy = False
        print("Converted to pandas DataFrame.")
    else:
        print("Already a pandas DataFrame.")
    return self

to_dask(npartitions=None)

Convert the internal pandas dataframe to a Dask dataframe.

Parameters:

    npartitions (Optional[int], default None): Number of partitions for the Dask dataframe. Defaults to the number of CPU cores.

Returns:

    ParquetFrame: Self for method chaining.

Source code in src/parquetframe/core.py
def to_dask(self, npartitions: Optional[int] = None) -> "ParquetFrame":
    """
    Convert the internal pandas dataframe to a Dask dataframe.

    Args:
        npartitions: Number of partitions for the Dask dataframe.
                   Defaults to the number of CPU cores.

    Returns:
        Self for method chaining.
    """
    if npartitions is not None and npartitions <= 0:
        raise ValueError("npartitions must be a positive integer")
    if not self.islazy and isinstance(self._df, pd.DataFrame):
        npart = npartitions if npartitions is not None else os.cpu_count() or 1
        self._df = dd.from_pandas(self._df, npartitions=npart)
        self._islazy = True
        print("Converted to Dask DataFrame.")
    else:
        print("Already a Dask DataFrame.")
    return self
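
A minimal round-trip sketch of explicit backend conversion, assuming pf currently wraps a pandas DataFrame:

>>> pf = pf.to_dask(npartitions=4)   # pandas -> Dask with 4 partitions
Converted to Dask DataFrame.
>>> pf = pf.to_pandas()              # Dask -> pandas (runs compute())
Converted to pandas DataFrame.

Both methods return self, so conversions can be chained with other calls such as save().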

sql(query, **other_frames)

Execute a SQL query on this ParquetFrame using DuckDB.

The current ParquetFrame is available as 'df' in the query. Additional ParquetFrames can be passed as keyword arguments.

Parameters:

    query (str, required): SQL query string to execute.
    **other_frames (ParquetFrame): Additional ParquetFrames to use in JOINs.

Returns:

    ParquetFrame: New ParquetFrame with query results (always pandas backend).

Raises:

    ImportError: If DuckDB is not installed.
    ValueError: If query execution fails.

Examples:

>>> # Simple query
>>> result = pf.sql("SELECT * FROM df WHERE age > 25")
>>>
>>> # JOIN with another ParquetFrame
>>> orders = pf.sql(
...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
...     customers=customers_pf
... )
Source code in src/parquetframe/core.py
def sql(self, query: str, **other_frames: "ParquetFrame") -> "ParquetFrame":
    """
    Execute a SQL query on this ParquetFrame using DuckDB.

    The current ParquetFrame is available as 'df' in the query.
    Additional ParquetFrames can be passed as keyword arguments.

    Args:
        query: SQL query string to execute.
        **other_frames: Additional ParquetFrames to use in JOINs.

    Returns:
        New ParquetFrame with query results (always pandas backend).

    Raises:
        ImportError: If DuckDB is not installed.
        ValueError: If query execution fails.

    Examples:
        >>> # Simple query
        >>> result = pf.sql("SELECT * FROM df WHERE age > 25")
        >>>
        >>> # JOIN with another ParquetFrame
        >>> orders = pf.sql(
        ...     "SELECT * FROM df JOIN customers ON df.cust_id = customers.id",
        ...     customers=customers_pf
        ... )
    """
    if self._df is None:
        raise ValueError("No dataframe loaded for SQL query")

    from .sql import query_dataframes

    # Convert other ParquetFrames to their underlying DataFrames
    other_dfs = {name: pf._df for name, pf in other_frames.items()}

    # Execute SQL query
    result_df = query_dataframes(self._df, query, other_dfs)

    # Return as pandas-backed ParquetFrame (SQL results are always pandas)
    return self.__class__(result_df, islazy=False)

get_history()

Get the current session history.

Returns:

    Optional[list]: List of command strings if history tracking is enabled, None otherwise.

Source code in src/parquetframe/core.py
def get_history(self) -> Optional[list]:
    """
    Get the current session history.

    Returns:
        List of command strings if history tracking is enabled, None otherwise.
    """
    return self._history.copy() if self._track_history else None

clear_history()

Clear the session history.

Source code in src/parquetframe/core.py
def clear_history(self) -> None:
    """
    Clear the session history.
    """
    if self._track_history:
        self._history.clear()
        print("Session history cleared.")
    else:
        print("History tracking not enabled.")

Core Functions

Reading Data

Functions for loading data from various sources.

parquetframe.read(file, threshold_mb=None, islazy=None, **kwargs)

Read a parquet file into a ParquetFrame.

This is a convenience function that wraps ParquetFrame.read().

Parameters:

    file (Union[str, Path], required): Path to the parquet file (extension optional).
    threshold_mb (Optional[float], default None): Size threshold in MB for backend selection. Defaults to 10 MB.
    islazy (Optional[bool], default None): Force backend selection (True=Dask, False=pandas, None=auto).
    **kwargs: Additional keyword arguments for read_parquet methods.

Returns:

    ParquetFrame: ParquetFrame instance with loaded data.

Examples:

>>> import parquetframe as pqf
>>> df = pqf.read("data")  # Auto-detect extension and backend
>>> df = pqf.read("data.parquet", threshold_mb=50)
>>> df = pqf.read("data", islazy=True)  # Force Dask
Source code in src/parquetframe/__init__.py
def read(
    file: Union[str, Path],
    threshold_mb: Optional[float] = None,
    islazy: Optional[bool] = None,
    **kwargs,
) -> ParquetFrame:
    """
    Read a parquet file into a ParquetFrame.

    This is a convenience function that wraps ParquetFrame.read().

    Args:
        file: Path to the parquet file (extension optional).
        threshold_mb: Size threshold in MB for backend selection. Defaults to 10MB.
        islazy: Force backend selection (True=Dask, False=pandas, None=auto).
        **kwargs: Additional keyword arguments for read_parquet methods.

    Returns:
        ParquetFrame instance with loaded data.

    Examples:
        >>> import parquetframe as pqf
        >>> df = pqf.read("data")  # Auto-detect extension and backend
        >>> df = pqf.read("data.parquet", threshold_mb=50)
        >>> df = pqf.read("data", islazy=True)  # Force Dask
    """
    return ParquetFrame.read(file, threshold_mb=threshold_mb, islazy=islazy, **kwargs)

Writing Data

Functions for saving data to various formats.

parquetframe.ParquetFrame.save(file, save_script=None, **kwargs)

Save the dataframe to a parquet file.

Automatically adds .parquet extension if not present. Works with both pandas and Dask dataframes.

Parameters:

    file (Union[str, Path], required): Base name for the output file.
    save_script (Optional[str], default None): If provided, saves session history to this Python script.
    **kwargs: Additional keyword arguments for to_parquet methods.

Returns:

    ParquetFrame: Self for method chaining.

Raises:

    TypeError: If no dataframe is loaded.

Examples:

>>> pf.save("output")  # Saves as output.parquet
>>> pf.save("data.parquet", compression='snappy')
>>> pf.save("output", save_script="session.py")  # Also saves session history
Source code in src/parquetframe/core.py
def save(
    self, file: Union[str, Path], save_script: Optional[str] = None, **kwargs
) -> "ParquetFrame":
    """
    Save the dataframe to a parquet file.

    Automatically adds .parquet extension if not present.
    Works with both pandas and Dask dataframes.

    Args:
        file: Base name for the output file.
        save_script: If provided, saves session history to this Python script.
        **kwargs: Additional keyword arguments for to_parquet methods.

    Returns:
        Self for method chaining.

    Raises:
        TypeError: If no dataframe is loaded.

    Examples:
        >>> pf.save("output")  # Saves as output.parquet
        >>> pf.save("data.parquet", compression='snappy')
        >>> pf.save("output", save_script="session.py")  # Also saves session history
    """
    if self._df is None:
        raise TypeError("No dataframe loaded to save.")

    file_path = self._ensure_parquet_extension(file)

    # Track save operation in history
    if self._track_history:
        save_args = [f"'{file}'"] + [f"{k}={v!r}" for k, v in kwargs.items()]
        if save_script:
            save_args.append(f"save_script='{save_script}'")
        self._history.append(f"pf.save({', '.join(save_args)})")

    # Ensure parent directory exists
    file_path.parent.mkdir(parents=True, exist_ok=True)

    if isinstance(self._df, dd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"Dask DataFrame saved to '{file_path}'.")
    elif isinstance(self._df, pd.DataFrame):
        self._df.to_parquet(file_path, **kwargs)
        print(f"pandas DataFrame saved to '{file_path}'.")

    # Save script if requested
    if save_script and self._track_history:
        self._save_history_script(save_script)

    return self

Data Processing

Filtering and Selection

Methods for filtering and selecting data.
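
Because ParquetFrame delegates indexing and method calls to the underlying pandas or Dask DataFrame, the usual selection idioms apply unchanged. A minimal sketch, assuming hypothetical columns age and city exist in the loaded data:

>>> adults = pf[pf["age"] > 18]            # boolean-mask filtering
>>> ny = pf.query("city == 'New York'")    # delegated query()
>>> two_cols = pf[["age", "city"]]         # column selection returns a ParquetFrame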

Aggregation

Methods for data aggregation and grouping.
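
Aggregations also delegate to the active backend; on a Dask-backed frame the result stays lazy until it is computed. A minimal sketch with the same hypothetical columns:

>>> mean_age = pf.groupby("city")["age"].mean()
>>> counts = pf["city"].value_counts()
>>> counts = counts.compute() if pf.islazy else counts   # materialize lazy results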

Transformation

Methods for data transformation and feature engineering.
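
Transformations follow the same delegation pattern, and DataFrame results come back wrapped in a new ParquetFrame. A minimal sketch with hypothetical column names:

>>> pf2 = pf.assign(age_months=pf["age"] * 12)       # derived column
>>> pf3 = pf.rename(columns={"age": "age_years"})    # returns a new ParquetFrame
>>> pf4 = pf.dropna(subset=["city"])                 # delegated cleaning step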

Summary

The core API covers reading, processing, and saving parquet data, selecting the pandas or Dask backend automatically so the same code scales from small files to larger-than-memory datasets.

Examples

import parquetframe as pqf

# Create an empty ParquetFrame instance (no dataframe loaded yet)
df = pqf.ParquetFrame()

# Load data (backend chosen automatically from file size)
df = pqf.read("data.parquet")

# Process data
filtered = df.query("column > 100")
grouped = df.groupby("category").sum()

# Save results
filtered.save("output.parquet")

Further Reading