
Commit d321b39

perf: Use count_rows on fragment to reduce lance scans with limit pushdowns only (#5120)
## Changes Made

When reading Lance with a limit pushdown, we can limit the number of fragments scanned by inspecting the row count of each fragment. This reduces the number of scan tasks created and executed.
1 parent 6a8767c commit d321b39
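
For a sense of the user-facing effect, here is a minimal sketch of the behavior this change targets. It assumes a local multi-fragment Lance dataset; the path, limit value, and column name are illustrative, and the `Num Scan Tasks` string is the same one the tests below look for in the explain output.

```python
import io

import daft

# Read a Lance dataset made up of multiple fragments (path is illustrative).
df = daft.read_lance("/tmp/example.lance")

# A limit with no filters: the scan should stop creating tasks once enough
# fragments are queued to cover the requested rows.
df = df.limit(100).select("vector")

# Inspect the physical plan to see how many scan tasks were created.
buf = io.StringIO()
df.explain(True, file=buf)
print(buf.getvalue())  # look for "Num Scan Tasks = ..."
```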

2 files changed: +98, -6 lines


`daft/io/lance/lance_scan.py` (39 additions & 1 deletion)
```diff
@@ -164,10 +164,48 @@ def to_scan_tasks(self, pushdowns: PyPushdowns) -> Iterator[ScanTask]:
                 pushdowns=pushdowns,
                 stats=None,
             )
+        # Check if there is a limit pushdown and no filters
+        elif pushdowns.limit is not None and self._pushed_filters is None:
+            yield from self._create_scan_tasks_with_limit_and_no_filters(pushdowns, required_columns)
         else:
-            # Regular scan without count pushdown
             yield from self._create_regular_scan_tasks(pushdowns, required_columns)
 
+    def _create_scan_tasks_with_limit_and_no_filters(
+        self, pushdowns: PyPushdowns, required_columns: Optional[list[str]]
+    ) -> Iterator[ScanTask]:
+        """Create scan tasks optimized for limit pushdown with no filters."""
+        assert self._pushed_filters is None, "Expected no filters when creating scan tasks with limit and no filters"
+        assert pushdowns.limit is not None, "Expected a limit when creating scan tasks with limit and no filters"
+
+        fragments = self._ds.get_fragments()
+        remaining_limit = pushdowns.limit
+
+        for fragment in fragments:
+            if remaining_limit <= 0:
+                # No more rows needed, stop creating scan tasks
+                break
+
+            # Calculate effective rows using fragment.count_rows()
+            # This is not expensive: with no filters, count_rows simply returns physical_rows - num_deletions
+            # https://github.com/lancedb/lance/blob/v0.34.0/rust/lance/src/dataset/fragment.rs#L1049-L1055
+            effective_rows = fragment.count_rows()
+
+            if effective_rows > 0:
+                # Determine how many rows this fragment should contribute
+                rows_to_scan = min(remaining_limit, effective_rows)
+                remaining_limit -= rows_to_scan
+
+                yield ScanTask.python_factory_func_scan_task(
+                    module=_lancedb_table_factory_function.__module__,
+                    func_name=_lancedb_table_factory_function.__name__,
+                    func_args=(self._ds, [fragment.fragment_id], required_columns, None, rows_to_scan),
+                    schema=self.schema()._schema,
+                    num_rows=rows_to_scan,
+                    size_bytes=None,
+                    pushdowns=pushdowns,
+                    stats=None,
+                )
+
     def _create_regular_scan_tasks(
         self, pushdowns: PyPushdowns, required_columns: Optional[list[str]]
     ) -> Iterator[ScanTask]:
```
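
For intuition, the added method is a greedy loop over fragments. Below is a minimal standalone sketch of the same idea against the Lance Python API; `get_fragments()`, `fragment_id`, and `count_rows()` are the same calls the patch uses, while `plan_fragment_scans` and the dataset path are hypothetical names for illustration.

```python
import lance

def plan_fragment_scans(dataset_path: str, limit: int) -> list[tuple[int, int]]:
    """Return (fragment_id, rows_to_scan) pairs that together cover `limit` rows."""
    ds = lance.dataset(dataset_path)
    plan = []
    remaining = limit
    for fragment in ds.get_fragments():
        if remaining <= 0:
            break
        # With no filter, count_rows() is a metadata lookup
        # (physical rows minus deletions), not a data scan.
        effective_rows = fragment.count_rows()
        if effective_rows > 0:
            rows_to_scan = min(remaining, effective_rows)
            remaining -= rows_to_scan
            plan.append((fragment.fragment_id, rows_to_scan))
    return plan

# On a dataset with 1000-row fragments, plan_fragment_scans(path, 1001)
# would return two entries: 1000 rows from the first fragment, 1 from the second.
```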

`tests/io/lancedb/test_lancedb_reads.py` (59 additions & 5 deletions)
```diff
@@ -39,11 +39,65 @@ def test_lancedb_read_filter(lance_dataset_path):
     assert df.to_pydict() == {"vector": data["vector"][:1]}
 
 
-def test_lancedb_read_limit(lance_dataset_path):
-    df = daft.read_lance(lance_dataset_path)
-    df = df.limit(1)
-    df = df.select("vector")
-    assert df.to_pydict() == {"vector": data["vector"][:1]}
+@pytest.fixture(scope="function")
+def large_lance_dataset_path(tmp_path_factory):
+    """Create a large Lance dataset with multiple fragments for testing limit operations."""
+    tmp_dir = tmp_path_factory.mktemp("large_lance")
+
+    # Create 10 fragments of 1000 rows each (10,000 total rows)
+    for frag_idx in range(10):
+        # Generate data for this fragment
+        vectors = [[float(i * 0.1 + frag_idx * 1000), float(i * 0.2 + frag_idx * 1000)] for i in range(1000)]
+        big_ints = [i + frag_idx * 1000 for i in range(1000)]
+
+        fragment_data = {"vector": vectors, "big_int": big_ints}
+
+        # Write fragment (first write creates the dataset, subsequent writes append)
+        mode = "append" if frag_idx > 0 else None
+        lance.write_dataset(pa.Table.from_pydict(fragment_data), tmp_dir, mode=mode)
+
+    yield str(tmp_dir)
+
+
+@pytest.mark.parametrize(
+    "limit_size,expected_scan_tasks",
+    [
+        # Small limits
+        (1000, 1),
+        (1001, 2),
+        # Big limits
+        (9000, 9),
+        (9001, 10),
+        (10000, 10),
+    ],
+)
+def test_lancedb_read_limit_large_dataset(large_lance_dataset_path, limit_size, expected_scan_tasks):
+    """Test the limit operation on a large Lance dataset with multiple fragments."""
+    import io
+
+    df = daft.read_lance(large_lance_dataset_path)
+
+    # Apply the parametrized limit
+    df = df.limit(limit_size)
+    df = df.select("vector", "big_int")
+
+    # Capture the explain output
+    string_io = io.StringIO()
+    df.explain(True, file=string_io)
+    explain_output = string_io.getvalue()
+
+    # Assert that we have the expected number of scan tasks
+    assert f"Num Scan Tasks = {expected_scan_tasks}" in explain_output
+
+    result = df.to_pydict()
+
+    # Verify we got the expected number of rows
+    assert len(result["vector"]) == limit_size
+    assert len(result["big_int"]) == limit_size
+
+    # Verify the data is ordered correctly (should get the first N rows)
+    expected_big_ints = list(range(limit_size))
+    assert result["big_int"] == expected_big_ints
 
 
 def test_lancedb_with_version(lance_dataset_path):
```
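
The parametrized cases map directly onto the fixture's layout: 10 fragments of 1000 rows each, so a limit of 1001 needs 2 scan tasks and 9001 needs all 10. As a sanity check that the append-per-iteration writes really do produce one fragment each, a sketch (assuming `path` is what the fixture yielded):

```python
import lance

ds = lance.dataset(path)  # path yielded by large_lance_dataset_path
fragments = ds.get_fragments()
assert len(fragments) == 10
# Each fragment holds 1000 live rows, so a limit of 9001 must touch all 10.
assert all(f.count_rows() == 1000 for f in fragments)
```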
