coiled · hayesgb · Sep 14, 2022 · Sep 15, 2022 · Sep 29, 2022 · Sep 29, 2022
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -31,52 +31,52 @@ jobs:
       matrix:
         os: [ubuntu-latest]
         python-version: ["3.9"]
-        pytest_args: [tests]
+        pytest_args: [tests/benchmarks/test_dataframe.py]  # NOTE(review): was [tests]; confirm this narrowing is temporary
         runtime-version: [upstream, latest, "0.0.4", "0.1.0"]
-        include:
-          # Run stability tests on Python 3.8
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: upstream
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: latest
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: "0.0.4"
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.8"
-            runtime-version: "0.1.0"
-            os: ubuntu-latest
-          # Run stability tests on Python 3.10
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: upstream
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: latest
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: "0.0.4"
-            os: ubuntu-latest
-          - pytest_args: tests/stability
-            python-version: "3.10"
-            runtime-version: "0.1.0"
-            os: ubuntu-latest
-          # Run stability tests on Python Windows and MacOS (latest py39 only)
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            runtime-version: latest
-            os: windows-latest
-          - pytest_args: tests/stability
-            python-version: "3.9"
-            runtime-version: latest
-            os: macos-latest
+        # include:
+        #   # Run stability tests on Python 3.8
+        #   - pytest_args: tests/stability
+        #     python-version: "3.8"
+        #     runtime-version: upstream
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.8"
+        #     runtime-version: latest
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.8"
+        #     runtime-version: "0.0.4"
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.8"
+        #     runtime-version: "0.1.0"
+        #     os: ubuntu-latest
+        #   # Run stability tests on Python 3.10
+        #   - pytest_args: tests/stability
+        #     python-version: "3.10"
+        #     runtime-version: upstream
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.10"
+        #     runtime-version: latest
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.10"
+        #     runtime-version: "0.0.4"
+        #     os: ubuntu-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.10"
+        #     runtime-version: "0.1.0"
+        #     os: ubuntu-latest
+        #   # Run stability tests on Python Windows and MacOS (latest py39 only)
+        #   - pytest_args: tests/stability
+        #     python-version: "3.9"
+        #     runtime-version: latest
+        #     os: windows-latest
+        #   - pytest_args: tests/stability
+        #     python-version: "3.9"
+        #     runtime-version: latest
+        #     os: macos-latest
 
     steps:
       - name: Checkout

diff --git a/tests/benchmark.db b/tests/benchmark.db
diff --git a/tests/benchmarks/benchmark.db b/tests/benchmarks/benchmark.db
diff --git a/tests/benchmarks/test_dataframe.py b/tests/benchmarks/test_dataframe.py
@@ -1,3 +1,7 @@
+from time import time
+
+import numpy as np
+from dask.datasets import timeseries
 from dask.sizeof import sizeof
 from dask.utils import format_bytes
 
@@ -58,3 +62,24 @@ def test_shuffle(small_client):
     shuf = df.shuffle(0, shuffle="tasks")
     result = shuf.size
     wait(result, small_client, 20 * 60)
+
+
+def test_ddf_isin(small_client):
+    """
+    Benchmark serializing a large list and filtering a dask dataframe with it:
+    column ``A`` is scaled to integers, a sorted list of candidate values is
+    sampled, and the rows whose ``A`` is in that list are selected and waited on.
+    """
+    start = time()  # started before setup, so the printed total includes building the data
+    n = 10_000_000
+    rs = np.random.RandomState(42)  # fixed seed: the sampled filter list is reproducible
+    ddf = timeseries(end="2000-05-01", dtypes={"A": float, "B": int}, seed=42)
+    ddf.A = ddf.A.mul(1e7)  # scale the floats up before the int cast below
+    ddf.A = ddf.A.astype(int).persist()
+    a_column_unique_values = np.arange(1, n // 10)  # integers 1 .. n // 10 - 1
+    filter_values_list = sorted(
+        rs.choice(a_column_unique_values, len(a_column_unique_values) // 2).tolist()
+    )
+    tmp_ddf = ddf.loc[ddf["A"].isin(filter_values_list)]
+    wait(tmp_ddf, small_client, 20 * 60)  # presumably a 20-minute timeout; confirm in `wait`
+    print(f"Total time to run test_isin:  {time() - start} seconds")