Commit

updated
jieguangzhou committed Aug 12, 2024
1 parent cc70a2e commit 3316c3c
Showing 51 changed files with 232 additions and 286 deletions.
42 changes: 2 additions & 40 deletions .github/workflows/ci_code.yml
@@ -62,9 +62,9 @@ jobs:
run: |
# Install core and testsuite dependencies on the cached python environment.
python -m pip install .
# TODO: We currently need a default plugin to run tests using MongoDB.
# Once the local file database is complete, we may need to update this section.
python -m pip install plugins/mongodb
python -m pip install -r deploy/installations/testenv_requirements.txt
- name: Install DevKit (docs, testing, etc)
run: |
@@ -78,44 +78,6 @@ jobs:
run: |
make unit_testing pytest_arguments="--cov=superduper --cov-report=xml"
- name: Ext Testing
run: |
make ext_testing
- name: Usecase Testing
run: |
make usecase_testing
# # ---------------------------------
# # Integration Testing
# # ---------------------------------
# integration-testing:
# needs: [ 'unit_testing' ]
# runs-on: ubuntu-latest
# steps:
# - name: Checkout repository
# uses: actions/checkout@v4
#
# - name: Build Docker Image
# run: |
# make build_sandbox
#
# - name: Start Testing Environment
# run: |
# # Update hostnames
# echo 127.0.0.1 mongodb | sudo tee -a /etc/hosts
#
# # Run the integrated testing environment
# make testenv_init
# # To make sure that pytest etc. are installed in the container
# docker exec testenv-sandbox-1 sh -c 'make install_devkit'
#
# - name: Data-backend Testing
# run: |
# # Run the test-suite within the sandbox image from the test environment
# docker exec testenv-sandbox-1 sh -c 'make databackend_testing'
#
# - name: Shutdown Testing Environment
# run: |
# # Run the integrated testing environment
# make testenv_shutdown
27 changes: 23 additions & 4 deletions .github/workflows/ci_plugins.yaml
@@ -54,6 +54,13 @@ jobs:
python-version: '3.10'
cache: 'pip' # caching pip dependencies

- name: Cache Python Installation
uses: actions/cache@v4
with:
path: ${{ env.pythonLocation }} # Cache the whole python installation dir.
key: ${{ matrix.plugin }}_${{ hashFiles('pyproject.toml', '*/pyproject.toml') }}


- name: Install SuperDuperDB Project
run: |
# Install core and testsuite dependencies on the cached python environment.
@@ -62,6 +69,11 @@
- name: Install DevKit (docs, testing, etc)
run: |
make install_devkit
- name: Lint and type-check
run: |
make lint-and-type-check DIRECTORIES="plugins/${{ matrix.plugin }}"
- name: Install Plugin
run: |
python -m pip install 'plugins/${{ matrix.plugin }}[test]'
@@ -75,11 +87,18 @@
echo "No custom CI script found, skipping..."
fi
# - name: Lint and type-check
# run: |
# make lint-and-type-check

- name: Plugin Testing
run: |
export PYTHONPATH=./
pytest --cov=superduper --cov-report=xml plugins/${{ matrix.plugin }}/plugin_test
- name: Optionally run the base testing
run: |
SUPERDUPER_CONFIG="plugins/${{ matrix.plugin }}/plugin_test/config.yaml"
if [ -f "$SUPERDUPER_CONFIG" ]; then
echo "Running the base testing..."
make unit_testing SUPERDUPER_CONFIG=$SUPERDUPER_CONFIG
make usecase_testing SUPERDUPER_CONFIG=$SUPERDUPER_CONFIG
else
echo "No config file found, skipping..."
fi
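Note: the new Cache Python Installation step keys the cache on the plugin name plus a hash of every pyproject.toml (repo root and one level down), so any dependency change produces a fresh key and skips the stale cache. A rough Python sketch of that keying scheme — the real hashing is done by GitHub's hashFiles expression, and the exact digest construction below is an assumption:

    import hashlib
    from pathlib import Path

    def cache_key(plugin: str) -> str:
        # Roughly mimics: ${{ matrix.plugin }}_${{ hashFiles('pyproject.toml', '*/pyproject.toml') }}
        digest = hashlib.sha256()
        paths = sorted(Path(".").glob("pyproject.toml")) + sorted(Path(".").glob("*/pyproject.toml"))
        for path in paths:
            digest.update(path.read_bytes())
        return f"{plugin}_{digest.hexdigest()}"

    # Any edit to any pyproject.toml changes the key, invalidating the cached environment.
    print(cache_key("mongodb"))
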
30 changes: 5 additions & 25 deletions Makefile
@@ -1,4 +1,5 @@
DIRECTORIES = superduper test
DIRECTORIES ?= superduper test
SUPERDUPER_CONFIG ?= test/configs/default.yaml
PYTEST_ARGUMENTS ?=
BACKENDS ?= mongodb_community sqlite duckdb pandas

@@ -26,8 +27,7 @@ CURRENT_RELEASE=$(shell git describe --abbrev=0 --tags)
CURRENT_COMMIT=$(shell git rev-parse --short HEAD)

new_release: ## Release a new version of superduper.io
@ if [[ -z "${RELEASE_VERSION}" ]]; then echo "VERSION is not set"; exit 1; fi
@ if [[ "$(RELEASE_VERSION)" == "v$(CURRENT_RELEASE)" ]]; then echo "No new release version. Please update VERSION file."; exit 1; fi
@ if [[ -z "${RELEASE_VERSION}" ]]; then echo "VERSION is not set"; exit 1; fi @ if [[ "$(RELEASE_VERSION)" == "v$(CURRENT_RELEASE)" ]]; then echo "No new release version. Please update VERSION file."; exit 1; fi
# Switch to release branch
@echo "** Switching to branch release-$(RELEASE_VERSION)"
@git checkout -b release-$(RELEASE_VERSION)
@@ -238,27 +238,7 @@ testdb_shutdown: check_db_variable ## Shutdown Databases Containers (DB=<mongodb
##@ CI Testing Functions

unit_testing: ## Execute unit testing
# TODO After we have completed separating the plugins, we can run the tests only on default.yaml.
# SUPERDUPER_CONFIG=test/configs/mongodb.yaml pytest $(PYTEST_ARGUMENTS) ./test/unittest
# SUPERDUPER_CONFIG=test/configs/ibis.yaml pytest $(PYTEST_ARGUMENTS) ./test/unittest
SUPERDUPER_CONFIG=test/configs/default.yaml pytest $(PYTEST_ARGUMENTS) ./test/unittest

# databackend_testing: ## Execute integration testing
# @echo "TESTING BACKENDS"
# @for backend in $(BACKENDS); do \
# echo "TESTING $$backend"; \
# SUPERDUPER_CONFIG=deploy/testenv/env/integration/backends/$$backend.yaml pytest $(PYTEST_ARGUMENTS) ./test/integration/backends; \
# done
# @echo "TODO -- implement more backends integration testing..."

# ext_testing: ## Execute integration testing
# find ./test -type d -name __pycache__ -exec rm -r {} +
# find ./test -type f -name "*.pyc" -delete
# pytest $(PYTEST_ARGUMENTS) ./test/integration/ext

SUPERDUPER_CONFIG=$(SUPERDUPER_CONFIG) pytest $(PYTEST_ARGUMENTS) ./test/unittest

usecase_testing: ## Execute usecase testing
# TODO After we have completed separating the plugins, we can run the tests only on default.yaml.
# SUPERDUPER_CONFIG=test/configs/mongodb.yaml pytest $(PYTEST_ARGUMENTS) ./test/integration/usecase
# SUPERDUPER_CONFIG=test/configs/ibis.yaml pytest $(PYTEST_ARGUMENTS) ./test/integration/usecase
SUPERDUPER_CONFIG=test/configs/default.yaml pytest $(PYTEST_ARGUMENTS) ./test/integration/usecase
SUPERDUPER_CONFIG=$(SUPERDUPER_CONFIG) pytest $(PYTEST_ARGUMENTS) ./test/integration/usecase
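Note: changing `DIRECTORIES =` to `DIRECTORIES ?=` (and introducing `SUPERDUPER_CONFIG ?=`) turns these into overridable defaults: `?=` only assigns when the variable has no value yet, which is what lets the plugin workflow above call `make unit_testing SUPERDUPER_CONFIG=$SUPERDUPER_CONFIG`. A rough Python analogue of that defaulting behaviour:

    import os

    # Make's "?=": keep an externally supplied value, otherwise fall back to the default.
    config = os.environ.get("SUPERDUPER_CONFIG", "test/configs/default.yaml")
    directories = os.environ.get("DIRECTORIES", "superduper test")
    print(config, directories)
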
1 change: 1 addition & 0 deletions plugins/anthropic/pyproject.toml
@@ -77,3 +77,4 @@ combine-as-imports = true

[tool.ruff.lint.per-file-ignores]
"test/**" = ["D"]
"plugin_test/**" = ["D"]
1 change: 0 additions & 1 deletion plugins/anthropic/superduper_anthropic/__init__.py
@@ -4,7 +4,6 @@

from .model import AnthropicCompletions


__version__ = '0.3.0'

__all__ = ('AnthropicCompletions',)
1 change: 1 addition & 0 deletions plugins/cohere/pyproject.toml
@@ -77,3 +77,4 @@ combine-as-imports = true

[tool.ruff.lint.per-file-ignores]
"test/**" = ["D"]
"plugin_test/**" = ["D"]
88 changes: 44 additions & 44 deletions plugins/ibis/plugin_test/test_end_2_end.py
@@ -1,5 +1,3 @@
import os

import PIL.Image
import pytest
from superduper import CFG, superduper
@@ -11,65 +9,67 @@
@pytest.mark.skip
def test_end_2_end():
memory_table = False
if CFG.data_backend.endswith('csv'):
if CFG.data_backend.endswith("csv"):
memory_table = True
_end_2_end(superduper(), memory_table=memory_table)

#TODO: Fix the test without torch

# TODO: Fix the test without torch
def _end_2_end(db, memory_table=False):
import torch.nn
import torchvision
from superduper.ext.torch.encoder import tensor
from superduper.ext.torch.model import TorchModel
from superduper_pillow import pil_image

schema = Schema(
identifier='my_table',
identifier="my_table",
fields={
'id': FieldType(identifier='str'),
'health': FieldType(identifier='int32'),
'age': FieldType(identifier='int32'),
'image': pil_image,
"id": FieldType(identifier="str"),
"health": FieldType(identifier="int32"),
"age": FieldType(identifier="int32"),
"image": pil_image,
},
)
im = PIL.Image.open('test/material/data/test-image.jpeg')
im = PIL.Image.open("test/material/data/test-image.jpeg")

data_to_insert = [
{'id': '1', 'health': 0, 'age': 25, 'image': im},
{'id': '2', 'health': 1, 'age': 26, 'image': im},
{'id': '3', 'health': 0, 'age': 27, 'image': im},
{'id': '4', 'health': 1, 'age': 28, 'image': im},
{"id": "1", "health": 0, "age": 25, "image": im},
{"id": "2", "health": 1, "age": 26, "image": im},
{"id": "3", "health": 0, "age": 27, "image": im},
{"id": "4", "health": 1, "age": 28, "image": im},
]

from superduper.components.table import Table

t = Table(identifier='my_table', schema=schema, db=db)
t = Table(identifier="my_table", schema=schema, db=db)

db.add(t)
t = db['my_table']
t = db["my_table"]

insert = t.insert(
[
D(
{
'id': d['id'],
'health': d['health'],
'age': d['age'],
'image': d['image'],
"id": d["id"],
"health": d["health"],
"age": d["age"],
"image": d["image"],
}
)
for d in data_to_insert
]
)
db.execute(insert)

q = t.select('image', 'age', 'health')
q = t.select("image", "age", "health")

result = db.execute(q)
for img in result:
img = img.unpack()
assert isinstance(img['image'], PIL.Image.Image)
assert isinstance(img['age'], int)
assert isinstance(img['health'], int)
assert isinstance(img["image"], PIL.Image.Image)
assert isinstance(img["age"], int)
assert isinstance(img["health"], int)

# preprocessing function
preprocess = torchvision.transforms.Compose(
@@ -88,29 +88,29 @@ def postprocess(x):

# create a torchvision model
resnet = TorchModel(
identifier='resnet18',
identifier="resnet18",
preprocess=preprocess,
postprocess=postprocess,
object=torchvision.models.resnet18(pretrained=False),
datatype=FieldType('int32'),
datatype=FieldType("int32"),
)

# Apply the torchvision model
listener1 = Listener(
model=resnet,
key='image',
select=t.select('id', 'image'),
predict_kwargs={'max_chunk_size': 3000},
identifier='listener1',
key="image",
select=t.select("id", "image"),
predict_kwargs={"max_chunk_size": 3000},
identifier="listener1",
)
db.add(listener1)

# also add a vectorizing model
vectorize = TorchModel(
preprocess=lambda x: torch.randn(32),
object=torch.nn.Linear(32, 16),
identifier='model_linear_a',
datatype=tensor(dtype='float', shape=(16,)),
identifier="model_linear_a",
datatype=tensor(dtype="float", shape=(16,)),
)

# create outputs query
@@ -121,22 +121,22 @@ def postprocess(x):
model=vectorize,
key=listener1.outputs,
select=q,
predict_kwargs={'max_chunk_size': 3000},
identifier='listener2',
predict_kwargs={"max_chunk_size": 3000},
identifier="listener2",
)
db.add(listener2)

# Build query to get the results back
q = t.outputs(listener2.outputs).select('id', 'image', 'age').filter(t.age > 25)
q = t.outputs(listener2.outputs).select("id", "image", "age").filter(t.age > 25)

# Get the results
result = list(db.execute(q))
assert result
assert 'image' in result[0].unpack()
assert "image" in result[0].unpack()

# TODO: Make this work

q = t.select('id', 'image', 'age').filter(t.age > 25).outputs(listener2.outputs)
q = t.select("id", "image", "age").filter(t.age > 25).outputs(listener2.outputs)

# Get the results
result = list(db.execute(q))
@@ -147,24 +147,24 @@ def test_nested_query():
db = superduper()

memory_table = False
if CFG.data_backend.endswith('csv'):
if CFG.data_backend.endswith("csv"):
memory_table = True
schema = Schema(
identifier='my_table',
identifier="my_table",
fields={
'id': FieldType(identifier='int64'),
'health': FieldType(identifier='int32'),
'age': FieldType(identifier='int32'),
"id": FieldType(identifier="int64"),
"health": FieldType(identifier="int32"),
"age": FieldType(identifier="int32"),
},
)

from superduper.components.table import Table

t = Table(identifier='my_table', schema=schema)
t = Table(identifier="my_table", schema=schema)

db.add(t)

t = db['my_table']
t = db["my_table"]
q = t.filter(t.age >= 10)

expr_ = q.compile(db)
(Diff truncated: the remaining 44 changed files are not shown.)
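Note: for context on what `test_nested_query` exercises, here is a minimal standalone sketch of the same pattern, assuming a local superduper install with the ibis plugin; the Schema/FieldType import paths are guesses, since the test's import block is collapsed in the diff above:

    from superduper import superduper
    from superduper.components.schema import Schema, FieldType  # assumed import path
    from superduper.components.table import Table

    db = superduper()

    schema = Schema(
        identifier="my_table",
        fields={
            "id": FieldType(identifier="int64"),
            "age": FieldType(identifier="int32"),
        },
    )
    db.add(Table(identifier="my_table", schema=schema))

    t = db["my_table"]         # look the table back up through the db
    q = t.filter(t.age >= 10)  # build the lazy, nested query
    expr = q.compile(db)       # compile it down to an ibis expression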
