Fix QTD, video-search and MNIST notebooks

superduper-io · Nov 28, 2023 · d951858 · d951858
1 parent 4584103
commit d951858
Show file tree

Hide file tree

Showing 6 changed files with 94 additions and 34 deletions.
diff --git a/examples/mnist_torch.ipynb b/examples/mnist_torch.ipynb
@@ -24,7 +24,10 @@
    "cell_type": "markdown",
    "id": "95f897a45b2a02cc",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Prerequisites\n",
@@ -75,7 +78,7 @@
     "\n",
     "# SuperDuperDB, now handles your MongoDB database\n",
     "# It just super dupers your database \n",
-    "db = superduper(mongodb_uri)\n",
+    "db = superduper(mongodb_uri, artifact_store='filesystem://./data/')\n",
     "\n",
     "# Create a collection for MNIST\n",
     "mnist_collection = Collection('mnist')"
@@ -270,15 +273,17 @@
     "            select=Collection('mnist').find({'_fold': 'valid'}),\n",
     "        )\n",
     "    ],\n",
-    "    distributed=False, # Set to True if distributed training is enabled\n",
     ")"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "fdf5cccb2fe0b97b",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Monitoring Training Efficiency\n",
@@ -375,7 +380,10 @@
    "cell_type": "markdown",
    "id": "dee36a804224cbb6",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "We can verify that the model is activated, by inserting the rest of the data:"
@@ -416,7 +424,7 @@
     "# where the 'update' field is True\n",
     "sample_document = db.execute(mnist_collection.find_one({'update': True}))['_outputs']\n",
     "\n",
-    "# A sample document \n",
+    "# A sample document\n",
     "print(sample_document)"
    ]
   }
@@ -437,7 +445,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

diff --git a/examples/question_the_docs.ipynb b/examples/question_the_docs.ipynb
@@ -46,7 +46,10 @@
    "cell_type": "markdown",
    "id": "f98f1c7ae8e02278",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Prerequisites\n",
@@ -93,7 +96,10 @@
    "cell_type": "markdown",
    "id": "85c1a0f7572c43ba",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Connect to datastore \n",
@@ -141,7 +147,10 @@
    "cell_type": "markdown",
    "id": "737497f7d5032bf",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Load Dataset\n",
@@ -175,7 +184,10 @@
    "cell_type": "markdown",
    "id": "c9803aef243ad58c",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "Otherwise, you can load the data from an external source. The text chunks include code snippets and explanations, which will be utilized to construct the document Q&A chatbot."
@@ -389,7 +401,10 @@
    "cell_type": "markdown",
    "id": "e0922a0dc623d7bf",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Create a Chat-Completion Component\n",
@@ -411,7 +426,7 @@
     "prompt = (\n",
     "    'Use the following description and code snippets about SuperDuperDB to answer this question about SuperDuperDB\\n'\n",
     "    'Do not use any other information you might have learned about other python packages\\n'\n",
-    "    'Only base your answer on the code snippets retrieved\\n'\n",
+    "    'Only base your answer on the code snippets retrieved and provide a very concise answer\\n'\n",
     "    '{context}\\n\\n'\n",
     "    'Here\\'s the question:\\n'\n",
     ")\n",
@@ -423,7 +438,7 @@
     "db.add(chat)\n",
     "\n",
     "# Print information about the models in the SuperDuperDB database\n",
-    "print(db.show('model'))\n"
+    "print(db.show('model'))"
    ]
   },
   {
@@ -488,7 +503,18 @@
     "db.remove('listener', 'text-embedding-ada-002/txt', force=True)\n",
     "\n",
     "# Remove a model with the identifier 'text-embedding-ada-002'\n",
-    "db.remove('model', 'text-embedding-ada-002', force=True)\n"
+    "db.remove('model', 'text-embedding-ada-002', force=True)\n",
+    "db.remove('model', 'gpt-3.5-turbo', force=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03e7956b-10d1-4986-885f-f9c061bdae7d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db.show('listener')"
    ]
   }
  ],
@@ -508,7 +534,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

diff --git a/examples/video_search.ipynb b/examples/video_search.ipynb
@@ -31,7 +31,10 @@
    "cell_type": "markdown",
    "id": "6eec562900dd0cff",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Prerequisites\n",
@@ -73,19 +76,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from superduperdb import superduper, Collection, CFG\n",
+    "from superduperdb import superduper, CFG\n",
+    "from superduperdb.backends.mongodb import Collection\n",
     "import os\n",
     "\n",
-    "# Set configuration options for downloads\n",
-    "CFG.downloads.hybrid = True\n",
-    "CFG.downloads.root = './'\n",
+    "# Use hybrid storage\n",
+    "CFG.force_set('downloads_folder', './data')\n",
     "\n",
     "# Define the MongoDB URI, with a default value if not provided\n",
     "mongodb_uri = os.getenv(\"MONGODB_URI\", \"mongomock://test\")\n",
     "\n",
     "# SuperDuperDB, now handles your MongoDB database\n",
     "# It just super dupers your database by initializing a SuperDuperDB datalayer instance with a MongoDB backend and filesystem-based artifact store\n",
-    "db = superduper(mongodb_uri, artifact_store='filesystem://./data/')\n",
+    "db = superduper(mongodb_uri, artifact_store='filesystem://./data/', downloads_folder='./data')\n",
     "\n",
     "# Create a collection named 'videos'\n",
     "video_collection = Collection('videos')"
@@ -95,7 +98,10 @@
    "cell_type": "markdown",
    "id": "1e53ce4113115246",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Load Dataset\n",
@@ -147,14 +153,27 @@
     ")\n",
     "\n",
     "# Display the list of videos in the 'videos' collection\n",
-    "list(db.execute(Collection('videos').find()))\n"
+    "list(db.execute(Collection('videos').find()))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f6bf3fa-2a2b-44c4-8a95-328a515c90c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db.execute(video_collection.find_one())"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "441fe6d6a9dee06b",
    "metadata": {
-    "collapsed": false
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    }
    },
    "source": [
     "## Register Encoders\n",
@@ -244,7 +263,7 @@
     "# Add a listener to process videos using the video2images model\n",
     "db.add(\n",
     "   Listener(\n",
-    "       model=video2images,  # Assuming video2images is your SuperDuperDB model\n",
+    "       model=video2images_model,  # Assuming video2images_model is your SuperDuperDB model\n",
     "       select=video_collection.find(),\n",
     "       key='video',\n",
     "   )\n",
@@ -254,7 +273,9 @@
     "outputs = db.execute(Collection('_outputs.video.video2images').find_one()).unpack()\n",
     "\n",
     "# Display the image output from the processed video\n",
-    "image_output = outputs['_outputs']['video']['video2images']['image']\n"
+    "image_output = outputs['_outputs']['video']['video2images']['0']['image']\n",
+    "\n",
+    "image_output"
    ]
   },
   {
@@ -330,7 +351,7 @@
     "        identifier='video_search_index',\n",
     "        indexing_listener=Listener(\n",
     "            model=visual_model, # Visual model for image processing\n",
-    "            key='_outputs.video.video2images.image', # Visual model for image processing\n",
+    "            key='_outputs.video.video2images.0.image', # Key of the image field in the video2images outputs\n",
     "            select=Collection('_outputs.video.video2images').find(), # Collection containing video image data\n",
     "        ),\n",
     "        compatible_listener=Listener(\n",
@@ -372,7 +393,7 @@
     "))\n",
     "\n",
     "# Extract the timestamp from the search result\n",
-    "search_timestamp = r['_outputs']['video']['video2images']['current_timestamp']\n",
+    "search_timestamp = r['_outputs']['video']['video2images']['0']['current_timestamp']\n",
     "\n",
     "# Retrieve the back reference to the original video using the '_source' field\n",
     "video = db.execute(Collection('videos').find_one({'_id': r['_source']}))"
@@ -435,7 +456,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.5"
+   "version": "3.11.6"
   }
  },
  "nbformat": 4,

diff --git a/superduperdb/base/config.py b/superduperdb/base/config.py
@@ -105,7 +105,6 @@ class Config(BaseConfigJSONable):
     :param cluster: Settings for distributed computing and change data capture
     :param retries: Settings for retrying failed operations
 
-    :param hybrid_storage: Toggle on to save large downloads on disk
     :param downloads_folder: Folder for downloads; hybrid storage is enabled if set
 
     :param fold_probability: The probability of validation fold
@@ -130,8 +129,7 @@ def self_hosted_vector_search(self) -> bool:
     cluster: Cluster = Factory(Cluster)
     retries: Retry = Factory(Retry)
 
-    hybrid_storage: bool = False
-    downloads_folder: str = '.superduperdb/downloads'
+    downloads_folder: t.Optional[str] = None
     fold_probability: float = 0.05
 
     log_level: LogLevel = LogLevel.DEBUG
@@ -140,6 +138,10 @@ def self_hosted_vector_search(self) -> bool:
     class Config(JSONable.Config):
         protected_namespaces = ()
 
+    @property
+    def hybrid_storage(self):
+        return self.downloads_folder is not None
+
     @property
     def comparables(self):
         """

diff --git a/superduperdb/base/datalayer.py b/superduperdb/base/datalayer.py
@@ -915,6 +915,7 @@ def _add(
             object.on_load(self)
             return object.schedule_jobs(self, dependencies=dependencies), object
         except Exception as e:
+
             raise exceptions.DatalayerException(
                 f'Error while adding object with id: {object.identifier}'
             ) from e

diff --git a/superduperdb/misc/files.py b/superduperdb/misc/files.py
@@ -1,6 +1,8 @@
 import hashlib
 import typing as t
 
+from superduperdb import CFG
+
 
 def get_file_from_uri(uri):
     """
@@ -18,7 +20,7 @@ def get_file_from_uri(uri):
         or uri.startswith('https://')
         or uri.startswith('s3://')
     ):
-        file = hashlib.sha1(uri.encode()).hexdigest()
+        file = f'{CFG.downloads_folder}/{hashlib.sha1(uri.encode()).hexdigest()}'
     else:
         raise NotImplementedError(f'File type of {file} not supported')
     return file