Skip to content

Commit

Permalink
Fix QTD, video-search and MNIST notebooks
Browse files Browse the repository at this point in the history
  • Loading branch information
blythed committed Nov 28, 2023
1 parent 4584103 commit d951858
Show file tree
Hide file tree
Showing 6 changed files with 94 additions and 34 deletions.
22 changes: 15 additions & 7 deletions examples/mnist_torch.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
"cell_type": "markdown",
"id": "95f897a45b2a02cc",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Prerequisites\n",
Expand Down Expand Up @@ -75,7 +78,7 @@
"\n",
"# SuperDuperDB, now handles your MongoDB database\n",
"# It just super dupers your database \n",
"db = superduper(mongodb_uri)\n",
"db = superduper(mongodb_uri, artifact_store='filesystem://./data/')\n",
"\n",
"# Create a collection for MNIST\n",
"mnist_collection = Collection('mnist')"
Expand Down Expand Up @@ -270,15 +273,17 @@
" select=Collection('mnist').find({'_fold': 'valid'}),\n",
" )\n",
" ],\n",
" distributed=False, # Set to True if distributed training is enabled\n",
")"
]
},
{
"cell_type": "markdown",
"id": "fdf5cccb2fe0b97b",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Monitoring Training Efficiency\n",
Expand Down Expand Up @@ -375,7 +380,10 @@
"cell_type": "markdown",
"id": "dee36a804224cbb6",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"We can verify that the model is activated, by inserting the rest of the data:"
Expand Down Expand Up @@ -416,7 +424,7 @@
"# where the 'update' field is True\n",
"sample_document = db.execute(mnist_collection.find_one({'update': True}))['_outputs']\n",
"\n",
"# A sample document \n",
"# A sample document\n",
"print(sample_document)"
]
}
Expand All @@ -437,7 +445,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.11.6"
}
},
"nbformat": 4,
Expand Down
44 changes: 35 additions & 9 deletions examples/question_the_docs.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,10 @@
"cell_type": "markdown",
"id": "f98f1c7ae8e02278",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Prerequisites\n",
Expand Down Expand Up @@ -93,7 +96,10 @@
"cell_type": "markdown",
"id": "85c1a0f7572c43ba",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Connect to datastore \n",
Expand Down Expand Up @@ -141,7 +147,10 @@
"cell_type": "markdown",
"id": "737497f7d5032bf",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Load Dataset\n",
Expand Down Expand Up @@ -175,7 +184,10 @@
"cell_type": "markdown",
"id": "c9803aef243ad58c",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"Otherwise, you can load the data from an external source. The text chunks include code snippets and explanations, which will be utilized to construct the document Q&A chatbot."
Expand Down Expand Up @@ -389,7 +401,10 @@
"cell_type": "markdown",
"id": "e0922a0dc623d7bf",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Create a Chat-Completion Component\n",
Expand All @@ -411,7 +426,7 @@
"prompt = (\n",
" 'Use the following description and code snippets about SuperDuperDB to answer this question about SuperDuperDB\\n'\n",
" 'Do not use any other information you might have learned about other python packages\\n'\n",
" 'Only base your answer on the code snippets retrieved\\n'\n",
" 'Only base your answer on the code snippets retrieved and provide a very concise answer\\n'\n",
" '{context}\\n\\n'\n",
" 'Here\\'s the question:\\n'\n",
")\n",
Expand All @@ -423,7 +438,7 @@
"db.add(chat)\n",
"\n",
"# Print information about the models in the SuperDuperDB database\n",
"print(db.show('model'))\n"
"print(db.show('model'))"
]
},
{
Expand Down Expand Up @@ -488,7 +503,18 @@
"db.remove('listener', 'text-embedding-ada-002/txt', force=True)\n",
"\n",
"# Remove a model with the identifier 'text-embedding-ada-002'\n",
"db.remove('model', 'text-embedding-ada-002', force=True)\n"
"db.remove('model', 'text-embedding-ada-002', force=True)\n",
"db.remove('model', 'gpt-3.5-turbo', force=True)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "03e7956b-10d1-4986-885f-f9c061bdae7d",
"metadata": {},
"outputs": [],
"source": [
"db.show('listener')"
]
}
],
Expand All @@ -508,7 +534,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.11.6"
}
},
"nbformat": 4,
Expand Down
49 changes: 35 additions & 14 deletions examples/video_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,10 @@
"cell_type": "markdown",
"id": "6eec562900dd0cff",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Prerequisites\n",
Expand Down Expand Up @@ -73,19 +76,19 @@
"metadata": {},
"outputs": [],
"source": [
"from superduperdb import superduper, Collection, CFG\n",
"from superduperdb import superduper, CFG\n",
"from superduperdb.backends.mongodb import Collection\n",
"import os\n",
"\n",
"# Set configuration options for downloads\n",
"CFG.downloads.hybrid = True\n",
"CFG.downloads.root = './'\n",
"# Use hybrid storage\n",
"CFG.force_set('downloads_folder', './data')\n",
"\n",
"# Define the MongoDB URI, with a default value if not provided\n",
"mongodb_uri = os.getenv(\"MONGODB_URI\", \"mongomock://test\")\n",
"\n",
"# SuperDuperDB, now handles your MongoDB database\n",
"# It just super dupers your database by initializing a SuperDuperDB datalayer instance with a MongoDB backend and filesystem-based artifact store\n",
"db = superduper(mongodb_uri, artifact_store='filesystem://./data/')\n",
"db = superduper(mongodb_uri, artifact_store='filesystem://./data/', downloads_folder='./data')\n",
"\n",
"# Create a collection named 'videos'\n",
"video_collection = Collection('videos')"
Expand All @@ -95,7 +98,10 @@
"cell_type": "markdown",
"id": "1e53ce4113115246",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Load Dataset\n",
Expand Down Expand Up @@ -147,14 +153,27 @@
")\n",
"\n",
"# Display the list of videos in the 'videos' collection\n",
"list(db.execute(Collection('videos').find()))\n"
"list(db.execute(Collection('videos').find()))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8f6bf3fa-2a2b-44c4-8a95-328a515c90c0",
"metadata": {},
"outputs": [],
"source": [
"db.execute(video_collection.find_one())"
]
},
{
"cell_type": "markdown",
"id": "441fe6d6a9dee06b",
"metadata": {
"collapsed": false
"collapsed": false,
"jupyter": {
"outputs_hidden": false
}
},
"source": [
"## Register Encoders\n",
Expand Down Expand Up @@ -244,7 +263,7 @@
"# Add a listener to process videos using the video2images model\n",
"db.add(\n",
" Listener(\n",
" model=video2images, # Assuming video2images is your SuperDuperDB model\n",
" model=video2images_model, # Assuming video2images_model is your SuperDuperDB model\n",
" select=video_collection.find(),\n",
" key='video',\n",
" )\n",
Expand All @@ -254,7 +273,9 @@
"outputs = db.execute(Collection('_outputs.video.video2images').find_one()).unpack()\n",
"\n",
"# Display the image output from the processed video\n",
"image_output = outputs['_outputs']['video']['video2images']['image']\n"
"image_output = outputs['_outputs']['video']['video2images']['0']['image']\n",
"\n",
"image_output"
]
},
{
Expand Down Expand Up @@ -330,7 +351,7 @@
" identifier='video_search_index',\n",
" indexing_listener=Listener(\n",
" model=visual_model, # Visual model for image processing\n",
" key='_outputs.video.video2images.image', # Visual model for image processing\n",
" key='_outputs.video.video2images.0.image', # Key of the image output produced by video2images\n",
" select=Collection('_outputs.video.video2images').find(), # Collection containing video image data\n",
" ),\n",
" compatible_listener=Listener(\n",
Expand Down Expand Up @@ -372,7 +393,7 @@
"))\n",
"\n",
"# Extract the timestamp from the search result\n",
"search_timestamp = r['_outputs']['video']['video2images']['current_timestamp']\n",
"search_timestamp = r['_outputs']['video']['video2images']['0']['current_timestamp']\n",
"\n",
"# Retrieve the back reference to the original video using the '_source' field\n",
"video = db.execute(Collection('videos').find_one({'_id': r['_source']}))"
Expand Down Expand Up @@ -435,7 +456,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
"version": "3.11.6"
}
},
"nbformat": 4,
Expand Down
8 changes: 5 additions & 3 deletions superduperdb/base/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ class Config(BaseConfigJSONable):
:param cluster: Settings for distributed computing and change data capture
:param retries: Settings for retrying failed operations
:param hybrid_storage: Toggle on to save large downloads on disk
:param downloads_folder: Folder in which downloaded files are saved; hybrid storage is enabled whenever this is set (i.e. not ``None``)
:param fold_probability: The probability of validation fold
Expand All @@ -130,8 +129,7 @@ def self_hosted_vector_search(self) -> bool:
cluster: Cluster = Factory(Cluster)
retries: Retry = Factory(Retry)

hybrid_storage: bool = False
downloads_folder: str = '.superduperdb/downloads'
downloads_folder: t.Optional[str] = None
fold_probability: float = 0.05

log_level: LogLevel = LogLevel.DEBUG
Expand All @@ -140,6 +138,10 @@ def self_hosted_vector_search(self) -> bool:
class Config(JSONable.Config):
protected_namespaces = ()

@property
def hybrid_storage(self):
    """
    Report whether hybrid storage is switched on.

    Hybrid storage is not configured separately: it is derived from
    ``downloads_folder`` and is considered enabled exactly when that
    setting is not ``None``.
    """
    folder = self.downloads_folder
    return folder is not None

@property
def comparables(self):
"""
Expand Down
1 change: 1 addition & 0 deletions superduperdb/base/datalayer.py
Original file line number Diff line number Diff line change
Expand Up @@ -915,6 +915,7 @@ def _add(
object.on_load(self)
return object.schedule_jobs(self, dependencies=dependencies), object
except Exception as e:

raise exceptions.DatalayerException(
f'Error while adding object with id: {object.identifier}'
) from e
Expand Down
4 changes: 3 additions & 1 deletion superduperdb/misc/files.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import hashlib
import typing as t

from superduperdb import CFG


def get_file_from_uri(uri):
"""
Expand All @@ -18,7 +20,7 @@ def get_file_from_uri(uri):
or uri.startswith('https://')
or uri.startswith('s3://')
):
file = hashlib.sha1(uri.encode()).hexdigest()
file = f'{CFG.downloads_folder}/{hashlib.sha1(uri.encode()).hexdigest()}'
else:
raise NotImplementedError(f'File type of {file} not supported')
return file
Expand Down

0 comments on commit d951858

Please sign in to comment.