Commit

Merge pull request #151 from dusty-nv/20240512-research-group
updated VLM
dusty-nv authored May 15, 2024
2 parents 301a879 + fd005fe commit 8f9af7d
Showing 2 changed files with 21 additions and 1 deletion.
4 changes: 3 additions & 1 deletion docs/overrides/main.html
@@ -2,17 +2,19 @@
{% extends "base.html" %}

<!-- Announcement bar -->
{#
{% block announce %}
<style>
.md-announce a { color: #76b900; text-decoration: underline;}
.md-announce a:focus { color: hsl(82, 100%, 72%); text-decoration: underline; }
.md-announce a:hover { color: hsl(82, 100%, 72%); text-decoration: underline;}
</style>
<div class="md-announce">The next research group meeting is on <a href="research.html#meeting-schedule">May 15th</a> at 9am PT! Catch up on the <a href="research.html#past-meetings">recordings</a> of the recent meetings.</div>
<!--<div class="md-announce">The next research group meeting is on <a href="research.html#meeting-schedule">May 15th</a> at 9am PT! Catch up on the <a href="research.html#past-meetings">recordings</a> of the recent meetings.</div>-->
<!--<div class="md-announce">Congratulations to all the winners and participants of the <a href="https://blogs.nvidia.com/blog/glados-robot-hackster/" target="_blank">Hackster.io AI Innovation Challenge!</a></div>-->
<!--<div class="md-announce">Microsoft's open <a href="https://blogs.nvidia.com/blog/microsoft-open-phi-3-mini-language-models/" target="_blank">Phi-3 Mini</a> language models are out! Try them today on Jetson with <a href="tutorial_ollama.html">ollama</a>.</div>-->

{% endblock %}
#}

{% block scripts %}
<script src="//assets.adobedtm.com/5d4962a43b79/814eb6e9b4e1/launch-4bc07f1e0b0b.min.js"></script>
18 changes: 18 additions & 0 deletions docs/tutorial_nano-vlm.md
@@ -127,6 +127,24 @@ The [Live Llava](tutorial_live-llava.md) tutorial shows how to enable additional
<div><iframe width="500" height="280" src="https://www.youtube.com/embed/8Eu6zG0eEGY" style="display: inline-block;" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>

<iframe width="500" height="280" src="https://www.youtube.com/embed/wZq7ynbgRoE" style="display: inline-block;" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe></div>

## Video Sequences

The VILA-1.5 family of models can handle multiple images per query, enabling video summarization and temporal change detection. By manipulating the KV cache and dropping the last frame from the chat history, the stream can keep rolling continuously beyond the model's maximum context length. The [`vision/video.py`](https://github.com/dusty-nv/NanoLLM/blob/main/nano_llm/vision/video.py){:target="_blank"} example shows how to use this:

``` bash
jetson-containers run $(autotag nano_llm) \
python3 -m nano_llm.vision.video \
--model Efficient-Large-Model/VILA1.5-3b \
--max-images 8 \
--max-new-tokens 48 \
--video-input /data/my_video.mp4 \
--video-output /data/my_output.mp4 \
--prompt 'What changes occurred in the video?'
```

<iframe width="720" height="405" src="https://www.youtube.com/embed/_7gughth8C0" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>


## Python Code

