ploomber · edublancas · Oct 11, 2023 · Sep 22, 2023 · Sep 22, 2023 · Sep 22, 2023
@@ -2,6 +2,7 @@
 
 ## 0.10.3dev
 
+* [Fix] Fix empty result in certain duckdb `SELECT` and `SUMMARIZE` queries with leading comments (#892)
 * [Fix] Remove force deleted snippets from dependent snippet's `with` (#717)
 
 ## 0.10.2 (2023-09-22)

diff --git a/doc/jupyterlab/sharing.md b/doc/jupyterlab/sharing.md
@@ -31,4 +31,4 @@ you with a unique URL that you can share.
 pip install jupysql
 ```
 
-> **For detailed instructions [click here.](https://docs.cloud.ploomber.io/en/latest/dashboards/jupyterlab-plugin.html)**
+> **For detailed instructions [click here.](https://docs.cloud.ploomber.io/en/latest/apps/jupyterlab-plugin.html)**
@@ -18,6 +18,8 @@
 )
 from IPython.core.error import UsageError
 import sqlglot
+from sqlglot import parse_one, exp
+from sqlglot.generator import Generator
 import sqlparse
 from ploomber_core.exceptions import modify_exceptions
 
@@ -717,34 +719,40 @@ def _connection_execute(self, query, parameters=None):
             Parameters to use in the query (:variable format)
         """
         parameters = parameters or {}
-
         # we do not support multiple statements
         if len(sqlparse.split(query)) > 1:
             raise NotImplementedError("Only one statement is supported.")
 
-        words = query.split()
-
-        if words:
-            first_word_statement = words[0].lower()
-        else:
-            first_word_statement = ""
-
-        # NOTE: in duckdb db "from TABLE_NAME" is valid
-        # TODO: we can parse the query to ensure that it's a SELECT statement
-        # for example, it might start with WITH but the final statement might
-        # not be a SELECT
-        # `summarize` is added to support %sql SUMMARIZE table in duckdb
-        is_select = first_word_statement in {"select", "with", "from", "summarize"}
-
         operation = partial(self._execute_with_parameters, query, parameters)
         out = self._execute_with_error_handling(operation)
 
         if self._requires_manual_commit:
-            # calling connection.commit() when using duckdb-engine will yield
-            # empty results if we commit after a SELECT statement
-            # see: https://github.com/Mause/duckdb_engine/issues/734
-            if is_select and self.dialect == "duckdb":
-                return out
+            # Calling connection.commit() when using duckdb-engine will yield
+            # empty results if we commit after a SELECT or SUMMARIZE statement,
+            # see: https://github.com/Mause/duckdb_engine/issues/734.
+            if self.dialect == "duckdb":
+                is_duckdb_sqlalchemy = not self.is_dbapi_connection
+                if is_duckdb_sqlalchemy:
+                    parse_dialect = "tsql"
+                else:
+                    parse_dialect = "duckdb"
+
+                # Attempt to use sqlglot to detect SELECT and SUMMARIZE.
+                try:
+                    expression = parse_one(query, dialect=parse_dialect)
+                    sql_stripped = Generator(comments=False).generate(expression)
+                    words = sql_stripped.split()
+                    if (
+                        words
+                        and (
+                            words[0].lower() == "select"
+                            or words[0].lower() == "summarize"
+                        )
+                        or isinstance(expression, exp.Select)
+                    ):
+                        return out
+                except sqlglot.errors.ParseError:
+                    pass
 
             # in sqlalchemy 1.x, connection has no commit attribute
             if IS_SQLALCHEMY_ONE:

@@ -1987,3 +1987,73 @@ def test_accessing_previously_nonexisting_file(ip_empty, tmp_empty, capsys):
     ip_empty.run_cell("%sql SELECT * FROM 'data.csv' LIMIT 3")
     out, _ = capsys.readouterr()
     assert expected in out
+
+
+def test_comments_in_duckdb_select_summarize(ip_empty):  # regression test for #892
+    expected_summarize = {  # expected SUMMARIZE row for the single column `memid`
+        "column_name": ("memid",),
+        "column_type": ("BIGINT",),
+        "min": ("1",),
+        "max": ("8",),
+        "approx_unique": ("5",),
+        "avg": ("3.8",),
+        "std": ("2.7748873851023217",),
+        "q25": ("2",),
+        "q50": ("3",),
+        "q75": ("6",),
+        "count": (5,),
+        "null_percentage": ("0.0%",),
+    }
+
+    df = pd.DataFrame(
+        data=dict(
+            memid=[1, 2, 3, 5, 8],
+        ),
+    )
+    _ = df  # df is otherwise only referenced by name inside the SQL below
+
+    ip_empty.run_cell("%sql duckdb://")  # open the duckdb connection
+
+    out = ip_empty.run_cell("%sql /* x */ SUMMARIZE df").result  # one leading comment
+    assert out.dict() == expected_summarize
+
+    out = ip_empty.run_cell("%sql /*x*//*x*/ SUMMARIZE /*x*/ df").result  # adjacent + inline
+    assert out.dict() == expected_summarize
+
+    out = ip_empty.run_cell(  # cell magic: block comment on its own line
+        """%%sql
+        /*x*/
+        SUMMARIZE df
+        """
+    ).result
+    assert out.dict() == expected_summarize
+
+    out = ip_empty.run_cell(  # block and line comments, before and after the query
+        """%%sql
+        /*x*/
+
+        /*x*/
+        -- comment
+        SUMMARIZE df
+        /*x*/
+        """
+    ).result
+    assert out.dict() == expected_summarize
+
+    expected_select = {"memid": (1, 2, 3, 5, 8)}  # the memid values put into df
+
+    out = ip_empty.run_cell(  # leading comment before a plain SELECT
+        """%%sql
+        /*x*/
+        SELECT * FROM df
+        """
+    ).result
+    assert out.dict() == expected_select
+
+    out = ip_empty.run_cell(  # leading comment before a FROM-first query
+        """%%sql
+        /*x*/
+        FROM df SELECT *
+        """
+    ).result
+    assert out.dict() == expected_select